|
@ -0,0 +1,105 @@ |
|
|
|
|
|
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

#include "Ttiming.h"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Fills the first N elements of both a and b with deterministic
 * pseudo-random integers.
 *
 * Each value is 1 + (int)(trig(i) * i * threadIdx.x) % 20, using sinf for a
 * and cosf for b, so every element lies in [-18, 20]. Because threadIdx.x is
 * part of the formula, the generated contents depend on the launch
 * configuration.
 *
 * Launch layout: 1-D grid of 1-D blocks. Every thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size covers all N elements (same indexing scheme as the multiply kernel
 * in this file).
 *
 * a, b  device arrays with room for at least N ints each
 * N     number of elements to fill
 */
__global__ void set(int *a, int *b, int N) {
    // Total number of launched threads; kept unsigned exactly as
    // blockDim.x * gridDim.x evaluates.
    const unsigned int stride = blockDim.x * gridDim.x;
    const float lane = threadIdx.x;

    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride) {
        const float x = idx;

        // The products are truncated to int before the modulo is taken.
        const int fromSin = (int)(sinf(x) * x * lane);
        a[idx] = 1 + fromSin % 20;

        const int fromCos = (int)(cosf(x) * x * lane);
        b[idx] = 1 + fromCos % 20;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Multiplies two square N x N integer matrices stored row-major: c = a * b.
 *
 * Each output element c[row * N + col] is the dot product of row `row` of a
 * with column `col` of b.
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread computes the output
 * element at its global index and then advances by the total number of
 * launched threads, so any launch configuration covers all N * N outputs.
 * (The indexing scheme is explained in "CUDA by Example", p. 47.)
 *
 * a, b  device arrays of N * N ints (inputs)
 * c     device array of N * N longs (output)
 * N     matrix dimension; nothing is written when N <= 0
 */
__global__ void multiply(int *a, int *b, long *c, int N) {
    if (N <= 0) {
        return;
    }

    // All index arithmetic is done in 64-bit integers: N * N overflows int
    // for N > 46340, and a round trip through float (24-bit mantissa) would
    // corrupt row offsets above 2^24.
    const long long dim = N;
    const long long total = dim * dim;
    const long long stride = (long long)blockDim.x * gridDim.x;  // total number of threads

    for (long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         idx < total;
         idx += stride) {
        const long long rowStart = (idx / dim) * dim;  // offset of this element's row in a
        const long long col = idx % dim;               // column of this element in b

        long sum = 0;
        for (long long k = 0; k < dim; ++k) {
            // Widen before multiplying so the product is not computed in int.
            sum += (long)a[rowStart + k] * b[k * dim + col];
        }
        c[idx] = sum;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Prints a diagnostic and terminates the program when a CUDA runtime call fails. */
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/**
 * Benchmarks the multiply kernel on two size x size matrices.
 *
 * Usage: <program> <size>
 *   size  positive matrix dimension; size * size must fit in an int because
 *         the kernels receive element counts as int.
 *
 * The matrices are filled on the device by the set kernel, the multiply
 * kernel is timed with TTiming, and the elapsed time is printed to stdout.
 * Returns 0 on success; exits with status 1 on invalid arguments or on any
 * allocation / CUDA failure.
 */
int main(int argc, char *argv[]) {
    // Flip to true to copy the product back to the host and print it.
    const bool printResult = false;

    if (argc < 2) {
        fprintf(stderr, "Usage: %s <matrix dimension>\n", argc > 0 ? argv[0] : "program");
        exit(1);
    }

    char *endptr;
    const long parsed = strtol(argv[1], &endptr, 10);

    // Reject empty input and trailing garbage. These messages are not errno
    // related, so they go through fprintf rather than perror.
    if (endptr == argv[1] || *endptr) {
        fprintf(stderr, " Invalid array size format\n");
        exit(1);
    }

    if (parsed <= 0) {
        fprintf(stderr, "The number of matrix dimension must be positive\n");
        exit(1);
    }

    // size * size is handed to the kernels as int, so it must not overflow.
    // This also rejects the LONG_MAX that strtol returns on overflow.
    if (parsed > INT_MAX / parsed) {
        fprintf(stderr, "Matrix dimension too large\n");
        exit(1);
    }

    const int size = (int)parsed;
    const int N = size * size;  // number of elements per matrix

    int *dev_a = NULL;
    int *dev_b = NULL;
    long *dev_c = NULL;

    checkCuda(cudaMalloc((void**)&dev_a, (size_t)N * sizeof(int)), "cudaMalloc(dev_a)");
    checkCuda(cudaMalloc((void**)&dev_b, (size_t)N * sizeof(int)), "cudaMalloc(dev_b)");
    checkCuda(cudaMalloc((void**)&dev_c, (size_t)N * sizeof(long)), "cudaMalloc(dev_c)");

    // Fill a and b with pseudo-random values.
    set<<<20,192>>>(dev_a, dev_b, N);
    checkCuda(cudaGetLastError(), "set kernel launch");
    // Wait for the fill to finish so it is not included in the measurement.
    checkCuda(cudaDeviceSynchronize(), "set kernel execution");

    // Time measurement.
    TTiming tt;
    tt.Begin();

    // c = a * b. The <<<10,384>>> configuration was the fastest one found by
    // the original author so far; it has not been tuned systematically.
    multiply<<<10,384>>>(dev_a, dev_b, dev_c, size);
    const cudaError_t launchStatus = cudaGetLastError();
    // Kernel launches are asynchronous; wait for completion before stopping the clock.
    const cudaError_t runStatus = cudaDeviceSynchronize();
    long time = tt.End();

    checkCuda(launchStatus, "multiply kernel launch");
    checkCuda(runStatus, "multiply kernel execution");

    long *c = NULL;
    if (printResult) {
        // The host buffer is only needed when the result is inspected.
        c = (long*)malloc((size_t)N * sizeof(long));
        if (c == NULL) {
            fprintf(stderr, "Out of host memory\n");
            exit(1);
        }
        // Copy the product from device to host so it can be printed.
        checkCuda(cudaMemcpy(c, dev_c, (size_t)N * sizeof(long), cudaMemcpyDeviceToHost),
                  "cudaMemcpy(c)");
        for (int i = 0; i < N; i++) {
            printf("%ld; ", c[i]);
        }
    }

    printf("czas : %ld ms\n", time);

    checkCuda(cudaFree(dev_c), "cudaFree(dev_c)");
    checkCuda(cudaFree(dev_b), "cudaFree(dev_b)");
    checkCuda(cudaFree(dev_a), "cudaFree(dev_a)");
    free(c);

    return 0;
}