當(dāng)前位置：首頁 > news >正文

web個人網(wǎng)站設(shè)計方案互聯(lián)網(wǎng)

news 2025/7/9 5:03:14

web個人網(wǎng)站設(shè)計方案,互聯(lián)網(wǎng),常州做網(wǎng)站基本流程,海淀網(wǎng)站制作1. 背景： 項目中使用到了紋理進(jìn)行插值的加速，因此記錄一些自己在學(xué)習(xí)tex2D的一些過程 2. 代碼： #include "cuda_runtime.h" #include "device_launch_parameters.h" #include <assert.h> #include <stdio.h>…

1. 背景：

????????項目中使用到了紋理進(jìn)行插值的加速，因此記錄一些自己在學(xué)習(xí)tex2D的一些過程

2. 代碼：

????????

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <cuda_fp16.h>
#include <vector>void Data2Half(half* pDst, const int16_t* pSrc, const int Ndots);
static __global__ void Tex2DTest(cudaTextureObject_t p_rf_data, float* pfRes1, float* pfRes2);static __global__ void data2half(half* pDst, const int16_t* pSrc, const int Ndots)
{const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid >= Ndots)return;pDst[tid] = __short2half_rn(pSrc[tid]);
}cudaTextureObject_t m_tex   = 0;
cudaArray* m_pRFData        = nullptr;
int16_t* m_i16RFDataBuffer  = nullptr; // 設(shè)備端的RF數(shù)據(jù)
half* m_pHalfRFDataCache    = nullptr; // 轉(zhuǎn)換為半浮點型的RF數(shù)據(jù)緩存，用于將SHORT類型轉(zhuǎn)換為FLOAT類型int main()
{const int nRx     = 2;const int Nsample = 2;const int IQ      = 1;cudaError_t error;cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf();error                             = cudaMallocArray(&m_pRFData, &channelDesc, nRx * IQ, Nsample, cudaArrayTextureGather);assert(m_pRFData);cudaResourceDesc texRes;memset(&texRes, 0, sizeof(cudaResourceDesc));texRes.resType         = cudaResourceTypeArray;texRes.res.array.array = m_pRFData;cudaTextureDesc texDescr;memset(&texDescr, 0, sizeof(cudaTextureDesc));texDescr.normalizedCoords = false;texDescr.filterMode       = cudaFilterModeLinear;  // 這里很重要texDescr.addressMode[0]   = cudaAddressModeBorder;texDescr.addressMode[1]   = cudaAddressModeBorder;error = cudaCreateTextureObject(&m_tex, &texRes, &texDescr, NULL);//int16_t pi16Src[nRx * Nsample * IQ] = {1, 11, 2, 22,//                                    3, 33, 4, 44, //                                    5, 55, 6, 66, //                                    7, 77, 8, 88};//int16_t pi16Src[nRx * Nsample * IQ] = { 1, 11, 2, 22,//                                        3, 33, 4, 44};int16_t pi16Src[nRx * Nsample * IQ] = { 1,2,3,4 };error = cudaMalloc(&m_i16RFDataBuffer, sizeof(int16_t) * nRx * IQ * Nsample);error = cudaMemcpy(m_i16RFDataBuffer, pi16Src, sizeof(int16_t) * nRx * IQ * Nsample, cudaMemcpyHostToDevice);error = cudaMalloc(&m_pHalfRFDataCache, sizeof(half) * nRx * IQ * Nsample);Data2Half(m_pHalfRFDataCache, m_i16RFDataBuffer, nRx * IQ * Nsample);error = cudaMemcpy2DToArray(m_pRFData, 0, 0, m_pHalfRFDataCache, sizeof(half) * nRx * IQ, sizeof(half) * nRx * IQ, Nsample, cudaMemcpyDeviceToDevice);float* pf_res1 = nullptr;float* pf_res2 = nullptr;error = cudaMalloc(&pf_res1, nRx * Nsample * sizeof(float)); cudaMemset(pf_res1, 0, nRx * Nsample * sizeof(float));error = cudaMalloc(&pf_res2, nRx * Nsample * sizeof(float)); cudaMemset(pf_res2, 0, nRx * Nsample * sizeof(float));error = cudaGetLastError();dim3 block_dim = dim3(1, 1);dim3 grid_dim  = dim3(1, 1);Tex2DTest << <grid_dim, block_dim >> > (m_tex, pf_res1, pf_res2);cudaDeviceSynchronize();std::vector<float> vf_res_1(nRx * Nsample, 0);std::vector<float> vf_res_2(nRx * Nsample, 0);cudaMemcpy(vf_res_1.data(), pf_res1, sizeof(float) * vf_res_1.size(), cudaMemcpyDeviceToHost);cudaMemcpy(vf_res_2.data(), pf_res2, sizeof(float) * vf_res_2.size(), cudaMemcpyDeviceToHost);return 0;
}void Data2Half(half* pDst, const int16_t* pSrc, const int Ndots)
{dim3 block = dim3(512, 1);dim3 grid = dim3((Ndots - 1) / block.x + 1, 1);data2half << < grid, block >> > (pDst, pSrc, Ndots);
}static __global__ void Tex2DTest(cudaTextureObject_t p_rf_data, float *pfRes1, float *pfRes2)
{for (size_t y = 0; y < 2; ++y){for (size_t x = 0; x < 2; ++x) {float value = tex2D<float>(p_rf_data, x,     y);//pfRes1[y * 4 + y] = printf("x: %f\n", value);}}
}

3. 輸出分析：

可以看到執(zhí)行結(jié)果是

為什么呢？

原因是因為tex2D插值導(dǎo)致的，上面測試數(shù)據(jù)是

1? 2

3? ?4

那在進(jìn)行插值的時候會變成

0? 0? ?0? ?0

0? ?1? ?2? 0

0? ?3? ?4? 0

每個點的輸出都是當(dāng)前前和左上角3個點進(jìn)行平均計算出來的

比如第一個輸出計算為：（1 + 0 + 0 + 0）/4 = 0.25

最后一個輸出的計算為：（1 + 2 + 3 + 4） / 4 = 2.5

4. 問題

? ? ? ? 上面只是單獨數(shù)據(jù)實數(shù)點的計算，如果我的數(shù)據(jù)集合是復(fù)數(shù)怎么辦？

? ? ? ? 比如一組2 * 2大小的數(shù)據(jù)對

? ? ? ? （1， 2， 3， 4;

? ? ? ? ? ?5,? ?6， 7， 8）

? ? ? ? 數(shù)據(jù)實際表示含義是

? ? ? ? ?（1 + j * 2,? ?3 + j * 4;

? ? ? ? ? ? 5 + j * 6,? ?7 + j * 8）

? ? ? ? 這種情況下怎么做到正確插值呢，比如第一個實數(shù)點的輸出結(jié)果應(yīng)該是

? ? ? ? ?（1 + 0 + 0 + 0）/ 4

? ? ? ? ? ?最后一個實數(shù)點的輸出應(yīng)該是：

? ? ? ? ? ? （1 + 3 + 5 + 7） / 4

? ? ? ? ? ?同理，最后一個虛數(shù)點的輸出應(yīng)該是：
? ? ? ? ? ?（2 + 4 + 6 + 8）/ 4

5. 解決

? ? ? ? ?

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <assert.h>
#include <stdio.h>
#include <iostream>
#include <cuda_fp16.h>
#include <vector>void Data2Half(half* pDst, const int16_t* pSrc, const int Ndots);
static __global__ void Tex2DTest(cudaTextureObject_t p_rf_data, float* pfRes1, float* pfRes2);static __global__ void data2half(half* pDst, const int16_t* pSrc, const int Ndots)
{const int tid = blockIdx.x * blockDim.x + threadIdx.x;if (tid >= Ndots)return;pDst[tid] = __short2half_rn(pSrc[tid]);
}cudaTextureObject_t m_tex = 0;
cudaArray* m_pRFData = nullptr;
int16_t* m_i16RFDataBuffer = nullptr; // 設(shè)備端的RF數(shù)據(jù)
half* m_pHalfRFDataCache = nullptr; // 轉(zhuǎn)換為半浮點型的RF數(shù)據(jù)緩存，用于將SHORT類型轉(zhuǎn)換為FLOAT類型using namespace std;int main()
{const int nRx = 2;const int Nsample = 2;const int IQ = 2;cudaError_t error;cudaChannelFormatDesc channelDesc = cudaCreateChannelDescHalf2();error = cudaMallocArray(&m_pRFData, &channelDesc, nRx, Nsample, cudaArrayTextureGather);assert(m_pRFData);cudaResourceDesc texRes;memset(&texRes, 0, sizeof(cudaResourceDesc));texRes.resType = cudaResourceTypeArray;texRes.res.array.array = m_pRFData;cudaTextureDesc texDescr;memset(&texDescr, 0, sizeof(cudaTextureDesc));texDescr.normalizedCoords = false;texDescr.filterMode = cudaFilterModeLinear;  // 這里很重要texDescr.addressMode[0] = cudaAddressModeBorder;texDescr.addressMode[1] = cudaAddressModeBorder;error = cudaCreateTextureObject(&m_tex, &texRes, &texDescr, NULL);//int16_t pi16Src[nRx * Nsample * IQ] = {1, 11, 2, 22,//                                    3, 33, 4, 44, //                                    5, 55, 6, 66, //                                    7, 77, 8, 88};//int16_t pi16Src[nRx * Nsample * IQ] = { 1, 11, 2, 22,//                                        3, 33, 4, 44};int16_t pi16Src[nRx * Nsample * IQ] = { 1, 2, 3, 4,5, 6, 7, 8 };error = cudaMalloc(&m_i16RFDataBuffer, sizeof(int16_t) * nRx * IQ * Nsample);error = cudaMemcpy(m_i16RFDataBuffer, pi16Src, sizeof(int16_t) * nRx * IQ * Nsample, cudaMemcpyHostToDevice);error = cudaMalloc(&m_pHalfRFDataCache, sizeof(half) * nRx * IQ * Nsample);Data2Half(m_pHalfRFDataCache, m_i16RFDataBuffer, nRx * IQ * Nsample);error = cudaMemcpy2DToArray(m_pRFData, 0, 0, m_pHalfRFDataCache, sizeof(half2) * nRx, sizeof(half2) * nRx, Nsample, cudaMemcpyDeviceToDevice);float* pf_res1 = nullptr;float* pf_res2 = nullptr;error = cudaMalloc(&pf_res1, nRx * Nsample * sizeof(float)); cudaMemset(pf_res1, 0, nRx * Nsample * sizeof(float));error = cudaMalloc(&pf_res2, nRx * Nsample * sizeof(float)); cudaMemset(pf_res2, 0, nRx * Nsample * sizeof(float));error = cudaGetLastError();dim3 block_dim = dim3(1, 1);dim3 grid_dim  = dim3(1, 1);Tex2DTest << <grid_dim, block_dim >> > (m_tex, pf_res1, pf_res2);cudaDeviceSynchronize();std::vector<float> vf_res_1(nRx * Nsample, 0);std::vector<float> vf_res_2(nRx * Nsample, 0);cudaMemcpy(vf_res_1.data(), pf_res1, sizeof(float) * vf_res_1.size(), cudaMemcpyDeviceToHost);cudaMemcpy(vf_res_2.data(), pf_res2, sizeof(float) * vf_res_2.size(), cudaMemcpyDeviceToHost);return 0;
}void Data2Half(half* pDst, const int16_t* pSrc, const int Ndots)
{dim3 block = dim3(512, 1);dim3 grid = dim3((Ndots - 1) / block.x + 1, 1);data2half << < grid, block >> > (pDst, pSrc, Ndots);
}static __global__ void Tex2DTest(cudaTextureObject_t p_rf_data, float* pfRes1, float* pfRes2)
{for (size_t y = 0; y < 2; ++y){for (size_t x = 0; x < 2; ++x){float2 value = tex2D<float2>(p_rf_data, x, y);//pfRes1[y * 4 + y] = printf("x: %f, y: %f", value.x, value.y);// printf("x: %f, y: %f\n", value.x, value.y);}printf("\n");}
}

其實關(guān)鍵是在tex2D的構(gòu)造

然后按照half2的方式進(jìn)行排布就好了

查看全文

http://www.risenshineclean.com/news/2765.html

中文亚洲精品无码_熟女乱子伦免费_人人超碰人人爱国产_亚洲熟妇女综合网