使用cudaHostRegister注册已映射的Linux字符设备内存时返回“无效参数”错误
我正在尝试通过以下方法来增强DMA-CPU-GPU的数据传输:
1.将我的(专有)设备Linux内核分配的内存映射到用户空间
2.使用cudaHostRegister API函数将后者(即映射后的内存)注册到CUDA.
把用户空间分配的内存映射到我的设备DMA,再用cudaHostRegister注册到CUDA时,一切正常;但尝试注册由kmalloc分配的内存时,cudaHostRegister会返回“无效参数”(invalid argument)错误.
起初,我以为问题出在内存对齐上,或者是我的设备驱动程序中复杂的内存池管理所致,因此我编写了一个最简单的字符设备,它实现了.mmap(),其中用kzalloc分配的10KB缓冲区通过remap_pfn_range重新映射,但问题仍然存在.
不幸的是,我没有在网上找到任何类似的问题,因此我衷心希望能在这里找到答案.
一些系统信息和内核驱动程序<->用户空间应用程序代码运行时日志信息:
CUDA : 8.0
OS Dist : Ubuntu 14.04
Kernel : 3.16.0-31-generic
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 375.26 Driver Version: 375.26 |
|-------------------------------+----------------------+----------------------+
| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |
|===============================+======================+======================|
| 0 GeForce GTX 770 Off | 0000:83:00.0 N/A | N/A |
| 26% 32C P8 N/A / N/A | 79MiB / 1997MiB | N/A Default |
+-------------------------------+----------------------+----------------------+
字符设备mmap()代码:
/*
 * Sizes of one DMA chunk and of the whole kzalloc'ed pool; MEM_POOL_SIZE is
 * passed to kzalloc() as the allocation size.  _K is defined elsewhere.
 * The replacement lists are parenthesized so that each macro expands to a
 * single operand inside larger expressions (e.g. `x % MEM_CHUNK_SIZE`).
 */
#define MEM_CHUNK_SIZE (4 * _K)
#define MEM_POOL_SIZE (10 * _K)
/**/
/**
 * chdv_mmap() - mmap() handler of the test character device.
 * @filp: file the mapping is requested on (not used by this handler).
 * @vma:  VMA describing the user-space range to populate.
 *
 * Allocates the global g_membuff pool with kzalloc() on first use, maps it
 * into the caller's address space with remap_pfn_range(), and then writes
 * marker words at the start and at the end of the mapped window so that user
 * space can check that it sees the same memory.
 *
 * Return: 0 on success; -ENOMEM if the pool cannot be allocated; -EINVAL if
 * the requested size exceeds MEM_POOL_SIZE, if vm_pgoff is not a multiple of
 * pages_per_buf, or if remap_pfn_range() fails.
 */
static int chdv_mmap(struct file *filp, struct vm_area_struct *vma)
{
    unsigned int pages_per_buf = ( MEM_CHUNK_SIZE >> PAGE_SHIFT ) ;
    unsigned long pfn, vsize;
    unsigned long *head, *tail;

    /* Make sure the buffer is allocated (lazy, one-time allocation). */
    if (NULL == g_membuff)
    {
        g_membuff = kzalloc(MEM_POOL_SIZE , GFP_KERNEL);
        if (NULL == g_membuff)
        {
            kdbgprintln("Error: Not enough memory");
            return -ENOMEM;
        }
    }

    vsize = vma->vm_end - vma->vm_start ;
    kdbgprintln("MEM_CHUNK_SIZE %u, pages_per_buf %u, vsize %lu vma->vm_pgoff %lu",
                MEM_CHUNK_SIZE,
                pages_per_buf,
                vsize,
                vma->vm_pgoff);

    /* Never map more than the pool holds. */
    if (vsize > MEM_POOL_SIZE)
    {
        kdbgprintln("Error: vsize %lu > MEM_POOL_SIZE %u", vsize, MEM_POOL_SIZE);
        return -EINVAL;
    }

    /*
     * The page offset must be a multiple of pages_per_buf.  Only the offset
     * is validated here; the mapping size is checked solely against
     * MEM_POOL_SIZE above.
     * NOTE(review): vm_pgoff is not added to the PFN below, so the mapping
     * always starts at the beginning of g_membuff whatever offset is passed.
     */
    if (vma->vm_pgoff % pages_per_buf)
    {
        kdbgprintln("Error:Mapping DMA buffers is allowed only from beginning");
        return -EINVAL ;
    }

    /*
     * NOTE(review): VM_IO / PFN-mapped VMAs generally cannot be pinned with
     * get_user_pages(); this may be why cudaHostRegister() rejects the
     * mapping - confirm.
     */
    vma->vm_flags = vma->vm_flags | (VM_DONTEXPAND | VM_LOCKED | VM_IO);

    /* Get the PFN of the first page of the pool for the remap. */
    pfn = page_to_pfn(virt_to_page((unsigned char *)g_membuff));
    kdbgprintln("PFN : %lu", pfn);

    if (remap_pfn_range(vma, vma->vm_start, pfn, vsize, vma->vm_page_prot))
    {
        kdbgprintln("Error:Failed to remap memory");
        return -EINVAL;
    }

    /*
     * Seal the data with header and footer markers: two words at the start
     * and one word in the last sizeof(unsigned long) bytes of the mapped
     * window.  The constants assume a 64-bit unsigned long.
     */
    head = (unsigned long *)g_membuff;
    tail = (unsigned long *)((unsigned char *)g_membuff + vsize - sizeof(unsigned long));
    head[0] = 0xCDFFFFFFFFFFFFAB;
    head[1] = 0xAB000000000000EF;
    *tail = 0xEF0000000C0000AA;

    kdbgprintln("Mapped 'kalloc' buffer"
                "\n\t\tFirst 8 bytes: %lX"
                "\n\t\tSecond 8 bytes: %lX"
                "\n\t\tLast 8 bytes: %lX",
                head[0],
                head[1],
                *tail);
    return 0;
}
测试应用程序代码:
static unsigned long map_mem_size;
int main(int argc, char** argv)
{
int fd;
const char dev_name[] = "/dev/chardev";
void * address = NULL;
long page_off = 0;
cudaError_t cudarc;
switch(argc)
{
case 2:
page_off = atoi(argv[1]) * getpagesize();
break;
default:
page_off = 0;
break;
}
map_mem_size = 2 * getpagesize();
printf("Opening %s file\n", dev_name);
errno = 0;
if(0 > (fd = open(dev_name, O_RDWR) ))
{
printf("Error %d - %s\n", errno, strerror(errno));
}
else
{
printf("About to map %lu bytes of %s device memory\n", map_mem_size, dev_name);
errno = 0;
if(MAP_FAILED == (address = mmap(NULL, map_mem_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, page_off)))
{
printf("Error %d - %s\n", errno, strerror(errno));
}
else
{
printf("mapped %s driver 'kmalloc' memory" \
"\n\t\tFirst 8 bytes : %lX" \
"\n\t\tSecond 8 bytes: %lX" \
"\n\t\tLast 8 bytes: %lX\n",
dev_name,
*((unsigned long *)address),
*((unsigned long *)address + 1),
*(unsigned long *)((unsigned char *)address + map_mem_size - sizeof(unsigned long)));
if (cudaSuccess != (cudarc = cudaHostRegister(address, map_mem_size, cudaHostRegisterDefault)))
{
printf("Error: Failed cudaHostRegister: %s, address %p\n", cudaGetErrorString(cudarc), address);
}
}
}
/*Release resources block*/
return EXIT_SUCCESS;
}
运行时调试信息:
用户空间:
./chrdev_test
Opening /dev/chardev file
About to map 8192 bytes of /dev/chardev device memory
mapped /dev/chardev driver 'kmalloc' memory
First 8 bytes : CDFFFFFFFFFFFFAB
Second 8 bytes: AB000000000000EF
Last 8 bytes: EF0000000C0000AA
Error: Failed cudaHostRegister: invalid argument
Unmapping /dev/chardev file
Closing /dev/chardev file
内核空间(tail -f /var/log/syslog):
[ 4814.119537] [chardev] chardev.c, chdv_mmap, line 292:MEM_CHUNK_SIZE 4096, pages_per_buf 1, vsize 8192 vma->vm_pgoff 0
[ 4814.119538] [chardev] chardev.c, chdv_mmap, line 311:PFN : 16306184
[ 4814.119543] [chardev] chardev.c, chdv_mmap, line 330:Mapped 'kzalloced' buffer
[ 4814.119543] First 8 bytes: CDFFFFFFFFFFFFAB
[ 4814.119543] Second 8 bytes: AB000000000000EF
[ 4814.119543] Last 8 bytes: EF0000000C0000AA
谢谢你
解决方法:
成功了!
完整答案可在以下位置找到:
https://devtalk.nvidia.com/default/topic/1014391/cuda-programming-and-performance/registering-mapped-linux-character-device-memory-with-cudahostregister-results-in-invalid-argument/?offset=3#5174771
不过,当内存块超过2页(> 8K)时仍会出现问题,
目前正在与CUDA方面沟通解决…
谢谢,
约尔.