Linux 内核学习(4) - 内存管理教程

内存管理

内存初始化

内存布局探测：E820图

E820介绍
- E820是x86机器上由BIOS提供的一个中断服务：中断号是0x15，调用时AX寄存器必须为0xE820；每次调用返回一段内存区域的起始地址、大小以及它的属性（可用的RAM，或被BIOS保留的）

注：以下代码有的是linux 5.10，有的是linux 2.6.30.4

代码1 arch/x86/boot/memory.c

/*
 * Query the BIOS memory map via INT 0x15 with AX = 0xE820 and store the
 * returned entries in boot_params.e820_table.  The number of stored
 * entries is written to boot_params.e820_entries (0 if the map had to
 * be discarded).
 */
static void detect_memory_e820(void)
{
    static struct boot_e820_entry buf; /* static storage, so it starts zeroed */
    struct biosregs ireg, oreg;
    int count = 0;

    /* Set up the input registers for the E820 call. */
    initregs(&ireg);
    ireg.ax  = 0xe820;      /* function number required by the interface */
    ireg.cx  = sizeof(buf); /* size of the result buffer */
    ireg.edx = SMAP;        /* signature expected by the BIOS */
    ireg.di  = (size_t)&buf;

    /*
     * Why a single scratch buffer: at least one BIOS assumes every
     * e820 call is handed the same buffer as the previous call and
     * only rewrites the fields that changed.  Each result is therefore
     * received in buf and then copied into the table.
     *
     * ACPI 3+ extended attributes are intentionally ignored: some
     * BIOSes report the valid bit as zero for every range, and none
     * of the other attribute bits are used at present.
     */
    for (;;) {
        /* Fetch one entry through the 0x15 interrupt. */
        intcall(0x15, &ireg, &oreg);

        /* EBX is the continuation value to pass to the next call. */
        ireg.ebx = oreg.ebx;

        /*
         * A set carry flag reports an error.  BIOSes that end the
         * chain this way (rather than with %ebx = 0) do not always
         * return the SMAP signature on that last, failing probe, so
         * this check has to come before the signature check.
         */
        if (oreg.eflags & X86_EFLAGS_CF)
            break;

        /*
         * Losing the SMAP signature in mid-search means the map so
         * far may be partial, complete, or garbage; there is no way
         * to tell, so throw everything away and report failure.
         */
        if (oreg.eax != SMAP) {
            count = 0;
            break;
        }

        /* Copy the scratch buffer into the next free table slot. */
        boot_params.e820_table[count++] = buf;

        /* Stop once EBX comes back as 0 or the table is full. */
        if (!ireg.ebx || count >= ARRAY_SIZE(boot_params.e820_table))
            break;
    }

    boot_params.e820_entries = count;
}

代码2 e820 entry定义 /usr/include/x86\_64-linux-gnu/asm/bootparam.h

/*
 * One entry of the E820 memory map: a memory range together with its type.
 */
struct boot_e820_entry {
    __u64 addr;  /* start address of the range */
    __u64 size;  /* size of the range; the range ends at addr + size */
    __u32 type;  /* attribute of the range (e.g. usable RAM or reserved by the BIOS) */
} __attribute__((packed));  /* packed: the compiler inserts no padding */

代码3 打印E820图 arch/x86/kernel/e820.c

/*
 * Log every entry of the global e820 map.  Each entry is printed as
 * " <who>: <start> - <end> " followed by its type and a newline, where
 * <end> is addr + size of the entry.
 *
 * @who: label printed at the start of each line
 */
void __init e820_print_map(char *who)
{
    int i = 0;

    while (i < e820.nr_map) {
        /* start address of this entry */
        unsigned long long start = (unsigned long long) e820.map[i].addr;
        /* end address of this entry: start plus size */
        unsigned long long end = (unsigned long long)
            (e820.map[i].addr + e820.map[i].size);

        printk(KERN_INFO " %s: %016Lx - %016Lx ", who, start, end);
        e820_print_type(e820.map[i].type); /* entry type */
        printk(KERN_CONT "\n");
        i++;
    }
}

memblock

用于启动阶段的一个简单的分配器，它负责page allocator初始化之前的内存分配管理，并在系统boot阶段满足大块内存的请求（请求大小超过page allocator的最大限制）

实现：所有状态都保存在一个全局变量中

代码4 include/linux/memblock.h

/**
 * struct memblock_region - represents a memory region
 * @base: base address of the region
 * @size: size of the region
 * @flags: memory region attributes
 * @nid: NUMA node id
 */
struct memblock_region {
    phys_addr_t base;
    phys_addr_t size;
    enum memblock_flags flags;
#ifdef CONFIG_NEED_MULTIPLE_NODES
    int nid;  // NUMA (non-uniform memory access) node id; only present with CONFIG_NEED_MULTIPLE_NODES
#endif
};

/**
 * struct memblock_type - collection of memory regions of certain type
 * @cnt: number of regions
 * @max: size of the allocated array
 * @total_size: size of all regions
 * @regions: array of regions
 * @name: the memory type symbolic name
 */
struct memblock_type {  // a collection of memory regions
    unsigned long cnt;  // number of regions
    unsigned long max;  // size of the allocated array
    phys_addr_t total_size;  // size of all regions
    struct memblock_region *regions;  // points to an array of regions
    char *name;  // symbolic name of this memory type
};

/**
 * struct memblock - memblock allocator metadata
 * @bottom_up: is bottom up direction?
 * @current_limit: physical address of the current allocation limit
 * @memory: usable memory regions
 * @reserved: reserved memory regions
 */
struct memblock {
    bool bottom_up;  /* is bottom up direction? */
    phys_addr_t current_limit;  // physical address of the current allocation limit
    struct memblock_type memory;    // usable memory regions
    struct memblock_type reserved;  // reserved memory regions
};

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-DzwrtMg0-1608991788791)(D:\微云同步助手\同步的文件\学习\Linux内核学习.assets\image-20201226152333853.png)]

代码5 添加内存区域 mm/memblock.c

/**
 * memblock_add_range - add new memblock region
 * @type: memblock type to add new region into
 * @base: base address of the new region
 * @size: size of the new region
 * @nid: nid of the new region
 * @flags: flags of the new region
 *
 * Add new memblock region [@base, @base + @size) into @type.  The new region
 * is allowed to overlap with existing ones - overlaps don't affect already
 * existing regions.  @type is guaranteed to be minimal (all neighbouring
 * compatible regions are merged) after the addition.
 *
 * Return:
 * 0 on success, -errno on failure.
 */
static int __init_memblock memblock_add_range(struct memblock_type *type,
                phys_addr_t base, phys_addr_t size,
                int nid, enum memblock_flags flags)
{
    bool insert = false;
    phys_addr_t obase = base;  /* original base, restored at the start of each pass */
    /*
     * NOTE(review): memblock_cap_size() receives @size by pointer and may
     * adjust it; presumably it caps the size so that base + size does not
     * overflow - confirm against its definition.
     */
    phys_addr_t end = base + memblock_cap_size(base, &size);
    int idx, nr_new;
    struct memblock_region *rgn;

    if (!size)
        return 0;

    /* special case for empty array */
    if (type->regions[0].size == 0) {  // no entry yet: store the new region directly in slot 0
        /* an empty array is expected to have cnt == 1 and total_size == 0 */
        WARN_ON(type->cnt != 1 || type->total_size);
        type->regions[0].base = base;
        type->regions[0].size = size;
        type->regions[0].flags = flags;
        memblock_set_region_node(&type->regions[0], nid);
        type->total_size = size;
        return 0;
    }
repeat:
    /*
     * The following is executed twice.  Once with %false @insert and
     * then with %true.  The first counts the number of regions needed
     * to accommodate the new area.  The second actually inserts them.
     */
    base = obase;
    nr_new = 0;

    for_each_memblock_type(idx, type, rgn) {  // walk the existing regions and look for overlaps
        phys_addr_t rbase = rgn->base;
        phys_addr_t rend = rbase + rgn->size;

        if (rbase >= end)  // this region starts at or after the end of the new range: stop scanning
            break;
        if (rend <= base)
            continue;  // this region ends at or before the current base: no overlap
        /*
         * @rgn overlaps.  If it separates the lower part of new
         * area, insert that portion.
         */
        if (rbase > base) {
#ifdef CONFIG_NEED_MULTIPLE_NODES
            WARN_ON(nid != memblock_get_region_node(rgn));
#endif
            WARN_ON(flags != rgn->flags);
            nr_new++;
            if (insert)
                memblock_insert_region(type, idx++, base,
                               rbase - base, nid,
                               flags);
        }
        /* area below @rend is dealt with, forget about it */
        base = min(rend, end);
    }

    /* insert the remaining portion */
    if (base < end) {  // whatever is left of the new range becomes one more region
        nr_new++;
        if (insert)
            memblock_insert_region(type, idx, base, end - base,
                           nid, flags);
    }

    if (!nr_new)
        return 0;

    /*
     * If this was the first round, resize array and repeat for actual
     * insertions; otherwise, merge and return.
     */
    if (!insert) {
        while (type->cnt + nr_new > type->max)  // grow the array while it cannot hold the new regions
            if (memblock_double_array(type, obase, size) < 0)
                return -ENOMEM;
        insert = true;
        goto repeat;  // run the loop again, this time actually inserting the regions
    } else {
        memblock_merge_regions(type);  // merge neighbouring regions
        return 0;
    }
}

将一段区域设为可用 memblock_add_node
将一段区域设为保留 memblock_reserve
从memblock中分配内存:

其基本算法是，找到在memblock.memory但不在memblock.reserved的满足size大小的区域，然后将该段区域加入到memblock.reserved中

代码6 memblock分配内存

/*
 * memblock_alloc_internal - signature only; the function body is not
 * reproduced in this article.
 * NOTE(review): according to the surrounding text, allocation looks for a
 * range that is in memblock.memory but not in memblock.reserved and large
 * enough for @size, then adds that range to memblock.reserved - confirm
 * against mm/memblock.c.
 */
static void * __init memblock_alloc_internal(
                phys_addr_t size, phys_addr_t align,
                phys_addr_t min_addr, phys_addr_t max_addr,
                int nid, bool exact_nid)

memblock释放内存：memblock_free()
Linux Kernel中memblock的使用

kernel将自己占用的内存部分设为reserved，例如kernel的image所占内存,initrd所占的内存等
将e820探测的可用内存加入到memblock.memory中
总而言之，系统的空闲内存存在于memory中，但不包括reserved的部分

page allocator

Linux内核在运行阶段可用的大内存分配器，以页为单位
分配的大小以2的幂个页面为单位，范围从 $2^{0}$ 到 $2^{\text{MAX\_ORDER}-1}$ 个页面，MAX\_ORDER可以通过编译选项CONFIG_FORCE_MAX_ZONEORDER配置，默认是11。即一次最大可以请求 $2^{10}$ 个连续页面，一个页面是4K（共4MB）
涉及的基本概念

Node: NUMA的概念，即系统中的内存节点，每个node都在struct pglist_data *node_data[]中有对应的一项，以node的ID为序号。NUMA的内存布局探测是在ACPI中完成的，和E820不一样。在代码中对应NODE_DATA()
Zone: 可以理解为每个页面的类型

  
  

- 每个node中都有对应的zone, 存放在`node_data[node.id]->node_zones[MAX_NR_ZONES];`
- Zone的类型有：`ZONE_DMA`;`ZONE_DMA32`;`ZONE_NORMAL`

Zone order：即zone的查找次序，它决定当前请求的zone类型无法满足时，接下来应该到哪个zone中去分配。比如，用户可能想请求ZONE_HIGHMEM，但ZONE_HIGHMEM中的内存已经分配完了，这时候可以让它到其它的ZONE中分配
Kernel中有两种类型的order

  
  

- A: NODE序，即所有请求都优先在本地节点完成
- B: ZONE序，即在各个节点中优先分配相同类型的内存

Kernel中每一个页面都有一个表示结构，即struct page，存放在struct page mem\_map[]中，它以物理页面的序号作为索引，每个页面只能属于一个zone，函数page\_zone(page)可以找到page所在的zone

  
  
 ![image-20201226163804706]()

算法：伙伴系统

需要高效，且尽量避免碎片
按照2的幂大小来组织内存，幂（order）为0~MAX\_ORDER-1，对应有MAX\_ORDER条链表来组织空闲块
每个ZONE都有MAX\_ORDER条这样的链表，存放在zone->free_area[MAX_ORDER]中
分配时，根据请求的大小匹配到最佳空闲区，然后进行分配
如果最佳空闲区（大小为 $2^{n}$ 个页面）没有空闲块了，则逐级往上请求，例如去取 $2^{n+1}$ 个页面的空闲块，然后将取到的上层块进行拆分
![image-20201226164527612]()
释放页面时，看它是否和相近的合并，如这种情况：
![image-20201226164655858]()
释放页面时，如果可以合并则拼成一个大的空闲区并将之移动到上层，一直这样合并下去：
![image-20201226164721833]()
Linux Kernel中的页面组织方式

  
  

- 为了让`struct page`尽可能的小，page allocator和其它子系统复用很多的成员
- `page->lru`链表用来链接对应order的空闲块
- `page->_mapcount`为`PAGE_BUDDY_MAPCOUNT_VALUE`时表示以该page为起始页面的内存块是空闲的（中间块的计数为-1），如果分配出去则置为-1
- `page->private`表示以该page为起始页面的内存块所在的order，即该内存块的大小是 $2^{\text{order}}$ 个页面，其中 order = `page->private`
- `page`所属的node和zone的信息都被编码在`page->flags`中，通过`page_to_nid(page)`可得到该page所在的node，通过`page_zonenum(page)`可得到该page所属的zone类型，`page_zone(page)`直接返回page所在的zone
- 找到自己的小伙伴的过程 
  - 先将自己的页号 对 (1 << MAX\_ORDER) 取余，即：  
     `page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1)`
  - 然后找到小伙伴的index也就是 `__find_buddy_index()`  
     `buddy_idx = page_idx ^ (1 << order)`
  - 最后找到小伙伴的页面  
     `buddy = page + (buddy_idx - page_idx)`

基于 page migration type 的页面分组：

  
  

- 将页面按照migration进行分组，为了避免外碎片而导入的机制，对应migration type的请求都会到相应的区域中寻找
- > 内部碎片的产生：因为所有的内存分配必须起始于可被 4、8 或 16 整除（视处理器体系结构而定）的地址，或者因为MMU的分页机制的限制，内存分配算法只能把预定大小的内存块分配给客户。假设某个客户请求一个 43 字节的内存块，因为没有正好合适大小的内存块，它可能会获得 44 字节、48 字节等稍大一点的内存块，这种由所需大小向上取整而产生的多余空间就叫内部碎片。  
  >  外部碎片的产生： 频繁的分配与回收物理页面会导致大量的、连续且小的页面块夹杂在已分配的页面中间，就会产生外部碎片。
- 如果想请求的type不能满足，会fallback到其它类型中
- 每个zone都有自己单独的分组
- `enum {MIGRATE_UNMOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_PCPTYPES, MIGRATE_RESERVE}`

Linux Kernel为了加速单个页面分配释放同时又提高cache利用率而导入的缓存

  
  

- 每个zone都有一个称为pcp（struct per\_cpu\_pages）的percpu缓存，例如  
   `pcp = &this_cpu_ptr(zone->pageset)->pcp`
- 如果是cold的页面，也就是说长时间没有使用、可能已经不在cache中的页面，在释放的时候加至list末尾，否则加到头部(为了优先分配hot页面)

位于：include/linux/mmzone.h

内存映射

硬件背景

使用从page allocator中得到的物理页面
- 页面对应的是物理地址
- CPU仅能使用虚拟地址来访问内存
- 所以，应将物理地址关联到CPU寻址的虚拟地址
x86背景
- 由于历史原因，x86寻址比较复杂：段映射+页面映射
- 段映射在x86\_64中被废弃掉了
- 简言之，就是一个radix-tree like的算法，将线性地址分成几个区域，然后各区域值作为对应页表(paging-structure)的偏移
CPU模式不同，寻址方式上有些小差别
- 32位：2层页表(10+10+12)
- 32位 PAE: 3层页表(2+9+9+12)
- 64位：4层页表(9+9+9+9+12)
如果映射有异常，CPU会产生page fault异常
TLB介绍
- TLB用来缓存从虚拟地址到物理地址的映射
- 在reload页表的时候会自动刷新
- 如果映射关系有修改需要手动刷新TLB项

Linux Kernel 地址空间

起源是CPU的内存保护机制：特权级和非特权级
- 特权级可以做一切事情
- 非特权级不能执行特权指令来修改系统资源
两大空间：内核空间和用户空间
- 内核空间位于特权层
- 用户空间位于非特权层
- x86中有4个特权级，但是Linux内核只用了两级
- x86\_64在硬件上同样有4个特权级（ring 0~3），Linux同样只使用其中两级（ring 0和ring 3）
内核空间和用户空间复用一部分地址空间
- 用来避免同一进程内核态和用户态转换时对TLB的刷新
- 在x86 CPU是内核空间1G，用户空间3G
- 在x86\_64（4层页表）中，0x0000000000000000~0x00007fffffffffff为用户空间，0xffff800000000000以上为内核空间（其中0xffff880000000000是旧版内核中直接映射区即PAGE\_OFFSET的起始地址）
将映射层次抽象成PGD, PUD, PMD, PTE，如果某层不存在，则其对应位数为0
在内核中，虚拟地址的映射如下：
- 基础映射，物理地址和虚拟地址在偏移地址(PAGE\_OFFSET)上是以1比1的关系映射的，即在内核中，物理地址PA对应的虚拟地址为: VA=PAGE\_OFFSET+PA
- 内存拼接的映射，即vmalloc()映射的内存，用来将不连续的物理内存拼接成连续的虚拟地址供用户使用，用来减少内存碎片

Linux Kernel 内存映射

32位CPU上的映射问题
- 内核仅能使用1G的地址空间，也就是在同一时刻最大能使用1G的物理地址空间，如果物理内存超过1G，如何访问1G以上的内存？
- 在这种情况下，kernel将一部分地址用来做动态映射，将不能直接访问的物理内存映射到这个地址中
- ![image-20201226213145975]()
各种API
- vmalloc的APIs:
  - void *vmalloc(unsigned long size) / vfree(void *p)
- kmap的APIs：
  - void *kmap(struct page *page) / kunmap(struct page *page)
- kmap\_atomic的APIs：
  - void *kmap_atomic(struct page *page) / kunmap_atomic(void *addr)

slab allocator

目的：相对于提供页面大小(4K)的page allocator来说，slab是提供小块内存的分配器
背景：

当前linux内核中提供了3种slab分配器，分别是slab,slub,slob，他们提供给外部使用的API都是一样的，在编译内核的时候只能选择其一
默认为slub，slob仅适合用于嵌入式中（占用资源极少）
slab vs slub

  
  

- slab管理结构很大，设计很复杂，slub简化了一切
- slub便于调试
- 下面的算法分析以slub为例

slab简而言之，就是一个对象的缓存器，当有对象释放的时候，就缓存到slab里面，然后需要分配的时候，就从slab缓存中取出来
slab以page allocator作为后端，当缓存对象不够时，就从page allocator中取
API

  
  

- 创建一个slab: `struct kmem_cache *kmem_cache_create(name, size, align, flags, ctor)`
- 销毁slab: `kmem_cache_destroy`
- 从slab中分配对象 `kmem_cache_alloc`
- 将内存释放到slab中：`kmem_cache_free`

除此之外，slab还内建了一些slab cache, 用于不需要特殊处理的对象分配，对用户可见的接口为

  
  

- 内存分配 `kmalloc`
- 内存释放 `kfree`
- 对于大块的内存请求，会落入到page allocator中

Linux 内核学习(4) - 内存管理教程

内存管理

内存初始化

内存布局探测：E820图

memblock

page allocator

内存映射

硬件背景

Linux Kernel 地址空间

Linux Kernel 内存映射

slab allocator

添加新评论，含*的栏目为必填

最新文章

网站分类

最近回复

其它

Linux 内核学习(4) - 内存管理教程

内存管理

内存初始化

内存布局探测：E820图

memblock

page allocator

内存映射

硬件背景

Linux Kernel 地址空间

Linux Kernel 内存映射

slab allocator

相关文章推荐

添加新评论，含*的栏目为必填

最新文章

网站分类

最近回复

其它