伙伴系统分配器大体上分为两类:__get_free_pages()类函数返回所分配的第一个页面的线性地址;alloc_pages()类函数返回第一个页面的页面描述符地址。不管以哪类函数进行分配,最终都会调用alloc_pages()来分配页面。
为了清楚地了解其分配机制,先给出伙伴系统数据的存储框图:
也就是说,每个order对应一个free_area结构,free_area按不同的迁移类型,分别以链表的方式组织这些空闲内存块。
二、主分配函数
下面我们来看这个函数(在UMA模式下)
- /* Under UMA, alloc_pages() simply forwards to the current node. */
- #define alloc_pages(gfp_mask, order) \
- alloc_pages_node(numa_node_id(), gfp_mask, order)
- /*
-  * Allocate 2^order pages from node @nid, using the zonelist that
-  * matches @gfp_mask. A negative @nid means "use the current node".
-  */
- static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
- unsigned int order)
- {
- /* Unknown node is current node */
- if (nid < 0)
- nid = numa_node_id();
- return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask));
- }
- /* Thin wrapper: no explicit nodemask is given, so pass NULL and let
-    __alloc_pages_nodemask() allow all nodes. */
- static inline struct page *
- __alloc_pages(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist)
- {
- return __alloc_pages_nodemask(gfp_mask, order, zonelist, NULL);
- }
上层分配函数__alloc_pages_nodemask()
- /*
- * This is the 'heart' of the zoned buddy allocator.
- */
- /* Top-level allocation entry: try the fast path first; if that fails,
-    fall back to the slow path, which may sleep and reclaim memory. */
- struct page *
- __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, nodemask_t *nodemask)
- {
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
- struct page *page;
- /* Convert GFP flags to their corresponding migrate type */
- int migratetype = allocflags_to_migratetype(gfp_mask);
- gfp_mask &= gfp_allowed_mask;
- /* lockdep/debugging hook only */
- lockdep_trace_alloc(gfp_mask);
- /* may sleep and be rescheduled here if __GFP_WAIT is set */
- might_sleep_if(gfp_mask & __GFP_WAIT);
- /* fault-injection hook; a no-op unless the corresponding config is set */
- if (should_fail_alloc_page(gfp_mask, order))
- return NULL;
- /*
- * Check the zones suitable for the gfp_mask contain at least one
- * valid zone. It's possible to have an empty zonelist as a result
- * of GFP_THISNODE and a memoryless node
- */
- if (unlikely(!zonelist->_zonerefs->zone))
- return NULL;
- /* The preferred zone is used for statistics later */
- first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone);
- if (!preferred_zone)
- return NULL;
- /* First allocation attempt */
- /* fast path: allocate from the per-CPU lists or the buddy free lists */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, ALLOC_WMARK_LOW|ALLOC_CPUSET,
- preferred_zone, migratetype);
- if (unlikely(!page))/* fast path failed: enter the slow path, which is
-    allowed to wait and to reclaim pages */
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, migratetype);
- /* tracepoint, for debugging/statistics */
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
- return page;
- }
三、从pcp和伙伴系统中正常的分配内存空间
函数get_page_from_freelist()
- /*
- * get_page_from_freelist goes through the zonelist trying to allocate
- * a page.
- */
- /* Walk every candidate zone in the zonelist and try to allocate from
-    it, honouring watermarks and (on NUMA) the zonelist cache (zlc). */
- static struct page *
- get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int migratetype)
- {
- struct zoneref *z;
- struct page *page = NULL;
- int classzone_idx;
- struct zone *zone;
- nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
- int zlc_active = 0; /* set if using zonelist_cache */
- int did_zlc_setup = 0; /* just call zlc_setup() one time */
- /* index of the preferred zone, used for the watermark checks below */
- classzone_idx = zone_idx(preferred_zone);
- zonelist_scan:
- /*
- * Scan zonelist, looking for a zone with enough free.
- * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
- */
- /* iterate over each zone and try to allocate from it */
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask) {
- /* NUMA_BUILD is 0 under UMA, so the zlc check is compiled out */
- if (NUMA_BUILD && zlc_active &&
- !zlc_zone_worth_trying(zonelist, z, allowednodes))
- continue;
- if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed_softwall(zone, gfp_mask))
- goto try_next_zone;
- BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
- /* watermarks must be honoured for this allocation */
- if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
- unsigned long mark;
- int ret;
- /* pick the watermark level selected by alloc_flags */
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- /* watermark is fine: allocate from this zone */
- if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto try_this_zone;
- if (zone_reclaim_mode == 0)/* below the watermark and zone reclaim
-    is disabled: give up on this zone */
- goto this_zone_full;
- /* under UMA this function returns 0 immediately */
- ret = zone_reclaim(zone, gfp_mask, order);
- switch (ret) {
- case ZONE_RECLAIM_NOSCAN:
- /* did not scan */
- goto try_next_zone;
- case ZONE_RECLAIM_FULL:
- /* scanned but unreclaimable */
- goto this_zone_full;
- default:
- /* did we reclaim enough */
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
- goto this_zone_full;
- }
- }
- try_this_zone:/* this zone has enough free pages */
- /* try the per-CPU lists first, then fall back to the buddy lists */
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
- this_zone_full:
- if (NUMA_BUILD)/* compiled out (0) under UMA */
- zlc_mark_zone_full(zonelist, z);
- try_next_zone:
- if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) {
- /*
- * we do zlc_setup after the first zone is tried but only
- * if there are multiple nodes make it worthwhile
- */
- allowednodes = zlc_setup(zonelist, alloc_flags);
- zlc_active = 1;
- did_zlc_setup = 1;
- }
- }
- if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
- /* Disable zlc cache for second zonelist scan */
- zlc_active = 0;
- goto zonelist_scan;
- }
- return page;/* NULL if every zone failed */
- }
四、核心分配函数buffered_rmqueue()
- /*
- * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
- * we cheat by calling it from here, in the order > 0 path. Saves a branch
- * or two.
- */
- /* Order-0 requests are served from the per-CPU page (pcp) lists;
-    requests with order > 0 go straight to the buddy free lists. */
- static inline
- struct page *buffered_rmqueue(struct zone *preferred_zone,
- struct zone *zone, int order, gfp_t gfp_flags,
- int migratetype)
- {
- unsigned long flags;
- struct page *page;
- int cold = !!(gfp_flags & __GFP_COLD);/* caller asked for a cache-cold page */
- int cpu;
- again:
- cpu = get_cpu();
- if (likely(order == 0)) {/* single page: use the per-CPU lists */
- struct per_cpu_pages *pcp;
- struct list_head *list;
- /* per-CPU page set of this zone for the current CPU */
- pcp = &zone_pcp(zone, cpu)->pcp;
- list = &pcp->lists[migratetype];/* pcp list of the requested migrate type */
- /* Interrupts must be disabled here: reclaim may send IPIs forcing
-    every CPU to drain its pcp lists, and interrupt handlers also
-    allocate single pages from them. */
- local_irq_save(flags);
- if (list_empty(list)) {/* pcp list is empty: refill it */
- /* pull pcp->batch pages (one refill unit) from the buddy system */
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype, cold);
- /* still empty: the refill failed, report failure */
- if (unlikely(list_empty(list)))
- goto failed;
- }
- /* Cold request (hardware cache hotness does not matter): take the
-    list tail, the page least likely to be cache-hot. */
- if (cold)
- page = list_entry(list->prev, struct page, lru);
- else/* Hot request: take the list head, the page most recently freed
-    to the pcp list and therefore the most cache-hot. */
- page = list_entry(list->next, struct page, lru);
- list_del(&page->lru);/* unlink the page from the pcp list */
- pcp->count--;/* one page fewer cached on this CPU */
- }
- else {/* order > 0: bypass the pcp lists, allocate from the buddy system */
- if (unlikely(gfp_flags & __GFP_NOFAIL)) {
- /*
- * __GFP_NOFAIL is not to be used in new code.
- *
- * All __GFP_NOFAIL callers should be fixed so that they
- * properly detect and handle allocation failures.
- *
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with
- * __GFP_NOFAIL.
- */
- WARN_ON_ONCE(order > 1);
- }
- /* disable interrupts and take the zone's buddy-list lock */
- spin_lock_irqsave(&zone->lock, flags);
- /* carve a block of the requested order/type out of the buddy lists */
- page = __rmqueue(zone, order, migratetype);
- /* 1 << order pages left the free lists: update the zone's counter */
- __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order));
- spin_unlock(&zone->lock);/* drop only the spinlock; interrupts stay off
-    until the statistics below are updated */
- if (!page)
- goto failed;
- }
- /* event counters, for statistics/debugging */
- __count_zone_vm_events(PGALLOC, zone, 1 << order);
- zone_statistics(preferred_zone, zone);
- local_irq_restore(flags);/* re-enable interrupts */
- put_cpu();
- VM_BUG_ON(bad_range(zone, page));
- /* Final sanity checks and setup. If the page's flags show it is
-    corrupted, retry and try to obtain a different page. */
- if (prep_new_page(page, order, gfp_flags))
- goto again;
- return page;
- failed:
- local_irq_restore(flags);
- put_cpu();
- return NULL;
- }