Index: linux-2.6.11/include/asm-xen/asm-i386/pgtable-3level-defs.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.11/include/asm-xen/asm-i386/pgtable-3level-defs.h	2005-06-24 16:49:12.000000000 +0200
@@ -0,0 +1,25 @@
+#ifndef _I386_PGTABLE_3LEVEL_DEFS_H
+#define _I386_PGTABLE_3LEVEL_DEFS_H
+
+#define HAVE_SHARED_KERNEL_PMD 0
+
+/*
+ * PGDIR_SHIFT determines what a top-level page table entry can map
+ */
+#define PGDIR_SHIFT 30
+#define PTRS_PER_PGD 4
+#define PTRS_PER_PGD_NO_HV 4
+
+/*
+ * PMD_SHIFT determines the size of the area a middle-level
+ * page table can map
+ */
+#define PMD_SHIFT 21
+#define PTRS_PER_PMD 512
+
+/*
+ * entries per page directory level
+ */
+#define PTRS_PER_PTE 512
+
+#endif /* _I386_PGTABLE_3LEVEL_DEFS_H */
Index: linux-2.6.11/include/asm-xen/asm-i386/pgtable-3level.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.11/include/asm-xen/asm-i386/pgtable-3level.h	2005-06-24 16:49:12.000000000 +0200
@@ -0,0 +1,187 @@
+#ifndef _I386_PGTABLE_3LEVEL_H
+#define _I386_PGTABLE_3LEVEL_H
+
+#include <asm-generic/pgtable-nopud.h>
+
+/*
+ * Intel Physical Address Extension (PAE) Mode - three-level page
+ * tables on PPro+ CPUs.
+ *
+ * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
+ */
+
+#define pte_ERROR(e) \
+	printk("%s:%d: bad pte %p(%08lx%08lx).\n", __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
+#define pmd_ERROR(e) \
+	printk("%s:%d: bad pmd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pmd_val(e))
+#define pgd_ERROR(e) \
+	printk("%s:%d: bad pgd %p(%016Lx).\n", __FILE__, __LINE__, &(e), pgd_val(e))
+
+#define pud_none(pud) 0
+#define pud_bad(pud) 0
+#define pud_present(pud) 1
+
+/*
+ * Is the pte executable?
+ */
+static inline int pte_x(pte_t pte)
+{
+	return !(pte_val(pte) & _PAGE_NX);
+}
+
+/*
+ * All present user-pages with !NX bit are user-executable:
+ */
+static inline int pte_exec(pte_t pte)
+{
+	return pte_user(pte) && pte_x(pte);
+}
+/*
+ * All present pages with !NX bit are kernel-executable:
+ */
+static inline int pte_exec_kernel(pte_t pte)
+{
+	return pte_x(pte);
+}
+
+/* Rules for using set_pte: the pte being assigned *must* be
+ * either not present or in a state where the hardware will
+ * not attempt to update the pte. In places where this is
+ * not possible, use pte_get_and_clear to obtain the old pte
+ * value and then use set_pte to update it. -ben
+ */
+#define __HAVE_ARCH_SET_PTE_ATOMIC
+
+#if 1
+/* use writable pagetables */
+static inline void set_pte(pte_t *ptep, pte_t pte)
+{
+	ptep->pte_high = pte.pte_high;
+	smp_wmb();
+	ptep->pte_low = pte.pte_low;
+}
+# define set_pte_atomic(pteptr,pteval) \
+		set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval))
+#else
+/* no writable pagetables */
+# define set_pte(pteptr,pteval) \
+		xen_l1_entry_update((pteptr), (pteval))
+# define set_pte_atomic(pteptr,pteval) set_pte(pteptr,pteval)
+#endif
+
+#ifdef CONFIG_XEN_SHADOW_MODE
+# define set_pmd(pmdptr,pmdval) \
+		set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
+# define set_pud(pudptr,pudval) \
+		set_64bit((unsigned long long *)(pudptr),pud_val(pudval))
+#else
+# define set_pmd(pmdptr,pmdval) \
+		xen_l2_entry_update((pmdptr), (pmdval))
+# define set_pud(pudptr,pudval) \
+		xen_l3_entry_update((pudptr), (pudval))
+#endif
+
+/*
+ * Pentium-II erratum A13: in PAE mode we explicitly have to flush
+ * the TLB via cr3 if the top-level pgd is changed...
+ * We do not let the generic code free and clear pgd entries due to
+ * this erratum.
+ */
+static inline void pud_clear (pud_t * pud) { }
+
+#define pmd_page(pmd) (pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT))
+
+#define pmd_page_kernel(pmd) \
+((unsigned long) __va(pmd_val(pmd) & PAGE_MASK))
+
+#define pud_page(pud) \
+((struct page *) __va(pud_val(pud) & PAGE_MASK))
+
+#define pud_page_kernel(pud) \
+((unsigned long) __va(pud_val(pud) & PAGE_MASK))
+
+
+/* Find an entry in the second-level page table.. */
+#define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \
+			pmd_index(address))
+
+static inline pte_t ptep_get_and_clear(pte_t *ptep)
+{
+	pte_t res;
+
+	/* xchg acts as a barrier before the setting of the high bits */
+	res.pte_low = xchg(&ptep->pte_low, 0);
+	res.pte_high = ptep->pte_high;
+	ptep->pte_high = 0;
+
+	return res;
+}
+
+static inline int pte_same(pte_t a, pte_t b)
+{
+	return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
+}
+
+#define pte_page(x) pfn_to_page(pte_pfn(x))
+
+static inline int pte_none(pte_t pte)
+{
+	return !pte.pte_low && !pte.pte_high;
+}
+
+#define INVALID_P2M_ENTRY (~0U)
+#define FOREIGN_FRAME(_m) ((_m) | (1UL<<((sizeof(unsigned long)*8)-1)))
+#define pte_mfn(_pte) ((_pte).pte_low >> PAGE_SHIFT) /* FIXME */
+#define pte_pfn(_pte) \
+({ \
+	unsigned long mfn = pte_mfn(_pte); \
+	unsigned long pfn = mfn_to_pfn(mfn); \
+	if ((pfn >= max_mapnr) || (pfn_to_mfn(pfn) != mfn)) \
+		pfn = max_mapnr; /* special: force !pfn_valid() */ \
+	pfn; \
+})
+
+extern unsigned long long __supported_pte_mask;
+
+static inline pte_t pfn_pte_ma(unsigned long page_nr, pgprot_t pgprot)
+{
+	pte_t pte;
+
+	pte.pte_high = (page_nr >> (32 - PAGE_SHIFT)) | \
+			(pgprot_val(pgprot) >> 32);
+	pte.pte_high &= (__supported_pte_mask >> 32);
+	pte.pte_low = ((page_nr << PAGE_SHIFT) | pgprot_val(pgprot)) & \
+			__supported_pte_mask;
+	return pte;
+}
+
+static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
+{
+	return pfn_pte_ma(pfn_to_mfn(page_nr), pgprot);
+}
+
+static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
+{
+	BUG(); panic("needs review");
+	return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | \
+			pgprot_val(pgprot)) & __supported_pte_mask);
+}
+
+/*
+ * Bits 0, 6 and 7 are taken in the low part of the pte,
+ * put the 32 bits of offset into the high part.
+ */
+#define pte_to_pgoff(pte) ((pte).pte_high)
+#define pgoff_to_pte(off) ((pte_t) { _PAGE_FILE, (off) })
+#define PTE_FILE_MAX_BITS 32
+
+/* Encode and de-code a swap entry */
+#define __swp_type(x) (((x).val) & 0x1f)
+#define __swp_offset(x) ((x).val >> 5)
+#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
+#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
+#define __swp_entry_to_pte(x) ((pte_t){ 0, (x).val })
+
+#define __pmd_free_tlb(tlb, x) do { } while (0)
+
+#endif /* _I386_PGTABLE_3LEVEL_H */
Index: linux-2.6.11/arch/xen/i386/mm/init.c
===================================================================
--- linux-2.6.11.orig/arch/xen/i386/mm/init.c	2005-06-24 16:12:18.000000000 +0200
+++ linux-2.6.11/arch/xen/i386/mm/init.c	2005-06-24 16:50:05.000000000 +0200
@@ -57,9 +57,10 @@ static pmd_t * __init one_md_table_init(
 {
 	pud_t *pud;
 	pmd_t *pmd_table;
-	
+
 #ifdef CONFIG_X86_PAE
 	pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE);
+	make_page_readonly(pmd_table);
 	set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT));
 	pud = pud_offset(pgd, 0);
 	if (pmd_table != pmd_offset(pud, 0))
@@ -115,13 +116,13 @@ static void __init page_table_range_init
 	pmd_idx = pmd_index(vaddr);
 	pgd = pgd_base + pgd_idx;
 
-	for ( ; (pgd_idx < PTRS_PER_PGD_NO_HV) && (vaddr != end); pgd++, pgd_idx++) {
+	for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) {
 		if (pgd_none(*pgd))
 			one_md_table_init(pgd);
 		pud = pud_offset(pgd, vaddr);
 		pmd = pmd_offset(pud, vaddr);
 		for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) {
-			if (pmd_none(*pmd))
+			if (vaddr < HYPERVISOR_VIRT_START && pmd_none(*pmd))
 				one_page_table_init(pmd);
 
 			vaddr += PMD_SIZE;
@@ -160,13 +161,26 @@ static void __init kernel_physical_mappi
 	pmd_idx = pmd_index(PAGE_OFFSET);
 	pte_ofs = pte_index(PAGE_OFFSET);
 
-	for (; pgd_idx < PTRS_PER_PGD_NO_HV; pgd++, pgd_idx++) {
+	for (; pgd_idx < PTRS_PER_PGD; pgd++, pgd_idx++) {
+#ifdef CONFIG_XEN
+		/*
+		 * Native Linux doesn't have PAE paging enabled yet at
+		 * this point.  When running as a Xen domain we are in
+		 * PAE mode already, thus we can't simply hook an empty
+		 * pmd.  That would kill the mappings we are currently
+		 * using ...
+		 */
+		pmd = pmd_offset(pud_offset(pgd, PAGE_OFFSET), PAGE_OFFSET);
+#else
 		pmd = one_md_table_init(pgd);
+#endif
 		if (pfn >= max_low_pfn)
 			continue;
 		pmd += pmd_idx;
 		for (; pmd_idx < PTRS_PER_PMD && pfn < max_low_pfn; pmd++, pmd_idx++) {
 			unsigned int address = pfn * PAGE_SIZE + PAGE_OFFSET;
+			if (address >= HYPERVISOR_VIRT_START)
+				continue;
 
 			/* Map with big pages if possible, otherwise create normal page tables. */
 			if (cpu_has_pse) {
@@ -353,6 +367,7 @@ static void __init pagetable_init (void)
 	 * page directory, write-protect the new page directory, then switch to
 	 * it. We clean up by write-enabling and then freeing the old page dir.
 	 */
+#ifndef CONFIG_X86_PAE
 	memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
 	make_page_readonly(pgd_base);
 	xen_pgd_pin(__pa(pgd_base));
@@ -361,8 +376,31 @@ static void __init pagetable_init (void)
 	make_page_writable(old_pgd);
 	__flush_tlb_all();
 	free_bootmem(__pa(old_pgd), PAGE_SIZE);
-	init_mm.context.pinned = 1;
+#else
+	{
+		pud_t *old_pud = pud_offset(old_pgd+3, PAGE_OFFSET);
+		pmd_t *old_pmd = pmd_offset(old_pud, PAGE_OFFSET);
+		pmd_t *new_pmd = alloc_bootmem_low_pages(PAGE_SIZE);
+
+		memcpy(new_pmd, old_pmd, PAGE_SIZE);
+		memcpy(pgd_base, old_pgd, PTRS_PER_PGD_NO_HV*sizeof(pgd_t));
+		set_pgd(&pgd_base[3], __pgd(__pa(new_pmd) | _PAGE_PRESENT));
+
+		make_page_readonly(new_pmd);
+		make_page_readonly(pgd_base);
+		xen_pgd_pin(__pa(pgd_base));
+		load_cr3(pgd_base);
+		xen_pgd_unpin(__pa(old_pgd));
+		make_page_writable(old_pgd);
+		make_page_writable(old_pmd);
+		__flush_tlb_all();
+
+		free_bootmem(__pa(old_pgd), PAGE_SIZE);
+		free_bootmem(__pa(old_pmd), PAGE_SIZE);
+	}
+#endif
+	init_mm.context.pinned = 1;
 
 	kernel_physical_mapping_init(pgd_base);
 	remap_numa_kva();
 
@@ -375,7 +413,7 @@ static void __init pagetable_init (void)
 
 	permanent_kmaps_init(pgd_base);
 
-#ifdef CONFIG_X86_PAE
+#if 0 /* def CONFIG_X86_PAE */
 	/*
 	 * Add low memory identity-mappings - SMP needs it when
 	 * starting up on an AP from real-mode. In the non-PAE
@@ -383,7 +421,7 @@ static void __init pagetable_init (void)
 	 * All user-space mappings are explicitly cleared after
 	 * SMP startup.
 	 */
-	pgd_base[0] = pgd_base[USER_PTRS_PER_PGD];
+	set_pgd(&pgd_base[0], pgd_base[USER_PTRS_PER_PGD]);
 #endif
 }
 
@@ -418,7 +456,7 @@ void zap_low_mappings (void)
 	 * us, because pgd_clear() is a no-op on i386.
 	 */
 	for (i = 0; i < USER_PTRS_PER_PGD; i++)
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
 		set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
 #else
 		set_pgd(swapper_pg_dir+i, __pgd(0));
@@ -429,9 +467,10 @@ void zap_low_mappings (void)
 #ifndef CONFIG_DISCONTIGMEM
 void __init zone_sizes_init(void)
 {
-	unsigned long zones_size[MAX_NR_ZONES] = {0, 0, 0};
+	unsigned long zones_size[MAX_NR_ZONES] =
+		{ [ 0 ... (MAX_NR_ZONES-1) ] = 0 };
 	unsigned int /*max_dma,*/ high, low;
-	
+
 	/*
 	 * XEN: Our notion of "DMA memory" is fake when running over Xen.
 	 * We simply put all RAM in the DMA zone so that those drivers which
@@ -442,7 +481,7 @@ void __init zone_sizes_init(void)
 	/*max_dma = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;*/
 	low = max_low_pfn;
 	high = highend_pfn;
-	
+
 	/*if (low < max_dma)*/
 	zones_size[ZONE_DMA] = low;
 	/*else*/ {
@@ -549,10 +588,12 @@ void __init paging_init(void)
 
 	pagetable_init();
 
-#ifdef CONFIG_X86_PAE
+#if defined(CONFIG_X86_PAE) && !defined(CONFIG_XEN)
 	/*
 	 * We will bail out later - printk doesn't work right now so
 	 * the user would just see a hanging kernel.
+	 * When running as a Xen domain we are already in PAE mode at
+	 * this point.
 	 */
 	if (cpu_has_pae)
 		set_in_cr4(X86_CR4_PAE);
@@ -729,8 +770,13 @@ void __init pgtable_cache_init(void)
 			panic("pgtable_cache_init(): cannot create pmd cache");
 	}
 	pgd_cache = kmem_cache_create("pgd",
+#if 0 /* How the heck _this_ works in native linux ??? */
 				PTRS_PER_PGD*sizeof(pgd_t),
 				PTRS_PER_PGD*sizeof(pgd_t),
+#else
+				PAGE_SIZE,
+				PAGE_SIZE,
+#endif
 				0,
 				pgd_ctor,
 				pgd_dtor);
Index: linux-2.6.11/include/asm-xen/asm-i386/bug.h
===================================================================
--- /dev/null	1970-01-01 00:00:00.000000000 +0000
+++ linux-2.6.11/include/asm-xen/asm-i386/bug.h	2005-06-24 16:49:12.000000000 +0200
@@ -0,0 +1,16 @@
+#ifndef _I386_BUG_H
+#define _I386_BUG_H
+
+#include <linux/config.h>
+
+#define BUG() do { \
+	printk("kernel BUG at %s:%d (%s)!\n", \
+	       __FILE__, __LINE__, __FUNCTION__); \
+	dump_stack(); \
+	panic("BUG!"); \
+} while (0)
+#define HAVE_ARCH_BUG
+
+#include <asm-generic/bug.h>
+
+#endif
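
Note (not part of the patch): the standalone userspace sketch below mirrors the bit-splitting done by pfn_pte_ma() and the mfn-to-pfn sanity check done by pte_pfn() in the pgtable-3level.h hunk above. The tiny p2m table, MAX_MAPNR and PROT_BITS are made-up stand-ins for Xen's phys_to_machine/machine_to_phys tables and the real _PAGE_* flags, and the __supported_pte_mask filtering is omitted; only the low/high split and the "is this mfn still ours?" check are taken from the patch.

/* illustrative sketch, compiles as a plain C program */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define MAX_MAPNR  8              /* pretend the domain owns 8 pages */
#define PROT_BITS  0x063ULL       /* example prot bits: present|rw|accessed|dirty */

typedef struct { uint32_t pte_low, pte_high; } pte_t;

/* toy pfn -> mfn table standing in for Xen's phys_to_machine mapping */
static unsigned long p2m[MAX_MAPNR] = { 7, 3, 11, 2, 9, 5, 13, 6 };

static unsigned long pfn_to_mfn(unsigned long pfn) { return p2m[pfn]; }

static unsigned long mfn_to_pfn(unsigned long mfn)
{
	for (unsigned long pfn = 0; pfn < MAX_MAPNR; pfn++)
		if (p2m[pfn] == mfn)
			return pfn;
	return MAX_MAPNR;         /* mfn does not belong to this domain */
}

/* build a PAE pte from a machine frame number, as pfn_pte_ma() does */
static pte_t pfn_pte_ma(unsigned long mfn, uint64_t prot)
{
	pte_t pte;
	pte.pte_high = (uint32_t)((mfn >> (32 - PAGE_SHIFT)) | (prot >> 32));
	pte.pte_low  = (uint32_t)((mfn << PAGE_SHIFT) | prot);
	return pte;
}

/* recover the pfn, forcing an out-of-range value for foreign mfns */
static unsigned long pte_pfn(pte_t pte)
{
	/* like the patch's pte_mfn(): only the low word is used (FIXME) */
	unsigned long mfn = pte.pte_low >> PAGE_SHIFT;
	unsigned long pfn = mfn_to_pfn(mfn);

	if (pfn >= MAX_MAPNR || pfn_to_mfn(pfn) != mfn)
		pfn = MAX_MAPNR;  /* special: force !pfn_valid() */
	return pfn;
}

int main(void)
{
	pte_t pte = pfn_pte_ma(pfn_to_mfn(4), PROT_BITS);

	printf("pte_high=%08x pte_low=%08x -> pfn %lu\n",
	       pte.pte_high, pte.pte_low, pte_pfn(pte));
	return 0;
}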