untrusted comment: signature from openbsd 6.1 base secret key
RWQEQa33SgQSEk3iUQjZBwKN4yXDFvjYuePYUJ88IGAlHIYDhp9eYqQik/wsL4UjnTymi1j3Orp4BPFWWEU3y7O/IjBA1IGUZgg=

OpenBSD 6.1 errata 002, May 2, 2017:

vmm(4) mismanaged floating point contexts.

Apply by doing:
    signify -Vep /etc/signify/openbsd-61-base.pub -x 002_vmmfpu.patch.sig \
        -m - | (cd /usr/src && patch -p0)

And then rebuild and install a new kernel:
    KK=`sysctl -n kern.osversion | cut -d# -f1`
    cd /usr/src/sys/arch/`machine`/compile/$KK
    make obj
    make config
    make
    make install

Index: sys/arch/amd64/amd64/fpu.c
===================================================================
--- sys/arch/amd64/amd64/fpu.c	21 Apr 2016 22:08:27 -0000	1.33
+++ sys/arch/amd64/amd64/fpu.c	27 Apr 2017 06:16:39 -0000	1.34
@@ -74,41 +74,10 @@
  * state is saved.
  */
 
-#define fninit()	__asm("fninit")
-#define fwait()		__asm("fwait")
-#define fnclex()	__asm("fnclex")
-#define fxsave(addr)	__asm("fxsave %0" : "=m" (*addr))
-#define fxrstor(addr)	__asm("fxrstor %0" : : "m" (*addr))
-#define ldmxcsr(addr)	__asm("ldmxcsr %0" : : "m" (*addr))
-#define fldcw(addr)	__asm("fldcw %0" : : "m" (*addr))
-#define clts()		__asm("clts")
-#define stts()		lcr0(rcr0() | CR0_TS)
-
 /*
  * The mask of enabled XSAVE features.
  */
 uint64_t	xsave_mask;
-
-static inline void
-xsave(struct savefpu *addr, uint64_t mask)
-{
-	uint32_t lo, hi;
-
-	lo = mask;
-	hi = mask >> 32;
-	__asm volatile("xsave %0" : "=m" (*addr) : "a" (lo), "d" (hi) :
-	    "memory");
-}
-
-static inline void
-xrstor(struct savefpu *addr, uint64_t mask)
-{
-	uint32_t lo, hi;
-
-	lo = mask;
-	hi = mask >> 32;
-	__asm volatile("xrstor %0" : : "m" (*addr), "a" (lo), "d" (hi));
-}
 
 void fpudna(struct cpu_info *);
 static int x86fpflags_to_siginfo(u_int32_t);
Index: sys/arch/amd64/amd64/vmm.c
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm.c,v
--- sys/arch/amd64/amd64/vmm.c	26 Apr 2017 09:53:28 -0000	1.133
+++ sys/arch/amd64/amd64/vmm.c	27 Apr 2017 06:16:39 -0000	1.134
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -31,6 +32,7 @@
 
 #include
 
+#include
 #include
 #include
 #include
@@ -145,6 +147,7 @@ int vcpu_vmx_check_cap(struct vcpu *, ui
 int vcpu_vmx_compute_ctrl(uint64_t, uint16_t, uint32_t, uint32_t, uint32_t *);
 int vmx_get_exit_info(uint64_t *, uint64_t *);
 int vmx_handle_exit(struct vcpu *);
+int vmx_handle_xsetbv(struct vcpu *);
 int vmm_handle_cpuid(struct vcpu *);
 int vmx_handle_rdmsr(struct vcpu *);
 int vmx_handle_wrmsr(struct vcpu *);
@@ -360,7 +363,7 @@ vmm_attach(struct device *parent, struct
 	pool_init(&vm_pool, sizeof(struct vm), 0, IPL_NONE, PR_WAITOK,
 	    "vmpool", NULL);
-	pool_init(&vcpu_pool, sizeof(struct vcpu), 0, IPL_NONE, PR_WAITOK,
+	pool_init(&vcpu_pool, sizeof(struct vcpu), 64, IPL_NONE, PR_WAITOK,
 	    "vcpupl", NULL);
 
 	vmm_softc = sc;
@@ -2373,6 +2376,9 @@ vcpu_reset_regs_vmx(struct vcpu *vcpu, s
 	/* XXX CR0 shadow */
 	/* XXX CR4 shadow */
 
+	/* xcr0 power on default sets bit 0 (x87 state) */
+	vcpu->vc_gueststate.vg_xcr0 = XCR0_X87;
+
 	/* Flush the VMCS */
 	if (vmclear(&vcpu->vc_control_pa)) {
 		ret = EINVAL;
@@ -2498,7 +2504,7 @@ vcpu_init_vmx(struct vcpu *vcpu)
 	}
 
 	/* Host CR0 */
-	cr0 = rcr0();
+	cr0 = rcr0() & ~CR0_TS;
 	if (vmwrite(VMCS_HOST_IA32_CR0, cr0)) {
 		ret = EINVAL;
 		goto exit;
 	}
@@ -3354,6 +3360,42 @@ vcpu_must_stop(struct vcpu *vcpu)
 }
 
 /*
+ * vmm_fpusave
+ *
+ * Modified version of fpusave_cpu from fpu.c that only saves the FPU context
+ * and does not call splipi/splx. Must be called with interrupts disabled.
+ */
+void
+vmm_fpusave(void)
+{
+	struct proc *p;
+	struct cpu_info *ci = curcpu();
+
+	p = ci->ci_fpcurproc;
+	if (p == NULL)
+		return;
+
+	if (ci->ci_fpsaving != 0)
+		panic("%s: recursive save!", __func__);
+	/*
+	 * Set ci->ci_fpsaving, so that any pending exception will be
+	 * thrown away. (It will be caught again if/when the FPU
+	 * state is restored.)
+	 */
+	ci->ci_fpsaving = 1;
+	if (xsave_mask)
+		xsave(&p->p_addr->u_pcb.pcb_savefpu, xsave_mask);
+	else
+		fxsave(&p->p_addr->u_pcb.pcb_savefpu);
+	ci->ci_fpsaving = 0;
+
+	p->p_addr->u_pcb.pcb_cr0 |= CR0_TS;
+
+	p->p_addr->u_pcb.pcb_fpcpu = NULL;
+	ci->ci_fpcurproc = NULL;
+}
+
+/*
  * vcpu_run_vmx
  *
  * VMX main loop used to run a VCPU.
@@ -3404,6 +3446,8 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
 			break;
 		case VMX_EXIT_CPUID:
 			break;
+		case VMX_EXIT_XSETBV:
+			break;
 #ifdef VMM_DEBUG
 		case VMX_EXIT_TRIPLE_FAULT:
 			DPRINTF("%s: vm %d vcpu %d triple fault\n",
@@ -3528,10 +3572,76 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
 
 		/* Start / resume the VCPU */
 		KERNEL_ASSERT_LOCKED();
+
+		/* Disable interrupts and save the current FPU state. */
+		disable_intr();
+		clts();
+		vmm_fpusave();
+
+		/* Initialize the guest FPU if not inited already */
+		if (!vcpu->vc_fpuinited) {
+			fninit();
+			bzero(&vcpu->vc_g_fpu.fp_fxsave,
+			    sizeof(vcpu->vc_g_fpu.fp_fxsave));
+			vcpu->vc_g_fpu.fp_fxsave.fx_fcw =
+			    __INITIAL_NPXCW__;
+			vcpu->vc_g_fpu.fp_fxsave.fx_mxcsr =
+			    __INITIAL_MXCSR__;
+			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
+
+			vcpu->vc_fpuinited = 1;
+		}
+
+		if (xsave_mask) {
+			/* Restore guest XCR0 and FPU context */
+			if (vcpu->vc_gueststate.vg_xcr0 & ~xsave_mask) {
+				DPRINTF("%s: guest attempted to set invalid "
+				    "bits in xcr0\n", __func__);
+				ret = EINVAL;
+				stts();
+				enable_intr();
+				break;
+			}
+
+			/* Restore guest %xcr0 */
+			xrstor(&vcpu->vc_g_fpu, xsave_mask);
+			xsetbv(0, vcpu->vc_gueststate.vg_xcr0);
+		} else
+			fxrstor(&vcpu->vc_g_fpu.fp_fxsave);
+
 		KERNEL_UNLOCK();
 		ret = vmx_enter_guest(&vcpu->vc_control_pa,
 		    &vcpu->vc_gueststate, resume);
 
+		/*
+		 * On exit, interrupts are disabled, and we are running with
+		 * the guest FPU state still possibly on the CPU. Save the FPU
+		 * state before re-enabling interrupts.
+		 */
+		if (xsave_mask) {
+			/* Save guest %xcr0 */
+			vcpu->vc_gueststate.vg_xcr0 = xgetbv(0);
+
+			/* Restore host %xcr0 */
+			xsetbv(0, xsave_mask);
+
+			/*
+			 * Save full copy of FPU state - guest content is
+			 * always a subset of host's save area (see xsetbv
+			 * exit handler)
+			 */
+			xsave(&vcpu->vc_g_fpu, xsave_mask);
+		} else
+			fxsave(&vcpu->vc_g_fpu);
+
+		/*
+		 * FPU state is invalid, set CR0_TS to force DNA trap on next
+		 * access.
+		 */
+		stts();
+
+		enable_intr();
+
 		exit_reason = VM_EXIT_NONE;
 		if (ret == 0) {
 			/*
@@ -3545,6 +3655,7 @@ vcpu_run_vmx(struct vcpu *vcpu, struct v
 				printf("%s: can't read guest rflags during "
 				    "exit\n", __func__);
 				ret = EINVAL;
+				KERNEL_LOCK();
 				break;
 			}
 		}
@@ -3826,6 +3937,10 @@ vmx_handle_exit(struct vcpu *vcpu)
 		ret = vmx_handle_wrmsr(vcpu);
 		update_rip = 1;
 		break;
+	case VMX_EXIT_XSETBV:
+		ret = vmx_handle_xsetbv(vcpu);
+		update_rip = 1;
+		break;
 	case VMX_EXIT_TRIPLE_FAULT:
 #ifdef VMM_DEBUG
 		DPRINTF("%s: vm %d vcpu %d triple fault\n", __func__,
@@ -4351,6 +4466,62 @@ vmx_handle_rdmsr(struct vcpu *vcpu)
 }
 
 /*
+ * vmx_handle_xsetbv
+ *
+ * Handler for xsetbv instructions. We allow the guest VM to set xcr0 values
+ * limited to the xsave_mask in use in the host.
+ *
+ * Parameters:
+ *  vcpu: vcpu structure containing instruction info causing the exit
+ *
+ * Return value:
+ *  0: The operation was successful
+ *  EINVAL: An error occurred
+ */
+int
+vmx_handle_xsetbv(struct vcpu *vcpu)
+{
+	uint64_t insn_length;
+	uint64_t *rax, *rdx, *rcx;;
+
+	if (vmread(VMCS_INSTRUCTION_LENGTH, &insn_length)) {
+		printf("%s: can't obtain instruction length\n", __func__);
+		return (EINVAL);
+	}
+
+	/* All XSETBV instructions are 0x0F 0x01 0xD1 */
+	KASSERT(insn_length == 3);
+
+	rax = &vcpu->vc_gueststate.vg_rax;
+	rcx = &vcpu->vc_gueststate.vg_rcx;
+	rdx = &vcpu->vc_gueststate.vg_rdx;
+
+	if (*rcx != 0) {
+		DPRINTF("%s: guest specified invalid xcr register number "
+		    "%lld\n", __func__, *rcx);
+		/* XXX this should #GP(0) instead of killing the guest */
+		return (EINVAL);
+	}
+
+	/*
+	 * No bits in %edx are currently supported. Check this, and validate
+	 * against the host mask.
+	 */
+	if (*rdx != 0 || (*rax & ~xsave_mask)) {
+		DPRINTF("%s: guest specified invalid xcr0 content "
+		    "(0x%llx:0x%llx)\n", __func__, *rdx, *rax);
+		/* XXX this should #GP(0) instead of killing the guest */
+		return (EINVAL);
+	}
+
+	vcpu->vc_gueststate.vg_xcr0 = *rax;
+
+	vcpu->vc_gueststate.vg_rip += insn_length;
+
+	return (0);
+}
+
+/*
  * vmx_handle_wrmsr
  *
  * Handler for wrmsr instructions. This handler logs the access, and discards
@@ -4413,6 +4584,7 @@ vmm_handle_cpuid(struct vcpu *vcpu)
 {
 	uint64_t insn_length;
 	uint64_t *rax, *rbx, *rcx, *rdx;
+	uint32_t eax, ebx, ecx, edx;
 
 	if (vmm_softc->mode == VMM_MODE_VMX ||
 	    vmm_softc->mode == VMM_MODE_EPT) {
@@ -4431,9 +4603,11 @@ vmm_handle_cpuid(struct vcpu *vcpu)
 	rcx = &vcpu->vc_gueststate.vg_rcx;
 	rdx = &vcpu->vc_gueststate.vg_rdx;
 
+	CPUID_LEAF(rax, 0, eax, ebx, ecx, edx);
+
 	switch (*rax) {
 	case 0x00:	/* Max level and vendor ID */
-		*rax = 0x07; /* cpuid_level */
+		*rax = 0x0d; /* cpuid_level */
 		*rbx = *((uint32_t *)&cpu_vendor);
 		*rdx = *((uint32_t *)&cpu_vendor + 1);
 		*rcx = *((uint32_t *)&cpu_vendor + 2);
@@ -4580,13 +4754,19 @@ vmm_handle_cpuid(struct vcpu *vcpu)
 		*rcx = 0;
 		*rdx = 0;
 		break;
-	case 0x0d:	/* Processor ext. state information (not supported) */
-		DPRINTF("%s: function 0x0d (ext. state info) not supported\n",
-		    __func__);
-		*rax = 0;
-		*rbx = 0;
-		*rcx = 0;
-		*rdx = 0;
+	case 0x0d:	/* Processor ext. state information */
+		if (*rcx == 0) {
+			*rax = xsave_mask;
+			*rbx = ebx;
+			*rcx = ecx;
+			*rdx = edx;
+		} else {
+			CPUID_LEAF(rax, *rcx, eax, ebx, ecx, edx);
+			*rax = eax;
+			*rbx = ebx;
+			*rcx = ecx;
+			*rdx = edx;
+		}
 		break;
 	case 0x0f:	/* QoS info (not supported) */
 		DPRINTF("%s: function 0x0f (QoS info) not supported\n",
Index: sys/arch/amd64/amd64/vmm_support.S
===================================================================
RCS file: /cvs/src/sys/arch/amd64/amd64/vmm_support.S,v
--- sys/arch/amd64/amd64/vmm_support.S	25 Mar 2017 15:25:20 -0000	1.7
+++ sys/arch/amd64/amd64/vmm_support.S	27 Apr 2017 06:16:39 -0000	1.8
@@ -17,6 +17,7 @@
 
 #include "assym.h"
 #include
+#include
 #include
 
 /*
@@ -154,6 +155,9 @@ skip_init:
 	 */
 	pushfq
+	popq	%rax
+	andq	$(~PSL_I), %rax
+	pushq	%rax
 
 	/*
 	 * Save (possibly) lazy-switched selectors
@@ -354,7 +358,6 @@ restore_host:
 	 * first. This is to accommodate possibly lazy-switched
 	 * selectors from above
 	 */
-	cli
 	popq	%rdx
 	popq	%rax
 	movq	$MSR_KERNELGSBASE, %rcx
 	wrmsr
@@ -371,7 +374,6 @@ restore_host:
 	popq	%rax
 	movq	$MSR_FSBASE, %rcx
 	wrmsr
-	sti
 
 	popw	%ax
 	movw	%ax, %ss
Index: sys/arch/amd64/include/cpufunc.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/cpufunc.h,v
--- sys/arch/amd64/include/cpufunc.h	4 Sep 2016 09:22:28 -0000	1.13
+++ sys/arch/amd64/include/cpufunc.h	27 Apr 2017 06:16:39 -0000	1.14
@@ -333,6 +333,16 @@ xsetbv(uint32_t reg, uint64_t mask)
 	lo = mask;
 	hi = mask >> 32;
 	__asm volatile("xsetbv" :: "c" (reg), "a" (lo), "d" (hi) : "memory");
+}
+
+static __inline uint64_t
+xgetbv(uint32_t reg)
+{
+	uint32_t lo, hi;
+
+	__asm volatile("xgetbv" : "=a" (lo), "=d" (hi) : "c" (reg));
+
+	return (((uint64_t)hi << 32) | (uint64_t)lo);
 }
 
 /* Break into DDB/KGDB. */
Index: sys/arch/amd64/include/fpu.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/fpu.h,v
--- sys/arch/amd64/include/fpu.h	25 Mar 2015 21:05:18 -0000	1.11
+++ sys/arch/amd64/include/fpu.h	27 Apr 2017 06:16:39 -0000	1.12
@@ -70,6 +70,37 @@ void fpusave_proc(struct proc *, int);
 void fpusave_cpu(struct cpu_info *, int);
 void fpu_kernel_enter(void);
 void fpu_kernel_exit(void);
+
+#define fninit()	__asm("fninit")
+#define fwait()		__asm("fwait")
+#define fnclex()	__asm("fnclex")
+#define fxsave(addr)	__asm("fxsave %0" : "=m" (*addr))
+#define fxrstor(addr)	__asm("fxrstor %0" : : "m" (*addr))
+#define ldmxcsr(addr)	__asm("ldmxcsr %0" : : "m" (*addr))
+#define fldcw(addr)	__asm("fldcw %0" : : "m" (*addr))
+#define clts()		__asm("clts")
+#define stts()		lcr0(rcr0() | CR0_TS)
+
+static inline void
+xsave(struct savefpu *addr, uint64_t mask)
+{
+	uint32_t lo, hi;
+
+	lo = mask;
+	hi = mask >> 32;
+	__asm volatile("xsave %0" : "=m" (*addr) : "a" (lo), "d" (hi) :
+	    "memory");
+}
+
+static inline void
+xrstor(struct savefpu *addr, uint64_t mask)
+{
+	uint32_t lo, hi;
+
+	lo = mask;
+	hi = mask >> 32;
+	__asm volatile("xrstor %0" : : "m" (*addr), "a" (lo), "d" (hi));
+}
 
 #endif
Index: sys/arch/amd64/include/vmmvar.h
===================================================================
RCS file: /cvs/src/sys/arch/amd64/include/vmmvar.h,v
--- sys/arch/amd64/include/vmmvar.h	23 Mar 2017 08:05:58 -0000	1.32
+++ sys/arch/amd64/include/vmmvar.h	27 Apr 2017 06:16:39 -0000	1.33
@@ -638,6 +638,7 @@ struct vmx_gueststate
 	uint64_t	vg_rip;			/* 0x80 */
 	uint32_t	vg_exit_reason;		/* 0x88 */
 	uint64_t	vg_rflags;		/* 0x90 */
+	uint64_t	vg_xcr0;		/* 0x98 */
 };
 
 /*
@@ -649,6 +650,12 @@ struct vm;
  * Virtual CPU
  */
 struct vcpu {
+	/*
+	 * Guest FPU state - this must remain as the first member of the struct
+	 * to ensure 64-byte alignment (set up during vcpu_pool init)
+	 */
+	struct savefpu vc_g_fpu;
+
	/* VMCS / VMCB pointer */
 	vaddr_t vc_control_va;
 	uint64_t vc_control_pa;
@@ -673,6 +680,10 @@ struct vcpu {
 	uint16_t vc_intr;
 	uint8_t vc_irqready;
+
+	uint8_t vc_fpuinited;
+
+	uint64_t vc_h_xcr0;
 
 	/* VMX only */
 	uint64_t vc_vmx_basic;
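
The following is an editorial sketch, not part of the signed patch above: it restates,
as a small standalone C program, the acceptance rule the new vmx_handle_xsetbv handler
applies when a guest executes XSETBV (only %xcr0 may be written, no feature bits above
bit 31 are accepted, and the requested value must be a subset of the host's xsave_mask).
The helper name check_xsetbv() and the example mask used in main() are invented for
illustration and do not appear in the patch.

#include <stdint.h>
#include <stdio.h>

/*
 * Return 0 if an XSETBV with guest %rcx (XCR number) and %rdx:%rax (value)
 * would be accepted, or -1 where the in-kernel handler would fail the
 * exit with EINVAL.
 */
static int
check_xsetbv(uint64_t rcx, uint64_t rdx, uint64_t rax, uint64_t host_xsave_mask)
{
	if (rcx != 0)
		return (-1);	/* only %xcr0 exists */
	if (rdx != 0)
		return (-1);	/* no feature bits above bit 31 are supported */
	if (rax & ~host_xsave_mask)
		return (-1);	/* guest may not enable bits the host did not */
	return (0);
}

int
main(void)
{
	uint64_t host_mask = 0x7;	/* example: x87 | SSE | AVX enabled on the host */

	/* x87 + SSE is a subset of the host mask: accepted */
	printf("xcr0=0x3  -> %d\n", check_xsetbv(0, 0, 0x3, host_mask));

	/* asks for MPX state the host did not enable: refused */
	printf("xcr0=0x1f -> %d\n", check_xsetbv(0, 0, 0x1f, host_mask));

	return (0);
}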