[PATCH 0/3] hw/arm/virt: Introduce kvm-steal-time

Previous Topic Next Topic
 
classic Classic list List threaded Threaded
15 messages Options
Reply | Threaded
Open this post in threaded view
|

[PATCH 0/3] hw/arm/virt: Introduce kvm-steal-time

Andrew Jones-15
KVM supports the ability to publish the amount of time that VCPUs
were runnable, but not running due to other host threads running
instead, to the guest. The guest scheduler may use that information
when making decisions and the guest may expose it to its userspace
(Linux publishes this information in /proc/stat). This feature is
called "steal time" as it represents the amount of time stolen from
a guest by scheduling out its VCPUs. To enable this feature KVM
userspace must provide a memory region that will be used to publish
the information to the guest. The memory region is typical migratable
region. The GPA of the region is given to KVM through a VCPU device
ioctl interface. This feature is only available for 64-bit hosts
running 64-bit guests.

This series provides the QEMU support of this feature. It will
be enabled by default for 5.1 machine types and later, but may
be disabled with a new CPU property "kvm-steal-time".

While testing migration it was observed that the amount of
steal time as viewed by the guest was getting reset on each
migration. Patch 3/5 of a pvtime fix series posted[*] for KVM
should fix that.

Migration testing:
* virt-5.0 can migrate as usual, no kvm-steal-time enabled
* virt-5.1 can migrate between hosts with steal-time enabled and
  disabled when both hosts support steal-time
* virt-5.1 with steal-time disabled can migrate to a host that
  does not support steal-time
* virt-5.1 with steal-time enabled will cleanly fail when migrating
  to a host that does not support steal-time

[*] https://lists.cs.columbia.edu/pipermail/kvmarm/2020-July/041538.html

Thanks,
drew


Andrew Jones (3):
  hw/arm/virt: Move post cpu realize check into its own function
  hw/arm/virt: Move kvm pmu setup to virt_cpu_post_init
  hw/arm/virt: Implement kvm-steal-time

 docs/system/arm/cpu-features.rst |  11 ++++
 hw/arm/virt.c                    | 103 +++++++++++++++++++++----------
 include/hw/arm/virt.h            |   2 +
 target/arm/cpu.c                 |  10 +++
 target/arm/cpu.h                 |   4 ++
 target/arm/kvm.c                 |  20 ++++++
 target/arm/kvm32.c               |   5 ++
 target/arm/kvm64.c               |  70 +++++++++++++++++++--
 target/arm/kvm_arm.h             |  24 ++++++-
 target/arm/monitor.c             |   2 +-
 tests/qtest/arm-cpu-features.c   |  25 ++++++--
 11 files changed, 232 insertions(+), 44 deletions(-)

--
2.25.4


Reply | Threaded
Open this post in threaded view
|

[PATCH 1/3] hw/arm/virt: Move post cpu realize check into its own function

Andrew Jones-15
We'll add more to this new function in coming patches so we also
state the gic must be created and call it below create_gic().

No functional change intended.

Signed-off-by: Andrew Jones <[hidden email]>
---
 hw/arm/virt.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 9005dae356be..cb2fa99b1ef5 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -1672,6 +1672,31 @@ static void finalize_gic_version(VirtMachineState *vms)
     }
 }
 
+/*
+ * virt_cpu_post_init() must be called after the CPUs have
+ * been realized and the GIC has been created.
+ */
+static void virt_cpu_post_init(VirtMachineState *vms)
+{
+    bool aarch64;
+
+    aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
+
+    if (!kvm_enabled()) {
+        if (aarch64 && vms->highmem) {
+            int requested_pa_size = 64 - clz64(vms->highest_gpa);
+            int pamax = arm_pamax(ARM_CPU(first_cpu));
+
+            if (pamax < requested_pa_size) {
+                error_report("VCPU supports less PA bits (%d) than "
+                             "requested by the memory map (%d)",
+                             pamax, requested_pa_size);
+                exit(1);
+            }
+        }
+    }
+}
+
 static void machvirt_init(MachineState *machine)
 {
     VirtMachineState *vms = VIRT_MACHINE(machine);
@@ -1873,22 +1898,6 @@ static void machvirt_init(MachineState *machine)
     fdt_add_timer_nodes(vms);
     fdt_add_cpu_nodes(vms);
 
-   if (!kvm_enabled()) {
-        ARMCPU *cpu = ARM_CPU(first_cpu);
-        bool aarch64 = object_property_get_bool(OBJECT(cpu), "aarch64", NULL);
-
-        if (aarch64 && vms->highmem) {
-            int requested_pa_size, pamax = arm_pamax(cpu);
-
-            requested_pa_size = 64 - clz64(vms->highest_gpa);
-            if (pamax < requested_pa_size) {
-                error_report("VCPU supports less PA bits (%d) than requested "
-                            "by the memory map (%d)", pamax, requested_pa_size);
-                exit(1);
-            }
-        }
-    }
-
     memory_region_add_subregion(sysmem, vms->memmap[VIRT_MEM].base,
                                 machine->ram);
     if (machine->device_memory) {
@@ -1900,6 +1909,8 @@ static void machvirt_init(MachineState *machine)
 
     create_gic(vms);
 
+    virt_cpu_post_init(vms);
+
     fdt_add_pmu_nodes(vms);
 
     create_uart(vms, VIRT_UART, sysmem, serial_hd(0));
--
2.25.4


Reply | Threaded
Open this post in threaded view
|

[PATCH 2/3] hw/arm/virt: Move kvm pmu setup to virt_cpu_post_init

Andrew Jones-15
In reply to this post by Andrew Jones-15
Move the KVM PMU setup part of fdt_add_pmu_nodes() to
virt_cpu_post_init(), which is a more appropriate location. Now
fdt_add_pmu_nodes() is also named more appropriately, because it
no longer does anything but fdt node creation.

No functional change intended.

Signed-off-by: Andrew Jones <[hidden email]>
---
 hw/arm/virt.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index cb2fa99b1ef5..63ef530933e5 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -521,30 +521,15 @@ static void fdt_add_gic_node(VirtMachineState *vms)
 
 static void fdt_add_pmu_nodes(const VirtMachineState *vms)
 {
-    CPUState *cpu;
-    ARMCPU *armcpu;
+    ARMCPU *armcpu = ARM_CPU(first_cpu);
     uint32_t irqflags = GIC_FDT_IRQ_FLAGS_LEVEL_HI;
 
-    CPU_FOREACH(cpu) {
-        armcpu = ARM_CPU(cpu);
-        if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
-            return;
-        }
-        if (kvm_enabled()) {
-            if (kvm_irqchip_in_kernel()) {
-                kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
-            }
-            kvm_arm_pmu_init(cpu);
-        }
-    }
-
     if (vms->gic_version == VIRT_GIC_VERSION_2) {
         irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START,
                              GIC_FDT_IRQ_PPI_CPU_WIDTH,
                              (1 << vms->smp_cpus) - 1);
     }
 
-    armcpu = ARM_CPU(qemu_get_cpu(0));
     qemu_fdt_add_subnode(vms->fdt, "/pmu");
     if (arm_feature(&armcpu->env, ARM_FEATURE_V8)) {
         const char compat[] = "arm,armv8-pmuv3";
@@ -1678,11 +1663,23 @@ static void finalize_gic_version(VirtMachineState *vms)
  */
 static void virt_cpu_post_init(VirtMachineState *vms)
 {
-    bool aarch64;
+    bool aarch64, pmu;
+    CPUState *cpu;
 
     aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
+    pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
 
-    if (!kvm_enabled()) {
+    if (kvm_enabled()) {
+        CPU_FOREACH(cpu) {
+            if (pmu) {
+                assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));
+                if (kvm_irqchip_in_kernel()) {
+                    kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
+                }
+                kvm_arm_pmu_init(cpu);
+            }
+        }
+    } else {
         if (aarch64 && vms->highmem) {
             int requested_pa_size = 64 - clz64(vms->highest_gpa);
             int pamax = arm_pamax(ARM_CPU(first_cpu));
--
2.25.4


Reply | Threaded
Open this post in threaded view
|

[PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Andrew Jones-15
In reply to this post by Andrew Jones-15
We add the kvm-steal-time CPU property and implement it for machvirt.
A tiny bit of refactoring was also done to allow pmu and pvtime to
use the same vcpu device helper functions.

Signed-off-by: Andrew Jones <[hidden email]>
---
 docs/system/arm/cpu-features.rst | 11 +++++
 hw/arm/virt.c                    | 33 ++++++++++++++-
 include/hw/arm/virt.h            |  2 +
 target/arm/cpu.c                 | 10 +++++
 target/arm/cpu.h                 |  4 ++
 target/arm/kvm.c                 | 20 +++++++++
 target/arm/kvm32.c               |  5 +++
 target/arm/kvm64.c               | 70 +++++++++++++++++++++++++++++---
 target/arm/kvm_arm.h             | 24 ++++++++++-
 target/arm/monitor.c             |  2 +-
 tests/qtest/arm-cpu-features.c   | 25 ++++++++++--
 11 files changed, 193 insertions(+), 13 deletions(-)

diff --git a/docs/system/arm/cpu-features.rst b/docs/system/arm/cpu-features.rst
index 2d5c06cd016b..4e590e6e9f54 100644
--- a/docs/system/arm/cpu-features.rst
+++ b/docs/system/arm/cpu-features.rst
@@ -200,6 +200,17 @@ the list of KVM VCPU features and their descriptions.
                            adjustment, also restoring the legacy (pre-5.0)
                            behavior.
 
+  kvm-steal-time           Since v5.1, kvm-steal-time is enabled by
+                           default when KVM is enabled, the feature is
+                           supported, and the guest is 64-bit.
+
+                           When kvm-steal-time is enabled a 64-bit guest
+                           can account for time its CPUs were not running
+                           due to the host not scheduling the corresponding
+                           VCPU threads.  The accounting statistics may
+                           influence the guest scheduler behavior and/or be
+                           exposed to the guest userspace.
+
 SVE CPU Properties
 ==================
 
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 63ef530933e5..21965a1665ca 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -151,6 +151,7 @@ static const MemMapEntry base_memmap[] = {
     [VIRT_PCDIMM_ACPI] =        { 0x09070000, MEMORY_HOTPLUG_IO_LEN },
     [VIRT_ACPI_GED] =           { 0x09080000, ACPI_GED_EVT_SEL_LEN },
     [VIRT_NVDIMM_ACPI] =        { 0x09090000, NVDIMM_ACPI_IO_LEN},
+    [VIRT_PVTIME] =             { 0x090a0000, 0x00010000 },
     [VIRT_MMIO] =               { 0x0a000000, 0x00000200 },
     /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
     [VIRT_PLATFORM_BUS] =       { 0x0c000000, 0x02000000 },
@@ -1663,13 +1664,26 @@ static void finalize_gic_version(VirtMachineState *vms)
  */
 static void virt_cpu_post_init(VirtMachineState *vms)
 {
-    bool aarch64, pmu;
+    bool aarch64, pmu, steal_time;
     CPUState *cpu;
 
     aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
     pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
+    steal_time = object_property_get_bool(OBJECT(first_cpu),
+                                          "kvm-steal-time", NULL);
 
     if (kvm_enabled()) {
+        hwaddr pvtime_base = vms->memmap[VIRT_PVTIME].base;
+        hwaddr pvtime_size = vms->memmap[VIRT_PVTIME].size;
+
+        if (steal_time) {
+            MemoryRegion *pvtime = g_new(MemoryRegion, 1);
+
+            memory_region_init_ram(pvtime, NULL, "pvtime", pvtime_size, NULL);
+            memory_region_add_subregion(get_system_memory(), pvtime_base,
+                                        pvtime);
+        }
+
         CPU_FOREACH(cpu) {
             if (pmu) {
                 assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));
@@ -1678,6 +1692,17 @@ static void virt_cpu_post_init(VirtMachineState *vms)
                 }
                 kvm_arm_pmu_init(cpu);
             }
+            if (steal_time) {
+                /*
+                 * We need 64 bytes for each CPU[*]. One 64k region gives
+                 * us up to 1024 CPUs, or some growing room for the pvtime
+                 * structure for less CPUs.
+                 *
+                 * [*] See Linux kernel arch/arm64/include/asm/pvclock-abi.h
+                 */
+                assert(pvtime_size >= MACHINE_GET_CLASS(vms)->max_cpus * 64);
+                kvm_arm_pvtime_init(cpu, pvtime_base + 64 * cpu->cpu_index);
+            }
         }
     } else {
         if (aarch64 && vms->highmem) {
@@ -1842,6 +1867,11 @@ static void machvirt_init(MachineState *machine)
             object_property_set_bool(cpuobj, "kvm-no-adjvtime", true, NULL);
         }
 
+        if (vmc->kvm_no_steal_time &&
+            object_property_find(cpuobj, "kvm-steal-time", NULL)) {
+            object_property_set_bool(cpuobj, false, "kvm-steal-time", NULL);
+        }
+
         if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) {
             object_property_set_bool(cpuobj, "pmu", false, NULL);
         }
@@ -2528,6 +2558,7 @@ static void virt_machine_5_0_options(MachineClass *mc)
     mc->numa_mem_supported = true;
     vmc->acpi_expose_flash = true;
     mc->auto_enable_numa_with_memdev = false;
+    vmc->kvm_no_steal_time = true;
 }
 DEFINE_VIRT_MACHINE(5, 0)
 
diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
index 54bcf17afd35..b5153afedcdf 100644
--- a/include/hw/arm/virt.h
+++ b/include/hw/arm/virt.h
@@ -80,6 +80,7 @@ enum {
     VIRT_PCDIMM_ACPI,
     VIRT_ACPI_GED,
     VIRT_NVDIMM_ACPI,
+    VIRT_PVTIME,
     VIRT_LOWMEMMAP_LAST,
 };
 
@@ -126,6 +127,7 @@ typedef struct {
     bool no_ged;   /* Machines < 4.2 has no support for ACPI GED device */
     bool kvm_no_adjvtime;
     bool acpi_expose_flash;
+    bool kvm_no_steal_time;
 } VirtMachineClass;
 
 typedef struct {
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index 5050e1843a85..f894ee3fdee8 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1308,6 +1308,16 @@ void arm_cpu_finalize_features(ARMCPU *cpu, Error **errp)
             return;
         }
     }
+
+    if (kvm_enabled()) {
+#ifdef TARGET_AARCH64
+        kvm_arm_steal_time_finalize(cpu, &local_err);
+        if (local_err != NULL) {
+            error_propagate(errp, local_err);
+            return;
+        }
+#endif
+    }
 }
 
 static void arm_cpu_realizefn(DeviceState *dev, Error **errp)
diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 9e8ed423ea1d..a4d4cb640c77 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -24,6 +24,7 @@
 #include "hw/registerfields.h"
 #include "cpu-qom.h"
 #include "exec/cpu-defs.h"
+#include "qapi/qapi-types-common.h"
 
 /* ARM processors have a weak memory model */
 #define TCG_GUEST_DEFAULT_MO      (0)
@@ -859,6 +860,9 @@ struct ARMCPU {
     bool kvm_vtime_dirty;
     uint64_t kvm_vtime;
 
+    /* KVM steal time */
+    OnOffAuto kvm_steal_time;
+
     /* Uniprocessor system with MP extensions */
     bool mp_is_up;
 
diff --git a/target/arm/kvm.c b/target/arm/kvm.c
index 8bb7318378b5..093a290453f6 100644
--- a/target/arm/kvm.c
+++ b/target/arm/kvm.c
@@ -192,6 +192,18 @@ static void kvm_no_adjvtime_set(Object *obj, bool value, Error **errp)
     ARM_CPU(obj)->kvm_adjvtime = !value;
 }
 
+#ifdef TARGET_AARCH64
+static bool kvm_steal_time_get(Object *obj, Error **errp)
+{
+    return ARM_CPU(obj)->kvm_steal_time != ON_OFF_AUTO_OFF;
+}
+
+static void kvm_steal_time_set(Object *obj, bool value, Error **errp)
+{
+    ARM_CPU(obj)->kvm_steal_time = value ? ON_OFF_AUTO_ON : ON_OFF_AUTO_OFF;
+}
+#endif
+
 /* KVM VCPU properties should be prefixed with "kvm-". */
 void kvm_arm_add_vcpu_properties(Object *obj)
 {
@@ -207,6 +219,14 @@ void kvm_arm_add_vcpu_properties(Object *obj)
                                         "the virtual counter. VM stopped time "
                                         "will be counted.");
     }
+
+#ifdef TARGET_AARCH64
+    cpu->kvm_steal_time = ON_OFF_AUTO_AUTO;
+    object_property_add_bool(obj, "kvm-steal-time", kvm_steal_time_get,
+                             kvm_steal_time_set);
+    object_property_set_description(obj, "kvm-steal-time",
+                                    "Set off to disable KVM steal time.");
+#endif
 }
 
 bool kvm_arm_pmu_supported(void)
diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
index 0af46b41c847..d3f3195077fa 100644
--- a/target/arm/kvm32.c
+++ b/target/arm/kvm32.c
@@ -560,6 +560,11 @@ void kvm_arm_pmu_init(CPUState *cs)
     qemu_log_mask(LOG_UNIMP, "%s: not implemented\n", __func__);
 }
 
+void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
+{
+    qemu_log_mask(LOG_UNIMP, "%s: not implemented\n", __func__);
+}
+
 #define ARM_REG_DFSR  ARM_CP15_REG32(0, 5, 0, 0)
 #define ARM_REG_TTBCR ARM_CP15_REG32(0, 2, 0, 2)
 /*
diff --git a/target/arm/kvm64.c b/target/arm/kvm64.c
index 116923790550..a5f4633f87c3 100644
--- a/target/arm/kvm64.c
+++ b/target/arm/kvm64.c
@@ -17,6 +17,7 @@
 #include <linux/kvm.h>
 
 #include "qemu-common.h"
+#include "qapi/error.h"
 #include "cpu.h"
 #include "qemu/timer.h"
 #include "qemu/error-report.h"
@@ -398,19 +399,20 @@ static CPUWatchpoint *find_hw_watchpoint(CPUState *cpu, target_ulong addr)
     return NULL;
 }
 
-static bool kvm_arm_pmu_set_attr(CPUState *cs, struct kvm_device_attr *attr)
+static bool kvm_arm_set_device_attr(CPUState *cs, struct kvm_device_attr *attr,
+                                    const char *name)
 {
     int err;
 
     err = kvm_vcpu_ioctl(cs, KVM_HAS_DEVICE_ATTR, attr);
     if (err != 0) {
-        error_report("PMU: KVM_HAS_DEVICE_ATTR: %s", strerror(-err));
+        error_report("%s: KVM_HAS_DEVICE_ATTR: %s", name, strerror(-err));
         return false;
     }
 
     err = kvm_vcpu_ioctl(cs, KVM_SET_DEVICE_ATTR, attr);
     if (err != 0) {
-        error_report("PMU: KVM_SET_DEVICE_ATTR: %s", strerror(-err));
+        error_report("%s: KVM_SET_DEVICE_ATTR: %s", name, strerror(-err));
         return false;
     }
 
@@ -427,7 +429,7 @@ void kvm_arm_pmu_init(CPUState *cs)
     if (!ARM_CPU(cs)->has_pmu) {
         return;
     }
-    if (!kvm_arm_pmu_set_attr(cs, &attr)) {
+    if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
         error_report("failed to init PMU");
         abort();
     }
@@ -444,12 +446,29 @@ void kvm_arm_pmu_set_irq(CPUState *cs, int irq)
     if (!ARM_CPU(cs)->has_pmu) {
         return;
     }
-    if (!kvm_arm_pmu_set_attr(cs, &attr)) {
+    if (!kvm_arm_set_device_attr(cs, &attr, "PMU")) {
         error_report("failed to set irq for PMU");
         abort();
     }
 }
 
+void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
+{
+    struct kvm_device_attr attr = {
+        .group = KVM_ARM_VCPU_PVTIME_CTRL,
+        .attr = KVM_ARM_VCPU_PVTIME_IPA,
+        .addr = (uint64_t)&ipa,
+    };
+
+    if (!ARM_CPU(cs)->kvm_steal_time) {
+        return;
+    }
+    if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
+        error_report("failed to init PVTIME IPA");
+        abort();
+    }
+}
+
 static int read_sys_reg32(int fd, uint32_t *pret, uint64_t id)
 {
     uint64_t ret;
@@ -652,6 +671,47 @@ bool kvm_arm_get_host_cpu_features(ARMHostCPUFeatures *ahcf)
     return true;
 }
 
+void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp)
+{
+    static bool has_steal_time;
+    static bool probed;
+    int fdarray[3];
+
+    if (!probed) {
+        probed = true;
+        if (kvm_check_extension(kvm_state, KVM_CAP_VCPU_ATTRIBUTES)) {
+            if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, NULL)) {
+                error_report("Failed to create scratch VCPU");
+                abort();
+            }
+
+            has_steal_time = kvm_device_check_attr(fdarray[2],
+                                                   KVM_ARM_VCPU_PVTIME_CTRL,
+                                                   KVM_ARM_VCPU_PVTIME_IPA);
+
+            kvm_arm_destroy_scratch_host_vcpu(fdarray);
+        }
+    }
+
+    if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
+        if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
+            cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
+        } else {
+            cpu->kvm_steal_time = ON_OFF_AUTO_ON;
+        }
+    } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
+        if (!has_steal_time) {
+            error_setg(errp, "'kvm-steal-time' cannot be enabled "
+                             "on this host");
+            return;
+        } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
+            error_setg(errp, "'kvm-steal-time' cannot be enabled "
+                             "for AArch32 guests");
+            return;
+        }
+    }
+}
+
 bool kvm_arm_aarch32_supported(void)
 {
     return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);
diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h
index adb38514bf20..21d776912b55 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -267,6 +267,16 @@ void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu);
  */
 void kvm_arm_add_vcpu_properties(Object *obj);
 
+/**
+ * kvm_arm_steal_time_finalize:
+ * @cpu: ARMCPU for which to finalize kvm-steal-time
+ * @errp: Pointer to Error* for error propagation
+ *
+ * Validate the kvm-steal-time property selection and set its default
+ * based on KVM support and guest configuration.
+ */
+void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp);
+
 /**
  * kvm_arm_aarch32_supported:
  *
@@ -340,6 +350,16 @@ int kvm_arm_vgic_probe(void);
 
 void kvm_arm_pmu_set_irq(CPUState *cs, int irq);
 void kvm_arm_pmu_init(CPUState *cs);
+
+/**
+ * kvm_arm_pvtime_init:
+ * @cs: CPUState
+ * @ipa: Per-vcpu guest physical base address of the pvtime structures
+ *
+ * Initializes PVTIME for the VCPU, setting the PVTIME IPA to @ipa.
+ */
+void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa);
+
 int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level);
 
 #else
@@ -355,6 +375,7 @@ static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu)
 }
 
 static inline void kvm_arm_add_vcpu_properties(Object *obj) {}
+static inline void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp) {}
 
 static inline bool kvm_arm_aarch32_supported(void)
 {
@@ -383,9 +404,8 @@ static inline int kvm_arm_vgic_probe(void)
 
 static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq) {}
 static inline void kvm_arm_pmu_init(CPUState *cs) {}
-
+static inline void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa) {}
 static inline void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map) {}
-
 static inline void kvm_arm_get_virtual_time(CPUState *cs) {}
 static inline void kvm_arm_put_virtual_time(CPUState *cs) {}
 #endif
diff --git a/target/arm/monitor.c b/target/arm/monitor.c
index ba6e01abd037..bd3590604a71 100644
--- a/target/arm/monitor.c
+++ b/target/arm/monitor.c
@@ -103,7 +103,7 @@ static const char *cpu_model_advertised_features[] = {
     "sve128", "sve256", "sve384", "sve512",
     "sve640", "sve768", "sve896", "sve1024", "sve1152", "sve1280",
     "sve1408", "sve1536", "sve1664", "sve1792", "sve1920", "sve2048",
-    "kvm-no-adjvtime",
+    "kvm-no-adjvtime", "kvm-steal-time",
     NULL
 };
 
diff --git a/tests/qtest/arm-cpu-features.c b/tests/qtest/arm-cpu-features.c
index f7e062c1891e..91b181f38268 100644
--- a/tests/qtest/arm-cpu-features.c
+++ b/tests/qtest/arm-cpu-features.c
@@ -452,6 +452,7 @@ static void test_query_cpu_model_expansion(const void *data)
     assert_set_feature(qts, "max", "pmu", true);
 
     assert_has_not_feature(qts, "max", "kvm-no-adjvtime");
+    assert_has_not_feature(qts, "max", "kvm-steal-time");
 
     if (g_str_equal(qtest_get_arch(), "aarch64")) {
         assert_has_feature_enabled(qts, "max", "aarch64");
@@ -493,6 +494,7 @@ static void test_query_cpu_model_expansion_kvm(const void *data)
     assert_set_feature(qts, "host", "kvm-no-adjvtime", false);
 
     if (g_str_equal(qtest_get_arch(), "aarch64")) {
+        bool kvm_supports_steal_time;
         bool kvm_supports_sve;
         char max_name[8], name[8];
         uint32_t max_vq, vq;
@@ -500,6 +502,10 @@ static void test_query_cpu_model_expansion_kvm(const void *data)
         QDict *resp;
         char *error;
 
+        assert_error(qts, "cortex-a15",
+            "We cannot guarantee the CPU type 'cortex-a15' works "
+            "with KVM on this host", NULL);
+
         assert_has_feature_enabled(qts, "host", "aarch64");
 
         /* Enabling and disabling pmu should always work. */
@@ -507,16 +513,26 @@ static void test_query_cpu_model_expansion_kvm(const void *data)
         assert_set_feature(qts, "host", "pmu", false);
         assert_set_feature(qts, "host", "pmu", true);
 
-        assert_error(qts, "cortex-a15",
-            "We cannot guarantee the CPU type 'cortex-a15' works "
-            "with KVM on this host", NULL);
-
+        /*
+         * Some features would be enabled by default, but they're disabled
+         * because this instance of KVM doesn't support them. Test that the
+         * features are present, and, when enabled, issue further tests.
+         */
+        assert_has_feature(qts, "host", "kvm-steal-time");
         assert_has_feature(qts, "host", "sve");
+
         resp = do_query_no_props(qts, "host");
+        kvm_supports_steal_time = resp_get_feature(resp, "kvm-steal-time");
         kvm_supports_sve = resp_get_feature(resp, "sve");
         vls = resp_get_sve_vls(resp);
         qobject_unref(resp);
 
+        if (kvm_supports_steal_time) {
+            /* If we have steal-time then we should be able to toggle it. */
+            assert_set_feature(qts, "host", "kvm-steal-time", false);
+            assert_set_feature(qts, "host", "kvm-steal-time", true);
+        }
+
         if (kvm_supports_sve) {
             g_assert(vls != 0);
             max_vq = 64 - __builtin_clzll(vls);
@@ -577,6 +593,7 @@ static void test_query_cpu_model_expansion_kvm(const void *data)
         assert_has_not_feature(qts, "host", "aarch64");
         assert_has_not_feature(qts, "host", "pmu");
         assert_has_not_feature(qts, "host", "sve");
+        assert_has_not_feature(qts, "host", "kvm-steal-time");
     }
 
     qtest_quit(qts);
--
2.25.4


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 0/3] hw/arm/virt: Introduce kvm-steal-time

Peter Maydell-5
In reply to this post by Andrew Jones-15
On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:

>
> KVM supports the ability to publish the amount of time that VCPUs
> were runnable, but not running due to other host threads running
> instead, to the guest. The guest scheduler may use that information
> when making decisions and the guest may expose it to its userspace
> (Linux publishes this information in /proc/stat). This feature is
> called "steal time" as it represents the amount of time stolen from
> a guest by scheduling out its VCPUs. To enable this feature KVM
> userspace must provide a memory region that will be used to publish
> the information to the guest. The memory region is typical migratable
> region. The GPA of the region is given to KVM through a VCPU device
> ioctl interface. This feature is only available for 64-bit hosts
> running 64-bit guests.
>
> This series provides the QEMU support of this feature. It will
> be enabled by default for 5.1 machine types and later, but may
> be disabled with a new CPU property "kvm-steal-time".

Just a note that this has missed 5.1, but I'll review it at
some point for 5.2.

thanks
-- PMM

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 2/3] hw/arm/virt: Move kvm pmu setup to virt_cpu_post_init

Peter Maydell-5
In reply to this post by Andrew Jones-15
On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:

>
> Move the KVM PMU setup part of fdt_add_pmu_nodes() to
> virt_cpu_post_init(), which is a more appropriate location. Now
> fdt_add_pmu_nodes() is also named more appropriately, because it
> no longer does anything but fdt node creation.
>
> No functional change intended.
>
> Signed-off-by: Andrew Jones <[hidden email]>
> ---
>  hw/arm/virt.c | 33 +++++++++++++++------------------
>  1 file changed, 15 insertions(+), 18 deletions(-)
>
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index cb2fa99b1ef5..63ef530933e5 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -521,30 +521,15 @@ static void fdt_add_gic_node(VirtMachineState *vms)
>
>  static void fdt_add_pmu_nodes(const VirtMachineState *vms)
>  {
> -    CPUState *cpu;
> -    ARMCPU *armcpu;
> +    ARMCPU *armcpu = ARM_CPU(first_cpu);
>      uint32_t irqflags = GIC_FDT_IRQ_FLAGS_LEVEL_HI;
>
> -    CPU_FOREACH(cpu) {
> -        armcpu = ARM_CPU(cpu);
> -        if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
> -            return;
> -        }

So previously we would say "if the CPU doesn't actually have
a PMU, don't put the PMU nodes in the FDT", but in the new logic
it looks like we put the PMU nodes in the FDT unconditionally ?

> -        if (kvm_enabled()) {
> -            if (kvm_irqchip_in_kernel()) {
> -                kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
> -            }
> -            kvm_arm_pmu_init(cpu);
> -        }
> -    }
> -
>      if (vms->gic_version == VIRT_GIC_VERSION_2) {
>          irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START,
>                               GIC_FDT_IRQ_PPI_CPU_WIDTH,
>                               (1 << vms->smp_cpus) - 1);
>      }
>
> -    armcpu = ARM_CPU(qemu_get_cpu(0));
>      qemu_fdt_add_subnode(vms->fdt, "/pmu");
>      if (arm_feature(&armcpu->env, ARM_FEATURE_V8)) {
>          const char compat[] = "arm,armv8-pmuv3";
> @@ -1678,11 +1663,23 @@ static void finalize_gic_version(VirtMachineState *vms)
>   */
>  static void virt_cpu_post_init(VirtMachineState *vms)
>  {
> -    bool aarch64;
> +    bool aarch64, pmu;
> +    CPUState *cpu;
>
>      aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
> +    pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
>
> -    if (!kvm_enabled()) {
> +    if (kvm_enabled()) {
> +        CPU_FOREACH(cpu) {
> +            if (pmu) {
> +                assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));
> +                if (kvm_irqchip_in_kernel()) {
> +                    kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
> +                }
> +                kvm_arm_pmu_init(cpu);
> +            }
> +        }
> +    } else {
>          if (aarch64 && vms->highmem) {
>              int requested_pa_size = 64 - clz64(vms->highest_gpa);
>              int pamax = arm_pamax(ARM_CPU(first_cpu));

thanks
-- PMM

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 1/3] hw/arm/virt: Move post cpu realize check into its own function

Peter Maydell-5
In reply to this post by Andrew Jones-15
On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:

>
> We'll add more to this new function in coming patches so we also
> state the gic must be created and call it below create_gic().
>
> No functional change intended.
>
> Signed-off-by: Andrew Jones <[hidden email]>
> ---
>  hw/arm/virt.c | 43 +++++++++++++++++++++++++++----------------
>  1 file changed, 27 insertions(+), 16 deletions(-)


Reviewed-by: Peter Maydell <[hidden email]>

thanks
-- PMM

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Peter Maydell-5
In reply to this post by Andrew Jones-15
On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:

>
> We add the kvm-steal-time CPU property and implement it for machvirt.
> A tiny bit of refactoring was also done to allow pmu and pvtime to
> use the same vcpu device helper functions.
>
> Signed-off-by: Andrew Jones <[hidden email]>
> ---
>  docs/system/arm/cpu-features.rst | 11 +++++
>  hw/arm/virt.c                    | 33 ++++++++++++++-
>  include/hw/arm/virt.h            |  2 +
>  target/arm/cpu.c                 | 10 +++++
>  target/arm/cpu.h                 |  4 ++
>  target/arm/kvm.c                 | 20 +++++++++
>  target/arm/kvm32.c               |  5 +++
>  target/arm/kvm64.c               | 70 +++++++++++++++++++++++++++++---
>  target/arm/kvm_arm.h             | 24 ++++++++++-
>  target/arm/monitor.c             |  2 +-
>  tests/qtest/arm-cpu-features.c   | 25 ++++++++++--
>  11 files changed, 193 insertions(+), 13 deletions(-)
>
> diff --git a/docs/system/arm/cpu-features.rst b/docs/system/arm/cpu-features.rst
> index 2d5c06cd016b..4e590e6e9f54 100644
> --- a/docs/system/arm/cpu-features.rst
> +++ b/docs/system/arm/cpu-features.rst
> @@ -200,6 +200,17 @@ the list of KVM VCPU features and their descriptions.
>                             adjustment, also restoring the legacy (pre-5.0)
>                             behavior.
>
> +  kvm-steal-time           Since v5.1, kvm-steal-time is enabled by
> +                           default when KVM is enabled, the feature is
> +                           supported, and the guest is 64-bit.
> +
> +                           When kvm-steal-time is enabled a 64-bit guest
> +                           can account for time its CPUs were not running
> +                           due to the host not scheduling the corresponding
> +                           VCPU threads.  The accounting statistics may
> +                           influence the guest scheduler behavior and/or be
> +                           exposed to the guest userspace.
> +
>  SVE CPU Properties
>  ==================
>
> diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> index 63ef530933e5..21965a1665ca 100644
> --- a/hw/arm/virt.c
> +++ b/hw/arm/virt.c
> @@ -151,6 +151,7 @@ static const MemMapEntry base_memmap[] = {
>      [VIRT_PCDIMM_ACPI] =        { 0x09070000, MEMORY_HOTPLUG_IO_LEN },
>      [VIRT_ACPI_GED] =           { 0x09080000, ACPI_GED_EVT_SEL_LEN },
>      [VIRT_NVDIMM_ACPI] =        { 0x09090000, NVDIMM_ACPI_IO_LEN},
> +    [VIRT_PVTIME] =             { 0x090a0000, 0x00010000 },
>      [VIRT_MMIO] =               { 0x0a000000, 0x00000200 },
>      /* ...repeating for a total of NUM_VIRTIO_TRANSPORTS, each of that size */
>      [VIRT_PLATFORM_BUS] =       { 0x0c000000, 0x02000000 },
> @@ -1663,13 +1664,26 @@ static void finalize_gic_version(VirtMachineState *vms)
>   */
>  static void virt_cpu_post_init(VirtMachineState *vms)
>  {
> -    bool aarch64, pmu;
> +    bool aarch64, pmu, steal_time;
>      CPUState *cpu;
>
>      aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
>      pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
> +    steal_time = object_property_get_bool(OBJECT(first_cpu),
> +                                          "kvm-steal-time", NULL);
>
>      if (kvm_enabled()) {
> +        hwaddr pvtime_base = vms->memmap[VIRT_PVTIME].base;
> +        hwaddr pvtime_size = vms->memmap[VIRT_PVTIME].size;
> +
> +        if (steal_time) {
> +            MemoryRegion *pvtime = g_new(MemoryRegion, 1);
> +
> +            memory_region_init_ram(pvtime, NULL, "pvtime", pvtime_size, NULL);
> +            memory_region_add_subregion(get_system_memory(), pvtime_base,
> +                                        pvtime);

I guess when we are looking at how KVM interacts with MemTag
we'll need to also consider this kind of random-lump-of-ram
(it will need to have tags)... Same thing with framebuffers etc though.

Could you avoid get_system_memory() here and instead pass in
'sysmem' from the caller? The virt board has now 4 different
address spaces it creates and works with (sysmem, secure_sysmem,
tag_sysmem, secure_tag_sysmem) and I think it's going to be better
to aim to consistently have the top level function pass the
lower level functions the MRs they should be putting the devices
in rather than having the low level function say in some cases
"I happen to know that get_system_memory() is the same thing as
'sysmem'".

> +        }
> +
>          CPU_FOREACH(cpu) {
>              if (pmu) {
>                  assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));

> @@ -2528,6 +2558,7 @@ static void virt_machine_5_0_options(MachineClass *mc)
>      mc->numa_mem_supported = true;
>      vmc->acpi_expose_flash = true;
>      mc->auto_enable_numa_with_memdev = false;
> +    vmc->kvm_no_steal_time = true;

This will need to move into the 5_1 options, obviously.

>  }
>  DEFINE_VIRT_MACHINE(5, 0)

> diff --git a/target/arm/kvm32.c b/target/arm/kvm32.c
> index 0af46b41c847..d3f3195077fa 100644
> --- a/target/arm/kvm32.c
> +++ b/target/arm/kvm32.c
> @@ -560,6 +560,11 @@ void kvm_arm_pmu_init(CPUState *cs)
>      qemu_log_mask(LOG_UNIMP, "%s: not implemented\n", __func__);
>  }
>
> +void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
> +{
> +    qemu_log_mask(LOG_UNIMP, "%s: not implemented\n", __func__);
> +}

kvm32.c is going to be removed once 5.1 is out the door
(we deprecated it in 5.0 so can remove in 5.2, I think),
so this is fine.

> +
>  #define ARM_REG_DFSR  ARM_CP15_REG32(0, 5, 0, 0)
>  #define ARM_REG_TTBCR ARM_CP15_REG32(0, 2, 0, 2)
>  /*

> +void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp)
> +{
> +    static bool has_steal_time;
> +    static bool probed;
> +    int fdarray[3];
> +
> +    if (!probed) {
> +        probed = true;
> +        if (kvm_check_extension(kvm_state, KVM_CAP_VCPU_ATTRIBUTES)) {
> +            if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, NULL)) {
> +                error_report("Failed to create scratch VCPU");
> +                abort();
> +            }
> +
> +            has_steal_time = kvm_device_check_attr(fdarray[2],
> +                                                   KVM_ARM_VCPU_PVTIME_CTRL,
> +                                                   KVM_ARM_VCPU_PVTIME_IPA);
> +
> +            kvm_arm_destroy_scratch_host_vcpu(fdarray);

I was a bit surprised that we create a scratch VCPU here, but
it looks like we've opted for "create scratch VCPU, check specific
detail, destroy VCPU" as the usual coding pattern rather than trying
to coalesce into a single "create scratch VCPU once, cache all
the info we might need for later". I guess if somebody (a) cares
about startup performance and (b) finds through profiling that
creation-and-destruction of the scratch VMs/VCPUs is a significant
contributor they can write the refactoring themselves :-)

> +        }
> +    }
> +
> +    if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
> +        if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
> +            cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
> +        } else {
> +            cpu->kvm_steal_time = ON_OFF_AUTO_ON;
> +        }
> +    } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
> +        if (!has_steal_time) {
> +            error_setg(errp, "'kvm-steal-time' cannot be enabled "
> +                             "on this host");
> +            return;
> +        } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
> +            error_setg(errp, "'kvm-steal-time' cannot be enabled "
> +                             "for AArch32 guests");

Why not? Unlike aarch32-host KVM, aarch32-guest KVM is
still supported. What's the missing piece for kvm-steal-time
to work in that setup?

> +            return;
> +        }
> +    }
> +}
> +
>  bool kvm_arm_aarch32_supported(void)
>  {
>      return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);

>  static inline void kvm_arm_add_vcpu_properties(Object *obj) {}
> +static inline void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp) {}

Does this stub need to report an error to the caller via errp,
or is it a "never called but needs to exist to avoid linker errors" ?


thanks
-- PMM

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 0/3] hw/arm/virt: Introduce kvm-steal-time

Andrew Jones-15
In reply to this post by Peter Maydell-5
On Mon, Jul 20, 2020 at 11:16:41AM +0100, Peter Maydell wrote:

> On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:
> >
> > KVM supports the ability to publish the amount of time that VCPUs
> > were runnable, but not running due to other host threads running
> > instead, to the guest. The guest scheduler may use that information
> > when making decisions and the guest may expose it to its userspace
> > (Linux publishes this information in /proc/stat). This feature is
> > called "steal time" as it represents the amount of time stolen from
> > a guest by scheduling out its VCPUs. To enable this feature KVM
> > userspace must provide a memory region that will be used to publish
> > the information to the guest. The memory region is typical migratable
> > region. The GPA of the region is given to KVM through a VCPU device
> > ioctl interface. This feature is only available for 64-bit hosts
> > running 64-bit guests.
> >
> > This series provides the QEMU support of this feature. It will
> > be enabled by default for 5.1 machine types and later, but may
> > be disabled with a new CPU property "kvm-steal-time".
>
> Just a note that this has missed 5.1, but I'll review it at
> some point for 5.2.
>

Yup. No problem. I was out on vacation all last week, so there wasn't
a chance for me to follow up until now anyway. Thanks for the review.
I'll get to the responses / respinning tomorrow after I dig out of my
email backlog.

drew


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 2/3] hw/arm/virt: Move kvm pmu setup to virt_cpu_post_init

Andrew Jones-15
In reply to this post by Peter Maydell-5
On Tue, Jul 21, 2020 at 11:02:30AM +0100, Peter Maydell wrote:

> On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:
> >
> > Move the KVM PMU setup part of fdt_add_pmu_nodes() to
> > virt_cpu_post_init(), which is a more appropriate location. Now
> > fdt_add_pmu_nodes() is also named more appropriately, because it
> > no longer does anything but fdt node creation.
> >
> > No functional change intended.
> >
> > Signed-off-by: Andrew Jones <[hidden email]>
> > ---
> >  hw/arm/virt.c | 33 +++++++++++++++------------------
> >  1 file changed, 15 insertions(+), 18 deletions(-)
> >
> > diff --git a/hw/arm/virt.c b/hw/arm/virt.c
> > index cb2fa99b1ef5..63ef530933e5 100644
> > --- a/hw/arm/virt.c
> > +++ b/hw/arm/virt.c
> > @@ -521,30 +521,15 @@ static void fdt_add_gic_node(VirtMachineState *vms)
> >
> >  static void fdt_add_pmu_nodes(const VirtMachineState *vms)
> >  {
> > -    CPUState *cpu;
> > -    ARMCPU *armcpu;
> > +    ARMCPU *armcpu = ARM_CPU(first_cpu);
> >      uint32_t irqflags = GIC_FDT_IRQ_FLAGS_LEVEL_HI;
> >
> > -    CPU_FOREACH(cpu) {
> > -        armcpu = ARM_CPU(cpu);
> > -        if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
> > -            return;
> > -        }
>
> So previously we would say "if the CPU doesn't actually have
> a PMU, don't put the PMU nodes in the FDT", but in the new logic
> it looks like we put the PMU nodes in the FDT unconditionally ?

Ah, an unintentional change in this patch with no intended changes.
How about adding something like this to the top of fdt_add_pmu_nodes()

    if (!arm_feature(&armcpu->env, ARM_FEATURE_PMU)) {
        assert(!object_property_get_bool(OBJECT(armcpu), "pmu", NULL));
        return;
    }

where the assert is optional - it just mirrors the consistency check in
virt_cpu_post_init()

Thanks,
drew

>
> > -        if (kvm_enabled()) {
> > -            if (kvm_irqchip_in_kernel()) {
> > -                kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
> > -            }
> > -            kvm_arm_pmu_init(cpu);
> > -        }
> > -    }
> > -
> >      if (vms->gic_version == VIRT_GIC_VERSION_2) {
> >          irqflags = deposit32(irqflags, GIC_FDT_IRQ_PPI_CPU_START,
> >                               GIC_FDT_IRQ_PPI_CPU_WIDTH,
> >                               (1 << vms->smp_cpus) - 1);
> >      }
> >
> > -    armcpu = ARM_CPU(qemu_get_cpu(0));
> >      qemu_fdt_add_subnode(vms->fdt, "/pmu");
> >      if (arm_feature(&armcpu->env, ARM_FEATURE_V8)) {
> >          const char compat[] = "arm,armv8-pmuv3";
> > @@ -1678,11 +1663,23 @@ static void finalize_gic_version(VirtMachineState *vms)
> >   */
> >  static void virt_cpu_post_init(VirtMachineState *vms)
> >  {
> > -    bool aarch64;
> > +    bool aarch64, pmu;
> > +    CPUState *cpu;
> >
> >      aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
> > +    pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
> >
> > -    if (!kvm_enabled()) {
> > +    if (kvm_enabled()) {
> > +        CPU_FOREACH(cpu) {
> > +            if (pmu) {
> > +                assert(arm_feature(&ARM_CPU(cpu)->env, ARM_FEATURE_PMU));
> > +                if (kvm_irqchip_in_kernel()) {
> > +                    kvm_arm_pmu_set_irq(cpu, PPI(VIRTUAL_PMU_IRQ));
> > +                }
> > +                kvm_arm_pmu_init(cpu);
> > +            }
> > +        }
> > +    } else {
> >          if (aarch64 && vms->highmem) {
> >              int requested_pa_size = 64 - clz64(vms->highest_gpa);
> >              int pamax = arm_pamax(ARM_CPU(first_cpu));
>
> thanks
> -- PMM
>


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Andrew Jones-15
In reply to this post by Peter Maydell-5
On Tue, Jul 21, 2020 at 11:46:12AM +0100, Peter Maydell wrote:

> > +    if (!probed) {
> > +        probed = true;
> > +        if (kvm_check_extension(kvm_state, KVM_CAP_VCPU_ATTRIBUTES)) {
> > +            if (!kvm_arm_create_scratch_host_vcpu(NULL, fdarray, NULL)) {
> > +                error_report("Failed to create scratch VCPU");
> > +                abort();
> > +            }
> > +
> > +            has_steal_time = kvm_device_check_attr(fdarray[2],
> > +                                                   KVM_ARM_VCPU_PVTIME_CTRL,
> > +                                                   KVM_ARM_VCPU_PVTIME_IPA);
> > +
> > +            kvm_arm_destroy_scratch_host_vcpu(fdarray);
>
> I was a bit surprised that we create a scratch VCPU here, but
> it looks like we've opted for "create scratch VCPU, check specific
> detail, destroy VCPU" as the usual coding pattern rather than trying
> to coalesce into a single "create scratch VCPU once, cache all
> the info we might need for later". I guess if somebody (a) cares
> about startup performance and (b) finds through profiling that
> creation-and-destruction of the scratch VMs/VCPUs is a significant
> contributor they can write the refactoring themselves :-)

There's still a chance I'll be changing this to a KVM CAP if the KVM
maintainers accept the patch I proposed to add one.

>
> > +        }
> > +    }
> > +
> > +    if (cpu->kvm_steal_time == ON_OFF_AUTO_AUTO) {
> > +        if (!has_steal_time || !arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
> > +            cpu->kvm_steal_time = ON_OFF_AUTO_OFF;
> > +        } else {
> > +            cpu->kvm_steal_time = ON_OFF_AUTO_ON;
> > +        }
> > +    } else if (cpu->kvm_steal_time == ON_OFF_AUTO_ON) {
> > +        if (!has_steal_time) {
> > +            error_setg(errp, "'kvm-steal-time' cannot be enabled "
> > +                             "on this host");
> > +            return;
> > +        } else if (!arm_feature(&cpu->env, ARM_FEATURE_AARCH64)) {
> > +            error_setg(errp, "'kvm-steal-time' cannot be enabled "
> > +                             "for AArch32 guests");
>
> Why not? Unlike aarch32-host KVM, aarch32-guest KVM is
> still supported. What's the missing piece for kvm-steal-time
> to work in that setup?

The specification. DEN0057A chapter 2 says "This specification only covers
systems in which the Execution state of the hypervisor as well as EL1 of
virtual machines is AArch64.". And, to ensure that the smc/hvc calls are
only specified as smc64/hvc64. I find that a bit disappointing, since
there's nothing about steal-time that should be 64-bit specific, but
that's how this cookie is crumbling...

I'll add a comment to explain this error for v2.

>
> > +            return;
> > +        }
> > +    }
> > +}
> > +
> >  bool kvm_arm_aarch32_supported(void)
> >  {
> >      return kvm_check_extension(kvm_state, KVM_CAP_ARM_EL1_32BIT);
>
> >  static inline void kvm_arm_add_vcpu_properties(Object *obj) {}
> > +static inline void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp) {}
>
> Does this stub need to report an error to the caller via errp,
> or is it a "never called but needs to exist to avoid linker errors" ?

The second one, as we can't have kvm_enabled() and !defined(CONFIG_KVM).
Hmm, these types of stubs would be more robust to refactoring if we put
build bugs in them. I can try to analyze all the stubs in this #else to
see which ones should be returning false/error/nothing vs. build bugging.

Thanks,
drew


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Peter Maydell-5
In reply to this post by Andrew Jones-15
On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:
> We add the kvm-steal-time CPU property and implement it for machvirt.
> A tiny bit of refactoring was also done to allow pmu and pvtime to
> use the same vcpu device helper functions.
>
> Signed-off-by: Andrew Jones <[hidden email]>

Hi; I'm forwarding a couple of comments here from Beata,
(whose secondment with Linaro is coming to an end so she won't
have access to her Linaro email address to continue the conversation):


>  static void virt_cpu_post_init(VirtMachineState *vms)
>  {
> -    bool aarch64, pmu;
> +    bool aarch64, pmu, steal_time;
>      CPUState *cpu;
>
>      aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
>      pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
> +    steal_time = object_property_get_bool(OBJECT(first_cpu),
> +                                          "kvm-steal-time", NULL);
>
>      if (kvm_enabled()) {
> +        hwaddr pvtime_base = vms->memmap[VIRT_PVTIME].base;
> +        hwaddr pvtime_size = vms->memmap[VIRT_PVTIME].size;
> +
> +        if (steal_time) {
> +            MemoryRegion *pvtime = g_new(MemoryRegion, 1);
> +
> +            memory_region_init_ram(pvtime, NULL, "pvtime", pvtime_size, NULL);
> +            memory_region_add_subregion(get_system_memory(), pvtime_base,
> +                                        pvtime);
> +        }

B: I'm not sure whether it wouldn't be useful to have the area
allocated with size determined by number of VCPUs instead of having
pre-defined size.

> +        if (vmc->kvm_no_steal_time &&
> +            object_property_find(cpuobj, "kvm-steal-time", NULL)) {
> +            object_property_set_bool(cpuobj, false, "kvm-steal-time", NULL);
> +        }
> +
>          if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) {
>              object_property_set_bool(cpuobj, "pmu", false, NULL);
>          }
> @@ -2528,6 +2558,7 @@ static void virt_machine_5_0_options(MachineClass *mc)
>      mc->numa_mem_supported = true;
>      vmc->acpi_expose_flash = true;
>      mc->auto_enable_numa_with_memdev = false;
> +    vmc->kvm_no_steal_time = true;
>  }
>  DEFINE_VIRT_MACHINE(5, 0)
>
> diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> index 54bcf17afd35..b5153afedcdf 100644
> --- a/include/hw/arm/virt.h
> +++ b/include/hw/arm/virt.h
> @@ -80,6 +80,7 @@ enum {
>      VIRT_PCDIMM_ACPI,
>      VIRT_ACPI_GED,
>      VIRT_NVDIMM_ACPI,
> +    VIRT_PVTIME,
>      VIRT_LOWMEMMAP_LAST,
>  };
>
> @@ -126,6 +127,7 @@ typedef struct {
>      bool no_ged;   /* Machines < 4.2 has no support for ACPI GED device */
>      bool kvm_no_adjvtime;
>      bool acpi_expose_flash;
> +    bool kvm_no_steal_time;

B: It is slightly confusing : using kvm_no_steal_time vs kvm_steal_time

P: I have to admit I get confused about which sense this flag
should have. I think the sense of the flags in this struct is
"the false case is the one that the older virt boards had",
so original virt didn't have an ITS or a PMU and so we have
no_its and no_pmu. Similarly here old virt didn't have steal-time
and so we want a no_ flag (ie the patch is correct). Why
kvm_no_steal_time rather than no_kvm_steal_time, though ?

>  } VirtMachineClass;

> +void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
> +{
> +    struct kvm_device_attr attr = {
> +        .group = KVM_ARM_VCPU_PVTIME_CTRL,
> +        .attr = KVM_ARM_VCPU_PVTIME_IPA,
> +        .addr = (uint64_t)&ipa,
> +    };
> +
> +    if (!ARM_CPU(cs)->kvm_steal_time) {
> +        return;
> +    }
> +    if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
> +        error_report("failed to init PVTIME IPA");
> +        abort();
> +    }
> +}

B: I am probably missing smth but .. there is a trigger missing to
update the stats
and write them back to pre-allocated guest memory.
Looking at the kernel code the stats are updated upon pending
VCPU request :
in arch/arm64/kvm/arm.c:
static void check_vcpu_requests(struct kvm_vcpu *vcpu) {
        ...
         if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
                kvm_update_stolen_time(vcpu);
}


thanks
-- PMM

Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Andrew Jones-15
On Fri, Jul 31, 2020 at 03:54:07PM +0100, Peter Maydell wrote:

> On Sat, 11 Jul 2020 at 11:10, Andrew Jones <[hidden email]> wrote:
> > We add the kvm-steal-time CPU property and implement it for machvirt.
> > A tiny bit of refactoring was also done to allow pmu and pvtime to
> > use the same vcpu device helper functions.
> >
> > Signed-off-by: Andrew Jones <[hidden email]>
>
> Hi; I'm forwarding a couple of comments here from Beata,
> (whose secondment with Linaro is coming to an end so she won't
> have access to her Linaro email address to continue the conversation):
>
>
> >  static void virt_cpu_post_init(VirtMachineState *vms)
> >  {
> > -    bool aarch64, pmu;
> > +    bool aarch64, pmu, steal_time;
> >      CPUState *cpu;
> >
> >      aarch64 = object_property_get_bool(OBJECT(first_cpu), "aarch64", NULL);
> >      pmu = object_property_get_bool(OBJECT(first_cpu), "pmu", NULL);
> > +    steal_time = object_property_get_bool(OBJECT(first_cpu),
> > +                                          "kvm-steal-time", NULL);
> >
> >      if (kvm_enabled()) {
> > +        hwaddr pvtime_base = vms->memmap[VIRT_PVTIME].base;
> > +        hwaddr pvtime_size = vms->memmap[VIRT_PVTIME].size;
> > +
> > +        if (steal_time) {
> > +            MemoryRegion *pvtime = g_new(MemoryRegion, 1);
> > +
> > +            memory_region_init_ram(pvtime, NULL, "pvtime", pvtime_size, NULL);
> > +            memory_region_add_subregion(get_system_memory(), pvtime_base,
> > +                                        pvtime);
> > +        }
>
> B: I'm not sure whether it wouldn't be useful to have the area
> allocated with size determined by number of VCPUs instead of having
> pre-defined size.

We can't go smaller than one host-sized page, so this 64k region is the
smallest we can go. The assert in the next hunk, which was snipped
out of the reply, ensures that 64k is large enough to cover the maximum
number of VCPUs that could ever be configured. I don't think there's
anything else we should do at this time. If the pvtime structure grows,
or if we increase the maximum number of VCPUs to be larger than 1024,
then we can revisit this in order to determine when additional 64k pages
should be allocated.

For now, if it would help, I could extend the comment (which was also
snipped) to mention that 64k was chosen because it's the maximum host
page size, and that at least one host-sized page must be allocated for
this region.

>
> > +        if (vmc->kvm_no_steal_time &&
> > +            object_property_find(cpuobj, "kvm-steal-time", NULL)) {
> > +            object_property_set_bool(cpuobj, false, "kvm-steal-time", NULL);
> > +        }
> > +
> >          if (vmc->no_pmu && object_property_find(cpuobj, "pmu", NULL)) {
> >              object_property_set_bool(cpuobj, "pmu", false, NULL);
> >          }
> > @@ -2528,6 +2558,7 @@ static void virt_machine_5_0_options(MachineClass *mc)
> >      mc->numa_mem_supported = true;
> >      vmc->acpi_expose_flash = true;
> >      mc->auto_enable_numa_with_memdev = false;
> > +    vmc->kvm_no_steal_time = true;
> >  }
> >  DEFINE_VIRT_MACHINE(5, 0)
> >
> > diff --git a/include/hw/arm/virt.h b/include/hw/arm/virt.h
> > index 54bcf17afd35..b5153afedcdf 100644
> > --- a/include/hw/arm/virt.h
> > +++ b/include/hw/arm/virt.h
> > @@ -80,6 +80,7 @@ enum {
> >      VIRT_PCDIMM_ACPI,
> >      VIRT_ACPI_GED,
> >      VIRT_NVDIMM_ACPI,
> > +    VIRT_PVTIME,
> >      VIRT_LOWMEMMAP_LAST,
> >  };
> >
> > @@ -126,6 +127,7 @@ typedef struct {
> >      bool no_ged;   /* Machines < 4.2 has no support for ACPI GED device */
> >      bool kvm_no_adjvtime;
> >      bool acpi_expose_flash;
> > +    bool kvm_no_steal_time;
>
> B: It is slightly confusing : using kvm_no_steal_time vs kvm_steal_time
>
> P: I have to admit I get confused about which sense this flag
> should have. I think the sense of the flags in this struct is
> "the false case is the one that the older virt boards had",
> so original virt didn't have an ITS or a PMU and so we have
> no_its and no_pmu. Similarly here old virt didn't have steal-time
> and so we want a no_ flag (ie the patch is correct). Why
> kvm_no_steal_time rather than no_kvm_steal_time, though ?

Correct. We want the default value of the boolean (false) to mean
"not disabled", so the boolean must have a negative name in order
to mean "disabled" when it is true. I'm not opposed to
no_kvm_steal_time vs. kvm_no_steal_time, since the property name
is "kvm-steal-time". I think I ended up with the 'kvm' and 'no'
swapped by copy+pasting kvm_no_adjvtime. However that one is
appropriately named because the property name is "kvm-no-adjvtime".
I'll change this for v2.

>
> >  } VirtMachineClass;
>
> > +void kvm_arm_pvtime_init(CPUState *cs, uint64_t ipa)
> > +{
> > +    struct kvm_device_attr attr = {
> > +        .group = KVM_ARM_VCPU_PVTIME_CTRL,
> > +        .attr = KVM_ARM_VCPU_PVTIME_IPA,
> > +        .addr = (uint64_t)&ipa,
> > +    };
> > +
> > +    if (!ARM_CPU(cs)->kvm_steal_time) {
> > +        return;
> > +    }
> > +    if (!kvm_arm_set_device_attr(cs, &attr, "PVTIME IPA")) {
> > +        error_report("failed to init PVTIME IPA");
> > +        abort();
> > +    }
> > +}
>
> B: I am probably missing smth but .. there is a trigger missing to
> update the stats
> and write them back to pre-allocated guest memory.
> Looking at the kernel code the stats are updated upon pending
> VCPU request :
> in arch/arm64/kvm/arm.c:
> static void check_vcpu_requests(struct kvm_vcpu *vcpu) {
>         ...
>          if (kvm_check_request(KVM_REQ_RECORD_STEAL, vcpu))
>                 kvm_update_stolen_time(vcpu);
> }

kvm_arch_vcpu_load() unconditionally makes that request when pvtime
is enabled.

Thanks,
drew


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Andrew Jones-15
On Sat, Aug 01, 2020 at 02:00:34PM +0200, Andrew Jones wrote:

> > >      if (kvm_enabled()) {
> > > +        hwaddr pvtime_base = vms->memmap[VIRT_PVTIME].base;
> > > +        hwaddr pvtime_size = vms->memmap[VIRT_PVTIME].size;
> > > +
> > > +        if (steal_time) {
> > > +            MemoryRegion *pvtime = g_new(MemoryRegion, 1);
> > > +
> > > +            memory_region_init_ram(pvtime, NULL, "pvtime", pvtime_size, NULL);
> > > +            memory_region_add_subregion(get_system_memory(), pvtime_base,
> > > +                                        pvtime);
> > > +        }
> >
> > B: I'm not sure whether it wouldn't be useful to have the area
> > allocated with size determined by number of VCPUs instead of having
> > pre-defined size.
>
> We can't go smaller than one host-sized page, so this 64k region is the
> smallest we can go. The assert in the next hunk, which was snipped
> out of the reply, ensures that 64k is large enough to cover the maximum
> number of VCPUs that could ever be configured. I don't think there's
> anything else we should do at this time. If the pvtime structure grows,
> or if we increase the maximum number of VCPUs to be larger than 1024,
> then we can revisit this in order to determine when additional 64k pages
> should be allocated.
>
> For now, if it would help, I could extend the comment (which was also
> snipped) to mention that 64k was chosen because it's the maximum host
> page size, and that at least one host-sized page must be allocated for
> this region.
>

In the end, I think I will calculate the size based on host-page-size
and the number of possible guest cpus. The main reason is that it's
actually more awkward to properly document it in a comment than to just
code it. And, we'll also reduce the number of pages that would need to
be migrated when the host is using 4k pages.

Thanks,
drew


Reply | Threaded
Open this post in threaded view
|

Re: [PATCH 3/3] hw/arm/virt: Implement kvm-steal-time

Andrew Jones-15
In reply to this post by Andrew Jones-15
On Wed, Jul 29, 2020 at 04:40:54PM +0200, Andrew Jones wrote:

> > > +static inline void kvm_arm_steal_time_finalize(ARMCPU *cpu, Error **errp) {}
> >
> > Does this stub need to report an error to the caller via errp,
> > or is it a "never called but needs to exist to avoid linker errors" ?
>
> The second one, as we can't have kvm_enabled() and !defined(CONFIG_KVM).
> Hmm, these types of stubs would be more robust to refactoring if we put
> build bugs in them. I can try to analyze all the stubs in this #else to
> see which ones should be returning false/error/nothing vs. build bugging.
>

I couldn't come up with a way to die at compile-time, but I think we
should be able to use g_assert_not_reached() in these functions. How
about something like the diff below?

Thanks,
drew


diff --git a/target/arm/kvm_arm.h b/target/arm/kvm_arm.h
index adb38514bf20..0da00eb6b20c 100644
--- a/target/arm/kvm_arm.h
+++ b/target/arm/kvm_arm.h
@@ -344,16 +344,10 @@ int kvm_arm_set_irq(int cpu, int irqtype, int irq, int level);
 
 #else
 
-static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu)
-{
-    /*
-     * This should never actually be called in the "not KVM" case,
-     * but set up the fields to indicate an error anyway.
-     */
-    cpu->kvm_target = QEMU_KVM_ARM_TARGET_NONE;
-    cpu->host_cpu_probe_failed = true;
-}
-
+/*
+ * It's safe to call these functions without KVM support.
+ * They should either do nothing or return "not supported".
+ */
 static inline void kvm_arm_add_vcpu_properties(Object *obj) {}
 
 static inline bool kvm_arm_aarch32_supported(void)
@@ -371,23 +365,39 @@ static inline bool kvm_arm_sve_supported(void)
     return false;
 }
 
+/*
+ * These functions should never actually be called without KVM support.
+ */
+static inline void kvm_arm_set_cpu_features_from_host(ARMCPU *cpu)
+{
+    g_assert_not_reached();
+}
+
 static inline int kvm_arm_get_max_vm_ipa_size(MachineState *ms)
 {
-    return -ENOENT;
+    g_assert_not_reached();
 }
 
 static inline int kvm_arm_vgic_probe(void)
 {
-    return 0;
+    g_assert_not_reached();
 }
 
-static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq) {}
-static inline void kvm_arm_pmu_init(CPUState *cs) {}
+static inline void kvm_arm_pmu_set_irq(CPUState *cs, int irq)
+{
+    g_assert_not_reached();
+}
 
-static inline void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map) {}
+static inline void kvm_arm_pmu_init(CPUState *cs)
+{
+    g_assert_not_reached();
+}
+
+static inline void kvm_arm_sve_get_vls(CPUState *cs, unsigned long *map)
+{
+    g_assert_not_reached();
+}
 
-static inline void kvm_arm_get_virtual_time(CPUState *cs) {}
-static inline void kvm_arm_put_virtual_time(CPUState *cs) {}
 #endif
 
 static inline const char *gic_class_name(void)