[RFC,lttng-modules] Fix: nmi-safe clock on 32-bit systems
Commit Message
On 32-bit systems, the current algorithm assumes at least one clock
read per 32-bit LSB overflow period, which is not guaranteed. It also
misbehaves on the first clock reads after module load, because the
per-cpu value holding the last 32 LSBs is initially 0. This can cause
the reported time to stay stuck at the same value for a few seconds
at the beginning of the trace.
It only affects 32-bit systems with kernels >= 3.17.
Fix this by using the non-nmi-safe clock source on 32-bit systems,
except for x86 systems that support double-word cmpxchg (cmpxchg8b).
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
---
wrapper/trace-clock.c | 2 +-
wrapper/trace-clock.h | 82 +++++++++++++++++++--------------------------------
2 files changed, 31 insertions(+), 53 deletions(-)
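The failure mode can be reproduced with a stand-alone user-space
simulation of the removed 32-bit fixup (a hypothetical harness for
illustration only, not part of the patch):

	#include <stdint.h>
	#include <stdio.h>

	/* The removed 32-bit fixup, transcribed to user-space types. */
	static uint64_t trace_clock_fixup(uint64_t src_now, uint32_t last)
	{
		uint64_t now = (src_now & 0xFFFFFFFF00000000ULL) | (uint64_t) last;

		/* Detect overflow or underflow of the 32nd bit, as in the old code. */
		if ((src_now & 0x80000000U) && !(last & 0x80000000U)) {
			if (((int32_t) now - (int32_t) last) < 0)
				now += 0x0000000100000000ULL;
		} else if (!(src_now & 0x80000000U) && (last & 0x80000000U)) {
			if (((int32_t) now - (int32_t) last) > 0)
				now -= 0x0000000100000000ULL;
		}
		return now;
	}

	int main(void)
	{
		uint32_t last = 0;	/* initial per-cpu LSB value after module load */
		uint64_t t;

		/* Raw reads landing in the second half of a 32-bit LSB period. */
		for (t = 0x480000000ULL; t < 0x4C0000000ULL; t += 0x10000000ULL) {
			uint64_t now = t;

			/* On 32-bit, "(long) now - (long) last" truncates to 32 bits. */
			if (((int32_t) now - (int32_t) last) < 0)
				now = trace_clock_fixup(now, last);
			last = (uint32_t) now;	/* what local_cmpxchg stored */
			printf("raw 0x%llx -> reported 0x%llx\n",
			       (unsigned long long) t, (unsigned long long) now);
		}
		return 0;
	}

Every iteration reports 0x400000000: with the saved LSBs starting at 0,
each read is "fixed up" back to the same value until the raw LSBs wrap
back below 0x80000000, matching the stuck-clock symptom described above.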
Comments
----- On Feb 9, 2017, at 8:51 PM, Mathieu Desnoyers mathieu.desnoyers at efficios.com wrote:
> On 32-bit systems, the current algorithm assumes at least one clock
> read per 32-bit LSB overflow period, which is not guaranteed. It also
> misbehaves on the first clock reads after module load, because the
> per-cpu value holding the last 32 LSBs is initially 0. This can cause
> the reported time to stay stuck at the same value for a few seconds
> at the beginning of the trace.
>
> It only affects 32-bit systems with kernels >= 3.17.
>
> Fix this by using the non-nmi-safe clock source on 32-bit systems,
> except for x86 systems that support double-word cmpxchg (cmpxchg8b).
Actually, for the fix, I'll go for an even safer approach, and
use the non-nmi-safe clock source on all 32-bit systems, including
x86-32 for now.
Mathieu
>
> Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers at efficios.com>
> ---
> wrapper/trace-clock.c | 2 +-
> wrapper/trace-clock.h | 82 +++++++++++++++++++--------------------------------
> 2 files changed, 31 insertions(+), 53 deletions(-)
>
> diff --git a/wrapper/trace-clock.c b/wrapper/trace-clock.c
> index d9bc956..8c07942 100644
> --- a/wrapper/trace-clock.c
> +++ b/wrapper/trace-clock.c
> @@ -24,7 +24,7 @@
> #include <wrapper/trace-clock.h>
>
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0))
> -DEFINE_PER_CPU(local_t, lttng_last_tsc);
> +DEFINE_PER_CPU(struct lttng_last_timestamp, lttng_last_tsc);
> EXPORT_PER_CPU_SYMBOL(lttng_last_tsc);
> #endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0)) */
>
> diff --git a/wrapper/trace-clock.h b/wrapper/trace-clock.h
> index 496fec4..7f76b1b 100644
> --- a/wrapper/trace-clock.h
> +++ b/wrapper/trace-clock.h
> @@ -59,65 +59,43 @@ extern struct lttng_trace_clock *lttng_trace_clock;
> #define LTTNG_CLOCK_NMI_SAFE_BROKEN
> #endif
>
> +/*
> + * We need clock values to be monotonically increasing per-cpu, which is
> + * not strictly guaranteed by ktime_get_mono_fast_ns(). It is
> + * straightforward to do on architectures with a 64-bit cmpxchg(), but
> + * not so on architectures without 64-bit cmpxchg.
> + */
> +
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) \
> + && (BITS_PER_LONG == 64 || CONFIG_X86_CMPXCHG64) \
> && !defined(LTTNG_CLOCK_NMI_SAFE_BROKEN))
> +#define LTTNG_USE_NMI_SAFE_CLOCK
> +#endif
>
> -DECLARE_PER_CPU(local_t, lttng_last_tsc);
> +#ifdef LTTNG_USE_NMI_SAFE_CLOCK
>
> -#if (BITS_PER_LONG == 32)
> /*
> - * Fixup "src_now" using the 32 LSB from "last". We need to handle overflow and
> - * underflow of the 32nd bit. "last" can be above, below or equal to the 32 LSB
> - * of "src_now".
> + * Ensure the variable is aligned on 64-bit for cmpxchg8b on 32-bit
> + * systems.
> */
> -static inline u64 trace_clock_fixup(u64 src_now, u32 last)
> -{
> - u64 now;
> +struct lttng_last_timestamp {
> + u64 v;
> +} __attribute__((aligned(sizeof(u64))));
>
> - now = src_now & 0xFFFFFFFF00000000ULL;
> - now |= (u64) last;
> - /* Detect overflow or underflow between now and last. */
> - if ((src_now & 0x80000000U) && !(last & 0x80000000U)) {
> - /*
> - * If 32nd bit transitions from 1 to 0, and we move forward in
> - * time from "now" to "last", then we have an overflow.
> - */
> - if (((s32) now - (s32) last) < 0)
> - now += 0x0000000100000000ULL;
> - } else if (!(src_now & 0x80000000U) && (last & 0x80000000U)) {
> - /*
> - * If 32nd bit transitions from 0 to 1, and we move backward in
> - * time from "now" to "last", then we have an underflow.
> - */
> - if (((s32) now - (s32) last) > 0)
> - now -= 0x0000000100000000ULL;
> - }
> - return now;
> -}
> -#else /* #if (BITS_PER_LONG == 32) */
> -/*
> - * The fixup is pretty easy on 64-bit architectures: "last" is a 64-bit
> - * value, so we can use last directly as current time.
> - */
> -static inline u64 trace_clock_fixup(u64 src_now, u64 last)
> -{
> - return last;
> -}
> -#endif /* #else #if (BITS_PER_LONG == 32) */
> +DECLARE_PER_CPU(struct lttng_last_timestamp, lttng_last_tsc);
>
> /*
> * Sometimes called with preemption enabled. Can be interrupted.
> */
> static inline u64 trace_clock_monotonic_wrapper(void)
> {
> - u64 now;
> - unsigned long last, result;
> - local_t *last_tsc;
> + u64 now, last, result;
> + struct lttng_last_timestamp *last_tsc_ptr;
>
> /* Use fast nmi-safe monotonic clock provided by the Linux kernel. */
> preempt_disable();
> - last_tsc = lttng_this_cpu_ptr(&lttng_last_tsc);
> - last = local_read(last_tsc);
> + last_tsc_ptr = lttng_this_cpu_ptr(&lttng_last_tsc);
> + last = last_tsc_ptr->v;
> /*
> * Read "last" before "now". It is not strictly required, but it ensures
> * that an interrupt coming in won't artificially trigger a case where
> @@ -126,9 +104,9 @@ static inline u64 trace_clock_monotonic_wrapper(void)
> */
> barrier();
> now = ktime_get_mono_fast_ns();
> - if (((long) now - (long) last) < 0)
> - now = trace_clock_fixup(now, last);
> - result = local_cmpxchg(last_tsc, last, (unsigned long) now);
> + if (U64_MAX / 2 < now - last)
> + now = last;
> + result = cmpxchg64_local(&last_tsc_ptr->v, last, now);
> preempt_enable();
> if (result == last) {
> /* Update done. */
> @@ -139,11 +117,11 @@ static inline u64 trace_clock_monotonic_wrapper(void)
> * "result", since it has been sampled concurrently with our
> * time read, so it should not be far from "now".
> */
> - return trace_clock_fixup(now, result);
> + return result;
> }
> }
>
> -#else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)) */
> +#else /* #ifdef LTTNG_USE_NMI_SAFE_CLOCK */
> static inline u64 trace_clock_monotonic_wrapper(void)
> {
> ktime_t ktime;
> @@ -158,7 +136,7 @@ static inline u64 trace_clock_monotonic_wrapper(void)
> ktime = ktime_get();
> return ktime_to_ns(ktime);
> }
> -#endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)) */
> +#endif /* #else #ifdef LTTNG_USE_NMI_SAFE_CLOCK */
>
> static inline u64 trace_clock_read64_monotonic(void)
> {
> @@ -185,19 +163,19 @@ static inline const char *trace_clock_description_monotonic(void)
> return "Monotonic Clock";
> }
>
> -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0))
> +#ifdef LTTNG_USE_NMI_SAFE_CLOCK
> static inline int get_trace_clock(void)
> {
> printk_once(KERN_WARNING "LTTng: Using mainline kernel monotonic fast clock, which is NMI-safe.\n");
> return 0;
> }
> -#else /* #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)) */
> +#else /* #ifdef LTTNG_USE_NMI_SAFE_CLOCK */
> static inline int get_trace_clock(void)
> {
> printk_once(KERN_WARNING "LTTng: Using mainline kernel monotonic clock. NMIs will not be traced.\n");
> return 0;
> }
> -#endif /* #else #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0)) */
> +#endif /* #else #ifdef LTTNG_USE_NMI_SAFE_CLOCK */
>
> static inline void put_trace_clock(void)
> {
> --
> 2.1.4
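A note on the replacement for trace_clock_fixup(): the new check
"if (U64_MAX / 2 < now - last)" relies on unsigned wrap-around. When
ktime_get_mono_fast_ns() is observed to go backwards on this CPU, the
unsigned difference now - last wraps to a huge value, and the read is
clamped to the last value seen. A minimal stand-alone sketch of the same
check (hypothetical helper name, for illustration only):

	#include <assert.h>
	#include <stdint.h>

	static uint64_t clamp_monotonic(uint64_t now, uint64_t last)
	{
		/* If now went backwards, now - last wraps above UINT64_MAX / 2. */
		if (UINT64_MAX / 2 < now - last)
			return last;	/* clock went backwards: reuse last value */
		return now;
	}

	int main(void)
	{
		assert(clamp_monotonic(100, 50) == 100);	/* forward: keep now */
		assert(clamp_monotonic(50, 100) == 100);	/* backward: clamp to last */
		return 0;
	}

The patch open-codes this inside trace_clock_monotonic_wrapper() rather
than using a helper.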
On Thu, 2017-02-09 at 20:51 -0500, mathieu.desnoyers wrote:
> On 32-bit systems, the current algorithm assumes at least one clock
> read per 32-bit LSB overflow period, which is not guaranteed. It also
> misbehaves on the first clock reads after module load, because the
> per-cpu value holding the last 32 LSBs is initially 0. This can cause
> the reported time to stay stuck at the same value for a few seconds
> at the beginning of the trace.
I tested this on my 32-bit ARM system and it solved the issue of the
timestamp being stuck at zero until the first time the TSC wraps.
I'm not sure how to properly test that NMIs do not generate any reverse
timestamps.
> Fix this by using the non-nmi-safe clock source on 32-bit systems,
> except for x86 systems that support double-word cmpxchg (cmpxchg8b).
Is there a reason to restrict this to x86 systems? ARM supports 64-bit
cmpxchg too, and indeed that's what I was testing.
> +
> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) \
> + && (BITS_PER_LONG == 64 || CONFIG_X86_CMPXCHG64) \
> && !defined(LTTNG_CLOCK_NMI_SAFE_BROKEN))
This fails to compile on ARM, as CONFIG_X86_CMPXCHG64 is not defined.
----- On Feb 14, 2017, at 3:02 PM, Trent Piepho tpiepho at kymetacorp.com wrote:
> On Thu, 2017-02-09 at 20:51 -0500, mathieu.desnoyers wrote:
>> On 32-bit systems, the current algorithm assumes at least one clock
>> read per 32-bit LSB overflow period, which is not guaranteed. It also
>> misbehaves on the first clock reads after module load, because the
>> per-cpu value holding the last 32 LSBs is initially 0. This can cause
>> the reported time to stay stuck at the same value for a few seconds
>> at the beginning of the trace.
>
> I tested this on my 32-bit ARM system and it solved the issue of the
> timestamp being stuck at zero until the first time the TSC wraps.
>
> I'm not sure how to properly test that NMIs do not generate any reverse
> timestamps.
NMIs don't exist on ARM. They have FIQs (fast interrupts), which are
close, but no NMIs. It would be a good thing to ensure that either FIQs
are disabled when the OS updates the clock, or that no tracing is ever
done in FIQ handlers. I don't think FIQs are disabled when updating the
clock, but I suspect that no tracing is done in FIQ handlers.
Also, with that fix on ARM, lttng-modules just discards events that
would come from NMI context (and since NMI context doesn't exist on
ARM, it never discards anything).
>
>
>> Fix this by using the non-nmi-safe clock source on 32-bit systems,
>> except for x86 systems that support double-word cmpxchg (cmpxchg8b).
>
> Is there a reason to restrict this to x86 systems? ARM supports 64-bit
> cmpxchg too, and indeed that's what I was testing.
Since ARM doesn't have NMIs, I don't see why we should care about
making the clock source nmi-safe, especially since the algorithm adds a
cmpxchg64_local (a slight performance overhead). This might end up
being useful if we ever want to trace fast interrupt handlers, though.
>
>> +
>> #if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) \
>> + && (BITS_PER_LONG == 64 || CONFIG_X86_CMPXCHG64) \
>> && !defined(LTTNG_CLOCK_NMI_SAFE_BROKEN))
>
> This fails to compile on ARM, as CONFIG_X86_CMPXCHG64 is not defined.
> From what I can see, all ARM have cmpxchg64_local() so I modified this
> to allow ARM support, used defined(), etc.
>
> It also appears that even if CONFIG_X86_CMPXCHG64 is not defined on x86,
> there is still support via another definition of cmpxchg64_local() using
> cmpxchg8b_emu. Or does this version lack the atomicity in the face of
> NMIs that the cmpxchg8b instruction would have?
Indeed, cmpxchg8b_emu disables interrupts on UP within its emulation, which
is not safe against NMIs.
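To make that concrete, an irq-disabling emulation has roughly the
following shape (a sketch only, not the actual cmpxchg8b_emu code):

	#include <linux/irqflags.h>
	#include <linux/types.h>

	/* Sketch of an irq-disable based 64-bit cmpxchg emulation. */
	static inline u64 emulated_cmpxchg64_local(u64 *ptr, u64 old, u64 new)
	{
		unsigned long flags;
		u64 prev;

		local_irq_save(flags);	/* masks regular interrupts, not NMIs */
		prev = *ptr;
		/*
		 * An NMI firing here can read or modify *ptr between the load
		 * above and the store below, so the emulated operation is not
		 * atomic with respect to NMI context.
		 */
		if (prev == old)
			*ptr = new;
		local_irq_restore(flags);
		return prev;
	}

Masking IRQs makes the read-compare-write sequence atomic with respect
to regular interrupts, but an NMI can still fire between the load and
the store, which is exactly the window the nmi-safe clock cannot
tolerate.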
I am tempted to only use the nmi-safe clock source on architectures
where it matters, and ARM32/64 don't have NMIs. So I would use
the usual seqlock-protected clock source on ARM32/64, unless we
end up figuring out that FIQ handlers are instrumented with
tracepoints.
For x86 32/64, when a 64-bit local cmpxchg is natively available,
I would like to use the nmi-safe clock source. However, using
double-word CAS on x86-32 would be a new behavior in lttng-modules,
and I don't want to introduce it as a fix into the stable branches.
This is why the commit that landed in master currently simply reverts
to the non-nmi-safe clock source on x86-32, and does not trace NMIs
on x86-32 for now.
I would be OK with introducing double-word CAS on x86-32 as a new
feature in upcoming lttng-modules versions (2.10+). But I would only
use it when CONFIG_X86_CMPXCHG64 is defined: it tells us that the
architecture does have a 64-bit cmpxchg instruction.
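A sketch of what that 2.10+ gating could look like (proposed condition
only, not committed code):

	#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3,17,0) \
		&& (BITS_PER_LONG == 64 || defined(CONFIG_X86_CMPXCHG64)) \
		&& !defined(LTTNG_CLOCK_NMI_SAFE_BROKEN))
	#define LTTNG_USE_NMI_SAFE_CLOCK
	#endif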
Thoughts?
Thanks,
Mathieu