[4/7] Replace the internal pointer manipulation with __atomic builtins
Commit Message
Instead of custom code, use the __atomic builtins to implement the
rcu_dereference(), rcu_cmpxchg_pointer(), rcu_xchg_pointer() and
rcu_assign_pointer().
Signed-off-by: Ondřej Surý <ondrej at sury.org>
---
include/urcu/arch.h | 20 +++++++++
include/urcu/arch/alpha.h | 6 +++
include/urcu/arch/arm.h | 12 ++++++
include/urcu/arch/mips.h | 2 +
include/urcu/arch/nios2.h | 2 +
include/urcu/arch/ppc.h | 6 +++
include/urcu/arch/s390.h | 2 +
include/urcu/arch/sparc64.h | 6 +++
include/urcu/arch/x86.h | 20 +++++++++
include/urcu/static/pointer.h | 77 +++++++----------------------------
10 files changed, 90 insertions(+), 63 deletions(-)
Comments
On 2023-03-17 17:37, Ondřej Surý via lttng-dev wrote:
> Instead of custom code, use the __atomic builtins to implement the
> rcu_dereference(), rcu_cmpxchg_pointer(), rcu_xchg_pointer() and
> rcu_assign_pointer().
This also changes the cmm_mb() family of functions, but not everywhere.
This should be documented.
I'm also unsure why architecture code has #ifndef cmm_mb when we would
expect the generic arch implementation to be conditional (the other way
around).
>
> Signed-off-by: Ondřej Surý <ondrej at sury.org>
> ---
> include/urcu/arch.h | 20 +++++++++
> include/urcu/arch/alpha.h | 6 +++
> include/urcu/arch/arm.h | 12 ++++++
> include/urcu/arch/mips.h | 2 +
> include/urcu/arch/nios2.h | 2 +
> include/urcu/arch/ppc.h | 6 +++
> include/urcu/arch/s390.h | 2 +
> include/urcu/arch/sparc64.h | 6 +++
> include/urcu/arch/x86.h | 20 +++++++++
> include/urcu/static/pointer.h | 77 +++++++----------------------------
> 10 files changed, 90 insertions(+), 63 deletions(-)
>
> diff --git a/include/urcu/arch.h b/include/urcu/arch.h
> index d3914da..aec6fa1 100644
> --- a/include/urcu/arch.h
> +++ b/include/urcu/arch.h
> @@ -21,6 +21,26 @@
> #ifndef _URCU_ARCH_H
> #define _URCU_ARCH_H
>
> +#if !defined(__has_feature)
> +#define __has_feature(x) 0
> +#endif /* if !defined(__has_feature) */
> +
> +/* GCC defines __SANITIZE_ADDRESS__, so reuse the macro for clang */
> +#if __has_feature(address_sanitizer)
> +#define __SANITIZE_ADDRESS__ 1
> +#endif /* if __has_feature(address_sanitizer) */
> +
> +#ifdef __SANITIZE_THREAD__
> +/* FIXME: Somebody who understands the barriers should look into this */
> +#define cmm_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)
This really needs to be __ATOMIC_SEQ_CST.
> +#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> +#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
I am really unsure that rmb/wmb semantics map to acq/rel. Paul, can you
confirm ?
> +#define cmm_smp_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)
SEQ_CST.
> +#define cmm_smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
> +#define cmm_smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
Unsure (see above).
> +#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_ACQ_REL)
This would map to __ATOMIC_CONSUME, but AFAIK the current implementation
of this semantic is done with __ATOMIC_ACQUIRE which is stronger than
what is really needed here. So we can expect a slowdown on some
architectures if we go that way.
Should we favor code simplicity and long-term maintainability at the
expense of performance in the short-term ? Or should we keep
arch-specific implementations until the toolchains end up implementing a
proper consume semantic ?
> +#endif
> +
> /*
> * Architecture detection using compiler defines.
> *
> diff --git a/include/urcu/arch/alpha.h b/include/urcu/arch/alpha.h
> index dc33e28..84526ef 100644
> --- a/include/urcu/arch/alpha.h
> +++ b/include/urcu/arch/alpha.h
> @@ -29,9 +29,15 @@
> extern "C" {
> #endif
>
> +#ifndef cmm_mb
> #define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
> +#endif
> +#ifndef cmm_wmb
> #define cmm_wmb() __asm__ __volatile__ ("wmb":::"memory")
> +#endif
> +#ifndef cmm_read_barrier_depends
> #define cmm_read_barrier_depends() __asm__ __volatile__ ("mb":::"memory")
> +#endif
>
[...]
> diff --git a/include/urcu/static/pointer.h b/include/urcu/static/pointer.h
> index 9e46a57..3f116f3 100644
> --- a/include/urcu/static/pointer.h
> +++ b/include/urcu/static/pointer.h
> @@ -38,6 +38,8 @@
> extern "C" {
> #endif
>
> +#define _rcu_get_pointer(addr) __atomic_load_n(addr, __ATOMIC_CONSUME)
> +
> /**
> * _rcu_dereference - reads (copy) a RCU-protected pointer to a local variable
> * into a RCU read-side critical section. The pointer can later be safely
> @@ -49,14 +51,6 @@ extern "C" {
> * Inserts memory barriers on architectures that require them (currently only
> * Alpha) and documents which pointers are protected by RCU.
> *
> - * With C standards prior to C11/C++11, the compiler memory barrier in
> - * CMM_LOAD_SHARED() ensures that value-speculative optimizations (e.g.
> - * VSS: Value Speculation Scheduling) does not perform the data read
> - * before the pointer read by speculating the value of the pointer.
> - * Correct ordering is ensured because the pointer is read as a volatile
> - * access. This acts as a global side-effect operation, which forbids
> - * reordering of dependent memory operations.
We should document that we end up relying on CONSUME for rcu_dereference
in the patch commit message.
> - *
> * With C standards C11/C++11, concerns about dependency-breaking
> * optimizations are taken care of by the "memory_order_consume" atomic
> * load.
> @@ -65,10 +59,6 @@ extern "C" {
> * explicit because the pointer used as input argument is a pointer,
> * not an _Atomic type as required by C11/C++11.
> *
> - * By defining URCU_DEREFERENCE_USE_VOLATILE, the user requires use of
> - * volatile access to implement rcu_dereference rather than
> - * memory_order_consume load from the C11/C++11 standards.
> - *
> * This may improve performance on weakly-ordered architectures where
> * the compiler implements memory_order_consume as a
> * memory_order_acquire, which is stricter than required by the
> @@ -83,35 +73,7 @@ extern "C" {
> * meets the 10-line criterion in LGPL, allowing this function to be
> * expanded directly in non-LGPL code.
> */
> -
> -#if !defined (URCU_DEREFERENCE_USE_VOLATILE) && \
> - ((defined (__cplusplus) && __cplusplus >= 201103L) || \
> - (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
> -# define __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
> -#endif
> -
> -/*
> - * If p is const (the pointer itself, not what it points to), using
> - * __typeof__(p) would declare a const variable, leading to
> - * -Wincompatible-pointer-types errors. Using the statement expression
> - * makes it an rvalue and gets rid of the const-ness.
> - */
> -#ifdef __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
> -# define _rcu_dereference(p) __extension__ ({ \
> - __typeof__(__extension__ ({ \
> - __typeof__(p) __attribute__((unused)) _________p0 = { 0 }; \
> - _________p0; \
> - })) _________p1; \
> - __atomic_load(&(p), &_________p1, __ATOMIC_CONSUME); \
> - (_________p1); \
> - })
> -#else
> -# define _rcu_dereference(p) __extension__ ({ \
> - __typeof__(p) _________p1 = CMM_LOAD_SHARED(p); \
> - cmm_smp_read_barrier_depends(); \
> - (_________p1); \
> - })
> -#endif
> +#define _rcu_dereference(p) _rcu_get_pointer(&(p))
>
> /**
> * _rcu_cmpxchg_pointer - same as rcu_assign_pointer, but tests if the pointer
> @@ -126,12 +88,12 @@ extern "C" {
> * meets the 10-line criterion in LGPL, allowing this function to be
> * expanded directly in non-LGPL code.
> */
> -#define _rcu_cmpxchg_pointer(p, old, _new) \
> - __extension__ \
> - ({ \
> - __typeof__(*p) _________pold = (old); \
> - __typeof__(*p) _________pnew = (_new); \
> - uatomic_cmpxchg(p, _________pold, _________pnew); \
> +#define _rcu_cmpxchg_pointer(p, old, _new) \
> + ({ \
> + __typeof__(*(p)) __old = old; \
> + __atomic_compare_exchange_n(p, &__old, _new, 0, \
> + __ATOMIC_ACQ_REL, __ATOMIC_CONSUME); \
__ATOMIC_SEQ_CST on both success and failure.
> + __old; \
> })
>
> /**
> @@ -145,22 +107,11 @@ extern "C" {
> * meets the 10-line criterion in LGPL, allowing this function to be
> * expanded directly in non-LGPL code.
> */
> -#define _rcu_xchg_pointer(p, v) \
> - __extension__ \
> - ({ \
> - __typeof__(*p) _________pv = (v); \
> - uatomic_xchg(p, _________pv); \
> - })
> -
> +#define _rcu_xchg_pointer(p, v) \
> + __atomic_exchange_n(p, v, __ATOMIC_ACQ_REL)
__ATOMIC_SEQ_CST.
>
> -#define _rcu_set_pointer(p, v) \
> - do { \
> - __typeof__(*p) _________pv = (v); \
> - if (!__builtin_constant_p(v) || \
> - ((v) != NULL)) \
> - cmm_wmb(); \
> - uatomic_set(p, _________pv); \
> - } while (0)
> +#define _rcu_set_pointer(p, v) \
> + __atomic_store_n(p, v, __ATOMIC_RELEASE)
OK.
Thanks,
Mathieu
>
> /**
> * _rcu_assign_pointer - assign (publicize) a pointer to a new data structure
> @@ -178,7 +129,7 @@ extern "C" {
> * meets the 10-line criterion in LGPL, allowing this function to be
> * expanded directly in non-LGPL code.
> */
> -#define _rcu_assign_pointer(p, v) _rcu_set_pointer(&(p), v)
> +#define _rcu_assign_pointer(p, v) _rcu_set_pointer(&(p), v)
>
> #ifdef __cplusplus
> }
@@ -21,6 +21,26 @@
#ifndef _URCU_ARCH_H
#define _URCU_ARCH_H
+#if !defined(__has_feature)
+#define __has_feature(x) 0
+#endif /* if !defined(__has_feature) */
+
+/* GCC defines __SANITIZE_ADDRESS__, so reuse the macro for clang */
+#if __has_feature(address_sanitizer)
+#define __SANITIZE_ADDRESS__ 1
+#endif /* if __has_feature(address_sanitizer) */
+
+#ifdef __SANITIZE_THREAD__
+/* FIXME: Somebody who understands the barriers should look into this */
+#define cmm_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
+#define cmm_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
+#define cmm_smp_mb() __atomic_thread_fence(__ATOMIC_ACQ_REL)
+#define cmm_smp_rmb() __atomic_thread_fence(__ATOMIC_ACQUIRE)
+#define cmm_smp_wmb() __atomic_thread_fence(__ATOMIC_RELEASE)
+#define cmm_read_barrier_depends() __atomic_thread_fence(__ATOMIC_ACQ_REL)
+#endif
+
/*
* Architecture detection using compiler defines.
*
@@ -29,9 +29,15 @@
extern "C" {
#endif
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("mb":::"memory")
+#endif
+#ifndef cmm_wmb
#define cmm_wmb() __asm__ __volatile__ ("wmb":::"memory")
+#endif
+#ifndef cmm_read_barrier_depends
#define cmm_read_barrier_depends() __asm__ __volatile__ ("mb":::"memory")
+#endif
/*
* On Linux, define the membarrier system call number if not yet available in
@@ -42,16 +42,28 @@ extern "C" {
/*
* Issues full system DMB operation.
*/
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("dmb sy":::"memory")
+#endif
+#ifndef cmm_rmb
#define cmm_rmb() __asm__ __volatile__ ("dmb sy":::"memory")
+#endif
+#ifndef cmm_wmb
#define cmm_wmb() __asm__ __volatile__ ("dmb sy":::"memory")
+#endif
/*
* Issues DMB operation only to the inner shareable domain.
*/
+#ifndef cmm_smp_mb
#define cmm_smp_mb() __asm__ __volatile__ ("dmb ish":::"memory")
+#endif
+#ifndef cmm_smp_rmb
#define cmm_smp_rmb() __asm__ __volatile__ ("dmb ish":::"memory")
+#endif
+#ifndef cmm_smp_wmb
#define cmm_smp_wmb() __asm__ __volatile__ ("dmb ish":::"memory")
+#endif
#endif /* URCU_ARCH_ARMV7 */
@@ -30,11 +30,13 @@
extern "C" {
#endif
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ( \
" .set mips2 \n" \
" sync \n" \
" .set mips0 \n" \
:::"memory")
+#endif
#ifdef __cplusplus
}
@@ -29,7 +29,9 @@
extern "C" {
#endif
+#ifndef cmm_mb
#define cmm_mb() cmm_barrier()
+#endif
#ifdef __cplusplus
}
@@ -48,7 +48,9 @@ extern "C" {
* order cacheable and non-cacheable memory operations separately---i.e.
* not the latter against the former.
*/
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("sync":::"memory")
+#endif
/*
* lwsync orders loads in cacheable memory with respect to other loads,
@@ -56,8 +58,12 @@ extern "C" {
* Therefore, use it for barriers ordering accesses to cacheable memory
* only.
*/
+#ifndef cmm_smp_rmb
#define cmm_smp_rmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
+#endif
+#ifndef cmm_smp_wmb
#define cmm_smp_wmb() __asm__ __volatile__ (LWSYNC_OPCODE:::"memory")
+#endif
#define mftbl() \
__extension__ \
@@ -39,7 +39,9 @@ extern "C" {
#define CAA_CACHE_LINE_SIZE 128
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__("bcr 15,0" : : : "memory")
+#endif
#define HAS_CAA_GET_CYCLES
@@ -49,9 +49,15 @@ __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \
"1:\n" \
: : : "memory")
+#ifndef cmm_mb
#define cmm_mb() membar_safe("#LoadLoad | #LoadStore | #StoreStore | #StoreLoad")
+#endif
+#ifndef cmm_rmb
#define cmm_rmb() membar_safe("#LoadLoad")
+#endif
+#ifndef cmm_wmb
#define cmm_wmb() membar_safe("#StoreStore")
+#endif
#ifdef __cplusplus
}
@@ -46,15 +46,23 @@ extern "C" {
/* For backwards compat */
#define CONFIG_RCU_HAVE_FENCE 1
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("mfence":::"memory")
+#endif
/*
* Define cmm_rmb/cmm_wmb to "strict" barriers that may be needed when
* using SSE or working with I/O areas. cmm_smp_rmb/cmm_smp_wmb are
* only compiler barriers, which is enough for general use.
*/
+#ifndef cmm_rmb
#define cmm_rmb() __asm__ __volatile__ ("lfence":::"memory")
+#endif
+
+#ifndef cmm_wmb
#define cmm_wmb() __asm__ __volatile__ ("sfence"::: "memory")
+#endif
+
#define cmm_smp_rmb() cmm_barrier()
#define cmm_smp_wmb() cmm_barrier()
@@ -72,15 +80,27 @@ extern "C" {
* under our feet; cmm_smp_wmb() ceases to be a nop for these processors.
*/
#if (CAA_BITS_PER_LONG == 32)
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
+#ifndef cmm_rmb
#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
+#ifndef cmm_wmb
#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%esp)":::"memory")
+#endif
#else
+#ifndef cmm_mb
#define cmm_mb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
+#endif
+#ifndef cmm_rmb
#define cmm_rmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
+#endif
+#ifndef cmm_wmb
#define cmm_wmb() __asm__ __volatile__ ("lock; addl $0,0(%%rsp)":::"memory")
#endif
#endif
+#endif
#define caa_cpu_relax() __asm__ __volatile__ ("rep; nop" : : : "memory")
@@ -38,6 +38,8 @@
extern "C" {
#endif
+#define _rcu_get_pointer(addr) __atomic_load_n(addr, __ATOMIC_CONSUME)
+
/**
* _rcu_dereference - reads (copy) a RCU-protected pointer to a local variable
* into a RCU read-side critical section. The pointer can later be safely
@@ -49,14 +51,6 @@ extern "C" {
* Inserts memory barriers on architectures that require them (currently only
* Alpha) and documents which pointers are protected by RCU.
*
- * With C standards prior to C11/C++11, the compiler memory barrier in
- * CMM_LOAD_SHARED() ensures that value-speculative optimizations (e.g.
- * VSS: Value Speculation Scheduling) does not perform the data read
- * before the pointer read by speculating the value of the pointer.
- * Correct ordering is ensured because the pointer is read as a volatile
- * access. This acts as a global side-effect operation, which forbids
- * reordering of dependent memory operations.
- *
* With C standards C11/C++11, concerns about dependency-breaking
* optimizations are taken care of by the "memory_order_consume" atomic
* load.
@@ -65,10 +59,6 @@ extern "C" {
* explicit because the pointer used as input argument is a pointer,
* not an _Atomic type as required by C11/C++11.
*
- * By defining URCU_DEREFERENCE_USE_VOLATILE, the user requires use of
- * volatile access to implement rcu_dereference rather than
- * memory_order_consume load from the C11/C++11 standards.
- *
* This may improve performance on weakly-ordered architectures where
* the compiler implements memory_order_consume as a
* memory_order_acquire, which is stricter than required by the
@@ -83,35 +73,7 @@ extern "C" {
* meets the 10-line criterion in LGPL, allowing this function to be
* expanded directly in non-LGPL code.
*/
-
-#if !defined (URCU_DEREFERENCE_USE_VOLATILE) && \
- ((defined (__cplusplus) && __cplusplus >= 201103L) || \
- (defined (__STDC_VERSION__) && __STDC_VERSION__ >= 201112L))
-# define __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
-#endif
-
-/*
- * If p is const (the pointer itself, not what it points to), using
- * __typeof__(p) would declare a const variable, leading to
- * -Wincompatible-pointer-types errors. Using the statement expression
- * makes it an rvalue and gets rid of the const-ness.
- */
-#ifdef __URCU_DEREFERENCE_USE_ATOMIC_CONSUME
-# define _rcu_dereference(p) __extension__ ({ \
- __typeof__(__extension__ ({ \
- __typeof__(p) __attribute__((unused)) _________p0 = { 0 }; \
- _________p0; \
- })) _________p1; \
- __atomic_load(&(p), &_________p1, __ATOMIC_CONSUME); \
- (_________p1); \
- })
-#else
-# define _rcu_dereference(p) __extension__ ({ \
- __typeof__(p) _________p1 = CMM_LOAD_SHARED(p); \
- cmm_smp_read_barrier_depends(); \
- (_________p1); \
- })
-#endif
+#define _rcu_dereference(p) _rcu_get_pointer(&(p))
/**
* _rcu_cmpxchg_pointer - same as rcu_assign_pointer, but tests if the pointer
@@ -126,12 +88,12 @@ extern "C" {
* meets the 10-line criterion in LGPL, allowing this function to be
* expanded directly in non-LGPL code.
*/
-#define _rcu_cmpxchg_pointer(p, old, _new) \
- __extension__ \
- ({ \
- __typeof__(*p) _________pold = (old); \
- __typeof__(*p) _________pnew = (_new); \
- uatomic_cmpxchg(p, _________pold, _________pnew); \
+#define _rcu_cmpxchg_pointer(p, old, _new) \
+ ({ \
+ __typeof__(*(p)) __old = old; \
+ __atomic_compare_exchange_n(p, &__old, _new, 0, \
+ __ATOMIC_ACQ_REL, __ATOMIC_CONSUME); \
+ __old; \
})
/**
@@ -145,22 +107,11 @@ extern "C" {
* meets the 10-line criterion in LGPL, allowing this function to be
* expanded directly in non-LGPL code.
*/
-#define _rcu_xchg_pointer(p, v) \
- __extension__ \
- ({ \
- __typeof__(*p) _________pv = (v); \
- uatomic_xchg(p, _________pv); \
- })
-
+#define _rcu_xchg_pointer(p, v) \
+ __atomic_exchange_n(p, v, __ATOMIC_ACQ_REL)
-#define _rcu_set_pointer(p, v) \
- do { \
- __typeof__(*p) _________pv = (v); \
- if (!__builtin_constant_p(v) || \
- ((v) != NULL)) \
- cmm_wmb(); \
- uatomic_set(p, _________pv); \
- } while (0)
+#define _rcu_set_pointer(p, v) \
+ __atomic_store_n(p, v, __ATOMIC_RELEASE)
/**
* _rcu_assign_pointer - assign (publicize) a pointer to a new data structure
@@ -178,7 +129,7 @@ extern "C" {
* meets the 10-line criterion in LGPL, allowing this function to be
* expanded directly in non-LGPL code.
*/
-#define _rcu_assign_pointer(p, v) _rcu_set_pointer(&(p), v)
+#define _rcu_assign_pointer(p, v) _rcu_set_pointer(&(p), v)
#ifdef __cplusplus
}