diff mbox series

urcu workqueue thread uses 99% of cpu while workqueue is empty

Message ID 20220621035206.GA267474@localhost.localdomain
State New
Headers show
Series urcu workqueue thread uses 99% of cpu while workqueue is empty | expand

Commit Message

Minlan Wang June 21, 2022, 3:52 a.m. UTC
Hi, Mathieu,
On Fri, Jun 17, 2022 at 09:37:23AM -0400, Mathieu Desnoyers wrote:
> Can you try the following patch on top of v0.12.1 + your debug patch ?
> 
> diff --git a/src/workqueue.c b/src/workqueue.c
> index eed1e21e..19fc89c8 100644
> --- a/src/workqueue.c
> +++ b/src/workqueue.c
> @@ -142,6 +142,7 @@ static void futex_wait(int32_t *futex)
>  
>  		return;
>  	}
> +wait:
>  	while ((ret = futex_async(futex, FUTEX_WAIT, -1, NULL, NULL, 0))) {
>  		err = errno;
>  		fprintf(stderr, "%lu: wq %p: %s failed, errno %d\n",
> @@ -161,7 +162,16 @@ static void futex_wait(int32_t *futex)
>  
>  	fprintf(stderr, "%lu: wq %p: %s wait return %d\n",
>  		pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__, ret);
> -
> +	/*
> +	 * Prior queued wakeups can cause futex wait to return even
> +	 * though the futex value is still -1. If this happens, wait
> +	 * again.
> +	 */
> +	if (uatomic_read(futex) == -1) {
> +		fprintf(stderr, "%lu: wq %p: %s futex == -1, wait again\n",
> +			pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__);
> +		goto wait;
> +	}
>  }
>  
>  static void futex_wake_up(int32_t *futex)
> 
I tried this patch on top of my debug patch, the issue disappeared.
The log shows that "futex_wait futex == -1, wait again" did happen several times.
I put the complete log in the attachment.

> 
> We should not rely too much on the ordering of fprintf stderr output between threads,
> because those can be interleaved in any order (each fprintf to stderr is unbuffered).
> If we want more precision in the order of events I would recommend using LTTng-UST for
> fine-grained user-space tracing. This might explain why the futex has an unexpected value
> at line 2099.
I tried this patch on top of previous 2 patches to use lttng_ust_tracepoint in
workqueue.c, but always failed in compile example code with this error:
---
make[5]: Entering directory `/home/mlwang/debug/userspace-rcu/doc/examples/rculfqueue'
  CC       cds_lfq_enqueue.o
  CCLD     cds_lfq_enqueue
../../../src/.libs//liburcu-cds.so: undefined reference to `lttng_ust_tracepoint_provider_workqueue'
---
the patch is as following:

>From 2fa8082914da55b6e3815dffd2f749798bf23735 Mon Sep 17 00:00:00 2001
From: root <root at localhost.localdomain>
Date: Tue, 21 Jun 2022 11:42:56 +0800
Subject: [PATCH] add lttng_ust_tracepoint in workqueue.c

---
 src/Makefile.am       |  4 +--
 src/liburcu-cds.pc.in |  2 +-
 src/workqueue.c       |  5 ++++
 src/workqueue.h       |  1 +
 src/workqueue_tp.c    |  4 +++
 src/workqueue_tp.h    | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 95 insertions(+), 3 deletions(-)
 create mode 100644 src/workqueue_tp.c
 create mode 100644 src/workqueue_tp.h

Comments

Mathieu Desnoyers June 21, 2022, 1:12 p.m. UTC | #1
----- On Jun 20, 2022, at 11:52 PM, Minlan Wang wangminlan at szsandstone.com wrote:

> Hi, Mathieu,
> On Fri, Jun 17, 2022 at 09:37:23AM -0400, Mathieu Desnoyers wrote:
>> Can you try the following patch on top of v0.12.1 + your debug patch ?
>> 
>> diff --git a/src/workqueue.c b/src/workqueue.c
>> index eed1e21e..19fc89c8 100644
>> --- a/src/workqueue.c
>> +++ b/src/workqueue.c
>> @@ -142,6 +142,7 @@ static void futex_wait(int32_t *futex)
>>  
>>  		return;
>>  	}
>> +wait:
>>  	while ((ret = futex_async(futex, FUTEX_WAIT, -1, NULL, NULL, 0))) {
>>  		err = errno;
>>  		fprintf(stderr, "%lu: wq %p: %s failed, errno %d\n",
>> @@ -161,7 +162,16 @@ static void futex_wait(int32_t *futex)
>>  
>>  	fprintf(stderr, "%lu: wq %p: %s wait return %d\n",
>>  		pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__,
>>  		ret);
>> -
>> +	/*
>> +	 * Prior queued wakeups can cause futex wait to return even
>> +	 * though the futex value is still -1. If this happens, wait
>> +	 * again.
>> +	 */
>> +	if (uatomic_read(futex) == -1) {
>> +		fprintf(stderr, "%lu: wq %p: %s futex == -1, wait again\n",
>> +			pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex),
>> __func__);
>> +		goto wait;
>> +	}
>>  }
>>  
>>  static void futex_wake_up(int32_t *futex)
>> 
> I tried this patch on top of my debug patch, the issue disappeared.
> The log shows that "futex_wait futex == -1, wait again" did happen several
> times.
> I put the complete log in the attachment.

This means we appear to be on the right track. Good!

> 
>> 
>> We should not rely too much on the ordering of fprintf stderr output between
>> threads,
>> because those can be interleaved in any order (each fprintf to stderr is
>> unbuffered).
>> If we want more precision in the order of events I would recommend using
>> LTTng-UST for
>> fine-grained user-space tracing. This might explain why the futex has an
>> unexpected value
>> at line 2099.
> I tried this patch on top of previous 2 patches to use lttng_ust_tracepoint in
> workqueue.c, but always failed in compile example code with this error:
> ---
> make[5]: Entering directory
> `/home/mlwang/debug/userspace-rcu/doc/examples/rculfqueue'
>  CC       cds_lfq_enqueue.o
>  CCLD     cds_lfq_enqueue
> ../../../src/.libs//liburcu-cds.so: undefined reference to
> `lttng_ust_tracepoint_provider_workqueue'
> ---
> the patch is as following:
[...]
> --- a/src/workqueue.h
> +++ b/src/workqueue.h
> @@ -28,6 +28,7 @@
> #include <pthread.h>
> 
> #include <urcu/wfcqueue.h>
> +#include "workqueue_tp.h"
> 
> #ifdef __cplusplus
> extern "C" {
> diff --git a/src/workqueue_tp.c b/src/workqueue_tp.c
> new file mode 100644
> index 0000000..a5f9586
> --- /dev/null
> +++ b/src/workqueue_tp.c
> @@ -0,0 +1,4 @@
> +#define LTTNG_UST_TRACEPOINT_CREATE_PROBES
> +#define LTTNG_UST_TRACEPOINT_DEFINE
> +
> +#include "workqueue_tp.h"
> diff --git a/src/workqueue_tp.h b/src/workqueue_tp.h
> new file mode 100644
> index 0000000..e2bce31
> --- /dev/null
> +++ b/src/workqueue_tp.h
> @@ -0,0 +1,82 @@
> +#undef	LTTNG_UST_TRACEPOINT_PROVIDER
> +#define	LTTNG_UST_TRACEPOINT_PROVIVER workqueue

If you change "PROVIVER" into "PROVIDER" here, does it fix your issue ?

If that's not it, it should be something relatively simple with the build dependencies. Let me know
if it still does not work.

[...]

> --
> 1.8.3.1
> 
> Is "perf probe" accurate in tracing the futex event here?  Maybe i can try that.

I would prefer to use lttng-ust here to keep the footprint of the tracer minimal, so we do not
change the behavior too much by calling into the kernel.

Then we'll be able to correlate traces from lttng-ust and lttng-modules (kernel tracer) and
figure out what is happening both in user-space and kernel space, especially around futex and
scheduler activity within the kernel.

Thanks,

Mathieu

> 
> B.R
> Minlan
Mathieu Desnoyers June 22, 2022, 1:19 p.m. UTC | #2
----- On Jun 22, 2022, at 3:45 AM, Minlan Wang wangminlan at szsandstone.com wrote:

[...]

Hi Minlan,

> --
> 1.8.3.1
> 
> 
> And the lttng session is configured to trace these events:
> kernel: syscall futex

On the kernel side, in addition to the syscall futex, I would really like to see what
happens in the scheduler, mainly the wait/wakeup tracepoints. This can be added by using:

lttng enable-event -k 'sched_*'

This should help us confirm whether we indeed have a situation where queued wake ups
happen to wake up a wait happening only later, which is unexpected by the current liburcu
userspace code.

[...]

> ---
> 
> The babletrace output of this session is pretty big, 6 MB in size, i put it in
> the attachment trace_0622.tar.bz2.
> Let my know if your mailbox can't handle such big attachment.

It would be even better if you can share the binary trace, because then it's easy to
load it in trace compass, cut away time ranges that don't matter, and lots of other
useful stuff.

Thanks,

Mathieu
Mathieu Desnoyers June 22, 2022, 8:28 p.m. UTC | #3
----- On Jun 22, 2022, at 9:19 AM, Mathieu Desnoyers mathieu.desnoyers at efficios.com wrote:

> ----- On Jun 22, 2022, at 3:45 AM, Minlan Wang wangminlan at szsandstone.com wrote:
> 
> [...]
> 
> Hi Minlan,
> 
>> --
>> 1.8.3.1
>> 
>> 
>> And the lttng session is configured to trace these events:
>> kernel: syscall futex
> 
> On the kernel side, in addition to the syscall futex, I would really like to see
> what
> happens in the scheduler, mainly the wait/wakeup tracepoints. This can be added
> by using:
> 
> lttng enable-event -k 'sched_*'
> 
> This should help us confirm whether we indeed have a situation where queued wake
> ups
> happen to wake up a wait happening only later, which is unexpected by the
> current liburcu
> userspace code.
> 
> [...]
> 
>> ---
>> 
>> The babletrace output of this session is pretty big, 6 MB in size, i put it in
>> the attachment trace_0622.tar.bz2.
>> Let my know if your mailbox can't handle such big attachment.
> 
> It would be even better if you can share the binary trace, because then it's
> easy to
> load it in trace compass, cut away time ranges that don't matter, and lots of
> other
> useful stuff.

I just found the relevant snippet of documentation in futex(5):

       FUTEX_WAIT
              Returns 0 if the caller was woken up.  Note that a  wake-up  can
              also  be caused by common futex usage patterns in unrelated code
              that happened to have previously used the  futex  word's  memory
              location  (e.g., typical futex-based implementations of Pthreads
              mutexes can cause this under some conditions).  Therefore, call?
              ers should always conservatively assume that a return value of 0
              can mean a spurious wake-up, and  use  the  futex  word's  value
              (i.e.,  the user-space synchronization scheme) to decide whether
              to continue to block or not.

I'm pretty sure this is what is happening here.

Thanks,

Mathieu


> 
> Thanks,
> 
> Mathieu
> 
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
Mathieu Desnoyers June 22, 2022, 8:52 p.m. UTC | #4
----- On Jun 22, 2022, at 4:28 PM, Mathieu Desnoyers mathieu.desnoyers at efficios.com wrote:

> ----- On Jun 22, 2022, at 9:19 AM, Mathieu Desnoyers
> mathieu.desnoyers at efficios.com wrote:
> 
>> ----- On Jun 22, 2022, at 3:45 AM, Minlan Wang wangminlan at szsandstone.com wrote:
>> 
>> [...]
>> 
>> Hi Minlan,
>> 
>>> --
>>> 1.8.3.1
>>> 
>>> 
>>> And the lttng session is configured to trace these events:
>>> kernel: syscall futex
>> 
>> On the kernel side, in addition to the syscall futex, I would really like to see
>> what
>> happens in the scheduler, mainly the wait/wakeup tracepoints. This can be added
>> by using:
>> 
>> lttng enable-event -k 'sched_*'
>> 
>> This should help us confirm whether we indeed have a situation where queued wake
>> ups
>> happen to wake up a wait happening only later, which is unexpected by the
>> current liburcu
>> userspace code.
>> 
>> [...]
>> 
>>> ---
>>> 
>>> The babletrace output of this session is pretty big, 6 MB in size, i put it in
>>> the attachment trace_0622.tar.bz2.
>>> Let my know if your mailbox can't handle such big attachment.
>> 
>> It would be even better if you can share the binary trace, because then it's
>> easy to
>> load it in trace compass, cut away time ranges that don't matter, and lots of
>> other
>> useful stuff.
> 
> I just found the relevant snippet of documentation in futex(5):
> 
>       FUTEX_WAIT
>              Returns 0 if the caller was woken up.  Note that a  wake-up  can
>              also  be caused by common futex usage patterns in unrelated code
>              that happened to have previously used the  futex  word's  memory
>              location  (e.g., typical futex-based implementations of Pthreads
>              mutexes can cause this under some conditions).  Therefore, call?
>              ers should always conservatively assume that a return value of 0
>              can mean a spurious wake-up, and  use  the  futex  word's  value
>              (i.e.,  the user-space synchronization scheme) to decide whether
>              to continue to block or not.
> 
> I'm pretty sure this is what is happening here.

Here is the series of patches for review on gerrit:

remote:   https://review.lttng.org/c/userspace-rcu/+/8441 Fix: workqueue: futex wait: handle spurious futex wakeups [NEW]        
remote:   https://review.lttng.org/c/userspace-rcu/+/8442 Fix: urcu: futex wait: handle spurious futex wakeups [NEW]        
remote:   https://review.lttng.org/c/userspace-rcu/+/8443 Fix: call_rcu: futex wait: handle spurious futex wakeups [NEW]        
remote:   https://review.lttng.org/c/userspace-rcu/+/8444 Fix: urcu-wait: futex wait: handle spurious futex wakeups [NEW]        
remote:   https://review.lttng.org/c/userspace-rcu/+/8445 Fix: defer_rcu: futex wait: handle spurious futex wakeups [NEW]        
remote:   https://review.lttng.org/c/userspace-rcu/+/8446 Fix: urcu-qsbr: futex wait: handle spurious futex wakeups [NEW] 

Thanks,

Mathieu

> 
> Thanks,
> 
> Mathieu
> 
> 
>> 
>> Thanks,
>> 
>> Mathieu
>> 
>> --
>> Mathieu Desnoyers
>> EfficiOS Inc.
>> http://www.efficios.com
> 
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com
Minlan Wang June 23, 2022, 3:57 a.m. UTC | #5
On Wed, Jun 22, 2022 at 11:45:28PM -0400, Minlan Wang wrote:
> The session output is in attachment: 0623_futex-20220623-112754.tar.bz2
Hi, Mathieu,
There are several workqueues in the process i tracked.
The one used to trigger the issuse is this:
thread with vtid=21054, and futex = 0x5652A1035C40

B.R
Minlan
Minlan Wang June 23, 2022, 8:36 a.m. UTC | #6
Hi, Mathieu,
This is the last one.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0623_futex-20220623-112754.tar.bz2.ad
Type: application/octet-stream
Size: 38903 bytes
Desc: not available
URL: <https://lists.lttng.org/pipermail/lttng-dev/attachments/20220623/1a7bd620/attachment-0001.obj>
Minlan Wang June 23, 2022, 9:08 a.m. UTC | #7
On Wed, Jun 22, 2022 at 04:52:00PM -0400, Mathieu Desnoyers wrote:
[...]
> remote:   https://review.lttng.org/c/userspace-rcu/+/8441 Fix: workqueue: futex wait: handle spurious futex wakeups [NEW]        
I tried this patch in our environment, it worked fine, the issue does not appear anymore.

B.R
Minlan
Mathieu Desnoyers June 23, 2022, 2:09 p.m. UTC | #8
----- On Jun 22, 2022, at 11:57 PM, Minlan Wang wangminlan at szsandstone.com wrote:

> On Wed, Jun 22, 2022 at 11:45:28PM -0400, Minlan Wang wrote:
>> The session output is in attachment: 0623_futex-20220623-112754.tar.bz2
> Hi, Mathieu,
> There are several workqueues in the process i tracked.
> The one used to trigger the issuse is this:
> thread with vtid=21054, and futex = 0x5652A1035C40

By looking at the trace, here is one problematic scenario which end up emitting "workqueue:futex_wait_again":

[23:33:48.060581417] (+0.000006985) localhost.localdomain workqueue:futex_no_wait: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = 0 }
[23:33:48.060581747] (+0.000000330) localhost.localdomain workqueue:futex_wake_up_success: { cpu_id = 6 }, { vpid = 19495, vtid = 20649 }, { futex = 0x5652A1035C40, val = -1 }
[23:33:48.060581926] (+0.000000179) localhost.localdomain workqueue:futex_dec_success: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, old = 0, new = -1 }
[23:33:48.060582826] (+0.000000900) localhost.localdomain syscall_entry_futex: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { uaddr = 94912888659008, op = 1, val = 1, utime = 0, uaddr2 = 0, val3 = 0 }
[23:33:48.060582855] (+0.000000029) localhost.localdomain syscall_entry_futex: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { uaddr = 94912888659008, op = 0, val = 4294967295, utime = 0, uaddr2 = 0, val3 = 0 }
[23:33:48.060584722] (+0.000001867) localhost.localdomain sched_stat_runtime: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { comm = "bcache_writebac", tid = 21054, runtime = 16033, vruntime = 96940983054 }
[23:33:48.060585840] (+0.000001118) localhost.localdomain sched_switch: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { prev_comm = "bcache_writebac", prev_tid = 21054, prev_prio = 20, prev_state = 1, next_comm = "swapper/4", next_tid = 0, next_prio = 20 }
[23:33:48.060587680] (+0.000001840) localhost.localdomain sched_stat_sleep: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { comm = "bcache_writebac", tid = 21054, delay = 2815 }
[23:33:48.060588560] (+0.000000880) localhost.localdomain sched_wakeup: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { comm = "bcache_writebac", tid = 21054, prio = 20, success = 1, target_cpu = 4 }
[23:33:48.060590437] (+0.000001877) localhost.localdomain syscall_exit_futex: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { ret = 0, uaddr = 94912888659008, uaddr2 = 0 }
[23:33:48.060591337] (+0.000000900) localhost.localdomain syscall_exit_futex: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { ret = 1, uaddr = 94912888659008, uaddr2 = 0 }
[23:33:48.060591385] (+0.000000048) localhost.localdomain workqueue:futex_wait_return: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = -1, ret = 0 }
[23:33:48.060592205] (+0.000000820) localhost.localdomain workqueue:futex_wait_again: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = -1 }

Where the wake_up happens right before the dec_success on the waiter, which
leads to the sched_wakeup awakening the waiter when the state is 0.

If we want to dig more into why this scenario can happen, we could also add tracepoints
in urcu_workqueue_queue_work() just before/after cds_wfcq_enqueue(), and in 
workqueue_thread() around each call to cds_wfcq_empty(), and around __cds_wfcq_splice_blocking().

I suspect we end up in a situation where:

* waker:

        cds_wfcq_enqueue(&workqueue->cbs_head, &workqueue->cbs_tail, &work->next); [a]
        uatomic_inc(&workqueue->qlen);
        wake_worker_thread(workqueue);                                             [b]

vs

* waiter:
                splice_ret = __cds_wfcq_splice_blocking(&cbs_tmp_head,             [c]
                        &cbs_tmp_tail, &workqueue->cbs_head, &workqueue->cbs_tail);
[...]
                        if (cds_wfcq_empty(&workqueue->cbs_head,                   [d]
                                        &workqueue->cbs_tail)) {
                                futex_wait(&workqueue->futex);                     [e]
                                uatomic_dec(&workqueue->futex);                    [f]

happen in an order such that 

[a] enqueues an item. Then [c] splices the item out of the queue because it was already awakened
by a prior wakeup. So the queue is observed as empty by [c]. Then [b] sets the futex to 0
(in userspace), which causes [e] to skip waiting on the futex, and [f] brings the value back
to -1. However, in the next loop, the queue is still observed as empty by [d], thus calling
[e] again. This time, the value is -1, so it calls sys_futex FUTEX_WAIT. Unfortunately, we
still have the call to sys_futex FUTEX_WAKE that will eventually be executed, thus awakening
the futex while the futex state is still -1, which is unexpected.

Thanks,

Mathieu
Minlan Wang June 24, 2022, 6:21 a.m. UTC | #9
On Thu, Jun 23, 2022 at 10:09:55AM -0400, Mathieu Desnoyers wrote:
> 
> By looking at the trace, here is one problematic scenario which end up emitting "workqueue:futex_wait_again":
> 
> [23:33:48.060581417] (+0.000006985) localhost.localdomain workqueue:futex_no_wait: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = 0 }
> [23:33:48.060581747] (+0.000000330) localhost.localdomain workqueue:futex_wake_up_success: { cpu_id = 6 }, { vpid = 19495, vtid = 20649 }, { futex = 0x5652A1035C40, val = -1 }
> [23:33:48.060581926] (+0.000000179) localhost.localdomain workqueue:futex_dec_success: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, old = 0, new = -1 }
> [23:33:48.060582826] (+0.000000900) localhost.localdomain syscall_entry_futex: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { uaddr = 94912888659008, op = 1, val = 1, utime = 0, uaddr2 = 0, val3 = 0 }
> [23:33:48.060582855] (+0.000000029) localhost.localdomain syscall_entry_futex: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { uaddr = 94912888659008, op = 0, val = 4294967295, utime = 0, uaddr2 = 0, val3 = 0 }
> [23:33:48.060584722] (+0.000001867) localhost.localdomain sched_stat_runtime: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { comm = "bcache_writebac", tid = 21054, runtime = 16033, vruntime = 96940983054 }
> [23:33:48.060585840] (+0.000001118) localhost.localdomain sched_switch: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { prev_comm = "bcache_writebac", prev_tid = 21054, prev_prio = 20, prev_state = 1, next_comm = "swapper/4", next_tid = 0, next_prio = 20 }
> [23:33:48.060587680] (+0.000001840) localhost.localdomain sched_stat_sleep: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { comm = "bcache_writebac", tid = 21054, delay = 2815 }
> [23:33:48.060588560] (+0.000000880) localhost.localdomain sched_wakeup: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { comm = "bcache_writebac", tid = 21054, prio = 20, success = 1, target_cpu = 4 }
> [23:33:48.060590437] (+0.000001877) localhost.localdomain syscall_exit_futex: { cpu_id = 4 }, { vpid = 19495, vtid = 21054, pid = 19495, tid = 21054 }, { ret = 0, uaddr = 94912888659008, uaddr2 = 0 }
> [23:33:48.060591337] (+0.000000900) localhost.localdomain syscall_exit_futex: { cpu_id = 6 }, { vpid = 19495, vtid = 20649, pid = 19495, tid = 20649 }, { ret = 1, uaddr = 94912888659008, uaddr2 = 0 }
> [23:33:48.060591385] (+0.000000048) localhost.localdomain workqueue:futex_wait_return: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = -1, ret = 0 }
> [23:33:48.060592205] (+0.000000820) localhost.localdomain workqueue:futex_wait_again: { cpu_id = 4 }, { vpid = 19495, vtid = 21054 }, { futex = 0x5652A1035C40, val = -1 }
> 
> Where the wake_up happens right before the dec_success on the waiter, which
> leads to the sched_wakeup awakening the waiter when the state is 0.
Yes, this makes things clear, urcu_workqueue_queue_work did wake up waiters on
futex when the futex is -1.

> 
> If we want to dig more into why this scenario can happen, we could also add tracepoints
> in urcu_workqueue_queue_work() just before/after cds_wfcq_enqueue(), and in 
> workqueue_thread() around each call to cds_wfcq_empty(), and around __cds_wfcq_splice_blocking().
> 
> I suspect we end up in a situation where:
> 
> * waker:
> 
>         cds_wfcq_enqueue(&workqueue->cbs_head, &workqueue->cbs_tail, &work->next); [a]
>         uatomic_inc(&workqueue->qlen);
>         wake_worker_thread(workqueue);                                             [b]
> 
> vs
> 
> * waiter:
>                 splice_ret = __cds_wfcq_splice_blocking(&cbs_tmp_head,             [c]
>                         &cbs_tmp_tail, &workqueue->cbs_head, &workqueue->cbs_tail);
> [...]
>                         if (cds_wfcq_empty(&workqueue->cbs_head,                   [d]
>                                         &workqueue->cbs_tail)) {
>                                 futex_wait(&workqueue->futex);                     [e]
>                                 uatomic_dec(&workqueue->futex);                    [f]
> 
> happen in an order such that 
> 
> [a] enqueues an item. Then [c] splices the item out of the queue because it was already awakened
> by a prior wakeup. So the queue is observed as empty by [c]. Then [b] sets the futex to 0
> (in userspace), which causes [e] to skip waiting on the futex, and [f] brings the value back
> to -1. However, in the next loop, the queue is still observed as empty by [d], thus calling
> [e] again. This time, the value is -1, so it calls sys_futex FUTEX_WAIT. Unfortunately, we
> still have the call to sys_futex FUTEX_WAKE that will eventually be executed, thus awakening
> the futex while the futex state is still -1, which is unexpected.
Yes, i agree with you. We'll add the patch you mentioned into our code to fix this.
Thanks for your help.

Since cds_wfcq_enqueue/__cds_wfcq_splice_blocking do nothing to
workqueue->futex, add debug info to see if workqueue is empty is not so
neccessary to me.
Let me know if you need anything else on this.

Thanks, again.

Minlan
diff mbox series

Patch

diff --git a/src/Makefile.am b/src/Makefile.am
index 88ccc1f..5a18e31 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -2,7 +2,7 @@  AM_CPPFLAGS += -I$(top_srcdir)/include -I$(top_builddir)/include -I$(top_srcdir)
 
 #Add the -version-info directly here since we are only building
 # library that use the version-info
-AM_LDFLAGS=-version-info $(URCU_LIBRARY_VERSION)
+AM_LDFLAGS=-version-info $(URCU_LIBRARY_VERSION) -llttng-ust -ldl
 if USE_CYGWIN
 AM_LDFLAGS+=-no-undefined
 endif
@@ -56,7 +56,7 @@  liburcu_bp_la_SOURCES = urcu-bp.c urcu-pointer.c $(COMPAT)
 liburcu_bp_la_LIBADD = liburcu-common.la
 
 liburcu_cds_la_SOURCES = rculfqueue.c rculfstack.c lfstack.c \
-	workqueue.c workqueue.h $(RCULFHASH) $(COMPAT)
+	workqueue.c workqueue.h workqueue_tp.c workqueue_tp.h $(RCULFHASH) $(COMPAT)
 liburcu_cds_la_LIBADD = liburcu-common.la
 
 pkgconfigdir = $(libdir)/pkgconfig
diff --git a/src/liburcu-cds.pc.in b/src/liburcu-cds.pc.in
index e3d13af..f4e5e3e 100644
--- a/src/liburcu-cds.pc.in
+++ b/src/liburcu-cds.pc.in
@@ -7,5 +7,5 @@  Name: Userspace RCU Concurrent Data Structures
 Description: Data structures leveraging RCU and atomic operations to provide efficient concurrency-aware storage
 Version: @PACKAGE_VERSION@
 Requires:
-Libs: -L${libdir} -lurcu-cds
+Libs: -L${libdir} -lurcu-cds -llttng-ust -ldl
 Cflags: -I${includedir} 
diff --git a/src/workqueue.c b/src/workqueue.c
index 19fc89c..c29c070 100644
--- a/src/workqueue.c
+++ b/src/workqueue.c
@@ -145,6 +145,7 @@  static void futex_wait(int32_t *futex)
 wait:
 	while ((ret = futex_async(futex, FUTEX_WAIT, -1, NULL, NULL, 0))) {
 		err = errno;
+		lttng_ust_tracepoint(workqueue, futex_wait_err, (long)futex, *futex, err);
 		fprintf(stderr, "%lu: wq %p: %s failed, errno %d\n",
 			pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__, err);
 		switch (err) {
@@ -160,6 +161,7 @@  wait:
 		}
 	}
 
+	lttng_ust_tracepoint(workqueue, futex_wait_return, (long)futex, *futex, ret);
 	fprintf(stderr, "%lu: wq %p: %s wait return %d\n",
 		pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__, ret);
 	/*
@@ -168,6 +170,7 @@  wait:
 	 * again.
 	 */
 	if (uatomic_read(futex) == -1) {
+		lttng_ust_tracepoint(workqueue, futex_wait_again, (long)futex, *futex);
 		fprintf(stderr, "%lu: wq %p: %s futex == -1, wait again\n",
 			pthread_self(), caa_container_of(futex, struct urcu_workqueue, futex), __func__);
 		goto wait;
@@ -184,11 +187,13 @@  static void futex_wake_up(int32_t *futex)
 	if (caa_unlikely(old == -1)) {
 		old = uatomic_xchg(futex, 0);
 		if (old == -1) {
+			lttng_ust_tracepoint(workqueue, futex_wake_up_success, (long)futex, old);
 			fprintf(stderr, "%lu: wq %p, wakeup succeed: old %d\n",
 				pthread_self(),
 				caa_container_of(futex, struct urcu_workqueue, futex),
 				old);
 		} else {
+			lttng_ust_tracepoint(workqueue, futex_wake_up_fail, (long)futex, old);
 			fprintf(stderr, "%lu: wq %p, wakeup failed: old %d\n",
 				pthread_self(),
 				caa_container_of(futex, struct urcu_workqueue, futex),
diff --git a/src/workqueue.h b/src/workqueue.h
index 52b6973..63931a4 100644
--- a/src/workqueue.h
+++ b/src/workqueue.h
@@ -28,6 +28,7 @@ 
 #include <pthread.h>
 
 #include <urcu/wfcqueue.h>
+#include "workqueue_tp.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/src/workqueue_tp.c b/src/workqueue_tp.c
new file mode 100644
index 0000000..a5f9586
--- /dev/null
+++ b/src/workqueue_tp.c
@@ -0,0 +1,4 @@ 
+#define LTTNG_UST_TRACEPOINT_CREATE_PROBES
+#define LTTNG_UST_TRACEPOINT_DEFINE
+
+#include "workqueue_tp.h"
diff --git a/src/workqueue_tp.h b/src/workqueue_tp.h
new file mode 100644
index 0000000..e2bce31
--- /dev/null
+++ b/src/workqueue_tp.h
@@ -0,0 +1,82 @@ 
+#undef	LTTNG_UST_TRACEPOINT_PROVIDER
+#define	LTTNG_UST_TRACEPOINT_PROVIVER workqueue
+
+#undef	LTTNG_UST_TRACEPOINT_INCLUDE
+#define	LTTNG_UST_TRACEPOINT_INCLUDE "./workqueue_tp.h"
+
+#if !defined(_WORKQUEUE_TP_H) || defined(LTTNG_USG_TRACEPOINT_HEADER_MULTI_READ)
+#define	_WORKQUEUE_TP_H
+
+#include <lttng/tracepoint.h>
+
+LTTNG_UST_TRACEPOINT_EVENT(
+	workqueue,
+	futex_wait_err,
+	LTTNG_UST_TP_ARGS( 
+		long, futex_p_arg,
+		int, futex_arg,
+		int, err_arg
+	),
+	LTTNG_UST_TP_FIELDS(
+		lttng_ust_field_integer_hex(long, futex_p_field, futex_p_arg)
+		lttng_ust_field_integer(int, futex_field, futex_arg)
+		lttng_ust_field_integer(int, err_field, err_arg)
+	)
+)
+
+LTTNG_UST_TRACEPOINT_EVENT(
+	workqueue,
+	futex_wait_return,
+	LTTNG_UST_TP_ARGS( 
+		long, futex_p_arg,
+		int, futex_arg,
+		int, ret_arg
+	),
+	LTTNG_UST_TP_FIELDS(
+		lttng_ust_field_integer_hex(long, futex_p_field, futex_p_arg)
+		lttng_ust_field_integer(int, futex_field, futex_arg)
+		lttng_ust_field_integer(int, ret_field, ret_arg)
+	)
+)
+
+LTTNG_UST_TRACEPOINT_EVENT(
+	workqueue,
+	futex_wait_again,
+	LTTNG_UST_TP_ARGS( 
+		long, futex_p_arg,
+		int, futex_arg
+	),
+	LTTNG_UST_TP_FIELDS(
+		lttng_ust_field_integer_hex(long, futex_p_field, futex_p_arg)
+		lttng_ust_field_integer(int, futex_field, futex_arg)
+	)
+)
+
+LTTNG_UST_TRACEPOINT_EVENT(
+	workqueue,
+	futex_wake_up_success,
+	LTTNG_UST_TP_ARGS( 
+		long, futex_p_arg,
+		int, futex_arg
+	),
+	LTTNG_UST_TP_FIELDS(
+		lttng_ust_field_integer_hex(long, futex_p_field, futex_p_arg)
+		lttng_ust_field_integer(int, futex_field, futex_arg)
+	)
+)
+
+LTTNG_UST_TRACEPOINT_EVENT(
+	workqueue,
+	futex_wake_up_fail,
+	LTTNG_UST_TP_ARGS( 
+		long, futex_p_arg,
+		int, futex_arg
+	),
+	LTTNG_UST_TP_FIELDS(
+		lttng_ust_field_integer_hex(long, futex_p_field, futex_p_arg)
+		lttng_ust_field_integer(int, futex_field, futex_arg)
+	)
+)
+
+#endif /* _WORKQUEUE_TP_H */
+#include <lttng/tracepoint-event.h>