Andriy Gapon
2013-08-30 12:00:30 UTC
It seems that there is a race between l2arc_evict and the l2arc related code in
arc_read, at least in recent FreeBSD.
We got the following crash:
...
#8 0xffffffff8073f996 in trap (frame=0xfffffe104a076f40) at
/usr/src/sys/amd64/amd64/trap.c:463
#9 0xffffffff80729372 in calltrap () at /usr/src/sys/amd64/amd64/exception.S:232
#10 0xffffffff80d95e21 in arc_read (pio=0xfffff801d4baf3b0,
spa=0xfffff8000fddf000, bp=0xfffffe000e5e8240, done=0x181fd9c00,
private=0xfffff80100000000, priority=0, zio_flags=744616296,
arc_flags=0xfffffe104a0770a0,
zb=0xfffff80100000000) at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c:3331
#11 0xffffffff80d9d331 in dbuf_read (db=0xfffff801e6e737e0, zio=0x0, flags=10)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:582
#12 0xffffffff80da3ae3 in dmu_buf_hold (os=<value optimized out>, object=<value
optimized out>, offset=<value optimized out>, tag=<value optimized out>,
dbp=0xfffffe104a0771f8, flags=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:153
#13 0xffffffff80df6bf9 in zap_lockdir (os=0xfffff8001d258400, obj=17, tx=0x0,
lti=RW_READER, fatreader=1, adding=0, zapp=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:467
#14 0xffffffff80df776d in zap_lookup_norm (os=0x0, zapobj=18446744071577351547,
name=0xfffffe104a077490 "mktemp", integer_size=8, num_integers=1,
buf=0xfffffe104a077338, mt=501, realname=0x0, rn_len=0, ncp=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:788
#15 0xffffffff80df771e in zap_lookup (os=0x0, zapobj=18446744071577351547,
name=0x1f5 <Address 0x1f5 out of bounds>, integer_size=1, num_integers=0, buf=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:773
#16 0xffffffff80e144f6 in zfs_dirent_lock (dlpp=0xfffffe104a0773b0,
dzp=0xfffff8001ddaa5c0, name=0xfffffe104a077490 "mktemp",
zpp=0xfffffe104a0773a8, flag=6, direntflags=0x0, realpnp=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c:91
#17 0xffffffff80e1484d in zfs_dirlook (dzp=0xfffff8001ddaa5c0,
name=0xfffffe104a077490 "mktemp", vpp=0xfffffe104a077868, flags=<value optimized
out>, deflg=0x0, rpnp=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c:426
#18 0xffffffff80e28b34 in zfs_lookup () at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:1500
...
The arc_read code in question is this:
if (hdr->b_l2hdr->b_compress ==
ZIO_COMPRESS_EMPTY) {
rzio = zio_null(pio, spa, vd,
l2arc_read_done, cb,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY);
} else {
rzio = zio_read_phys(pio, vd, addr,
hdr->b_l2hdr->b_asize,
buf->b_data, ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
}
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes,
hdr->b_l2hdr->b_asize);
The crash is on the ARCSTAT_INCR line while dereferencing hdr->b_l2hdr.
It's apparent that hdr->b_l2hdr had to be non-NULL for this code path to be
taken. At the moment of the crash it is NULL though:
(kgdb) p *(arc_buf_hdr_t*)0xfffff8001d7cfe58
$6 = {b_dva = {dva_word = {6, 394971608}}, b_birth = 827690, b_cksum0 =
590276882470, b_freeze_lock = {lock_object = {lo_name = 0xffffffff80e84eb6
"buf->b_freeze_lock", lo_flags = 40960000, lo_data = 0, lo_witness = 0x0},
sx_lock = 1}, b_freeze_cksum = 0xfffff8001d0d7e20, b_thawed = 0x0,
b_hash_next = 0x0, b_buf = 0xfffff80e2c61f168, b_flags = 132640, b_datacnt = 1,
b_acb = 0xfffff80f89ba3680, b_cv = {
cv_description = 0xffffffff80e84eab "buf->b_cv", cv_waiters = 0}, b_type =
ARC_BUFC_METADATA, b_size = 8704, b_spa = 2586324942190472777, b_state =
0xffffffff80eaf080, b_arc_node = {list_next = 0x0, list_prev = 0x0},
b_arc_access = 138774732, b_refcnt = {rc_count = 1}, b_l2hdr = 0x0, b_l2node =
{list_next = 0x0, list_prev = 0x0}}
It seems that b_flags include the following bits: ARC_IN_HASH_TABLE,
ARC_IO_IN_PROGRESS, ARC_L2_EVICTED.
As far as I can see it is l2arc_evict that sets ARC_L2_EVICTED and resets
b_l2hdr to NULL. Looks like it does that under protection of hash_lock and
l2arc_buflist_mtx.
On the other hand, it seems that the arc_read code in question runs without
protection of any of these locks (or any other ARC-level lock).
What do you think?
Thank you!
arc_read, at least in recent FreeBSD.
We got the following crash:
...
#8 0xffffffff8073f996 in trap (frame=0xfffffe104a076f40) at
/usr/src/sys/amd64/amd64/trap.c:463
#9 0xffffffff80729372 in calltrap () at /usr/src/sys/amd64/amd64/exception.S:232
#10 0xffffffff80d95e21 in arc_read (pio=0xfffff801d4baf3b0,
spa=0xfffff8000fddf000, bp=0xfffffe000e5e8240, done=0x181fd9c00,
private=0xfffff80100000000, priority=0, zio_flags=744616296,
arc_flags=0xfffffe104a0770a0,
zb=0xfffff80100000000) at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c:3331
#11 0xffffffff80d9d331 in dbuf_read (db=0xfffff801e6e737e0, zio=0x0, flags=10)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c:582
#12 0xffffffff80da3ae3 in dmu_buf_hold (os=<value optimized out>, object=<value
optimized out>, offset=<value optimized out>, tag=<value optimized out>,
dbp=0xfffffe104a0771f8, flags=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c:153
#13 0xffffffff80df6bf9 in zap_lockdir (os=0xfffff8001d258400, obj=17, tx=0x0,
lti=RW_READER, fatreader=1, adding=0, zapp=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:467
#14 0xffffffff80df776d in zap_lookup_norm (os=0x0, zapobj=18446744071577351547,
name=0xfffffe104a077490 "mktemp", integer_size=8, num_integers=1,
buf=0xfffffe104a077338, mt=501, realname=0x0, rn_len=0, ncp=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:788
#15 0xffffffff80df771e in zap_lookup (os=0x0, zapobj=18446744071577351547,
name=0x1f5 <Address 0x1f5 out of bounds>, integer_size=1, num_integers=0, buf=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c:773
#16 0xffffffff80e144f6 in zfs_dirent_lock (dlpp=0xfffffe104a0773b0,
dzp=0xfffff8001ddaa5c0, name=0xfffffe104a077490 "mktemp",
zpp=0xfffffe104a0773a8, flag=6, direntflags=0x0, realpnp=<value optimized out>)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c:91
#17 0xffffffff80e1484d in zfs_dirlook (dzp=0xfffff8001ddaa5c0,
name=0xfffffe104a077490 "mktemp", vpp=0xfffffe104a077868, flags=<value optimized
out>, deflg=0x0, rpnp=0x0)
at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c:426
#18 0xffffffff80e28b34 in zfs_lookup () at
/usr/src/sys/modules/zfs/../../cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c:1500
...
The arc_read code in question is this:
if (hdr->b_l2hdr->b_compress ==
ZIO_COMPRESS_EMPTY) {
rzio = zio_null(pio, spa, vd,
l2arc_read_done, cb,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY);
} else {
rzio = zio_read_phys(pio, vd, addr,
hdr->b_l2hdr->b_asize,
buf->b_data, ZIO_CHECKSUM_OFF,
l2arc_read_done, cb, priority,
zio_flags | ZIO_FLAG_DONT_CACHE |
ZIO_FLAG_CANFAIL |
ZIO_FLAG_DONT_PROPAGATE |
ZIO_FLAG_DONT_RETRY, B_FALSE);
}
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
ARCSTAT_INCR(arcstat_l2_read_bytes,
hdr->b_l2hdr->b_asize);
The crash is on the ARCSTAT_INCR line while dereferencing hdr->b_l2hdr.
It's apparent that hdr->b_l2hdr had to be non-NULL for this code path to be
taken. At the moment of the crash it is NULL though:
(kgdb) p *(arc_buf_hdr_t*)0xfffff8001d7cfe58
$6 = {b_dva = {dva_word = {6, 394971608}}, b_birth = 827690, b_cksum0 =
590276882470, b_freeze_lock = {lock_object = {lo_name = 0xffffffff80e84eb6
"buf->b_freeze_lock", lo_flags = 40960000, lo_data = 0, lo_witness = 0x0},
sx_lock = 1}, b_freeze_cksum = 0xfffff8001d0d7e20, b_thawed = 0x0,
b_hash_next = 0x0, b_buf = 0xfffff80e2c61f168, b_flags = 132640, b_datacnt = 1,
b_acb = 0xfffff80f89ba3680, b_cv = {
cv_description = 0xffffffff80e84eab "buf->b_cv", cv_waiters = 0}, b_type =
ARC_BUFC_METADATA, b_size = 8704, b_spa = 2586324942190472777, b_state =
0xffffffff80eaf080, b_arc_node = {list_next = 0x0, list_prev = 0x0},
b_arc_access = 138774732, b_refcnt = {rc_count = 1}, b_l2hdr = 0x0, b_l2node =
{list_next = 0x0, list_prev = 0x0}}
It seems that b_flags include the following bits: ARC_IN_HASH_TABLE,
ARC_IO_IN_PROGRESS, ARC_L2_EVICTED.
As far as I can see it is l2arc_evict that sets ARC_L2_EVICTED and resets
b_l2hdr to NULL. Looks like it does that under protection of hash_lock and
l2arc_buflist_mtx.
On the other hand, it seems that the arc_read code in question runs without
protection of any of these locks (or any other ARC-level lock).
What do you think?
Thank you!
--
Andriy Gapon
Andriy Gapon