This analysis references some points from the blog at https://cloud.tencent.com/developer/article/1782057; on top of that I have added my own analysis of the issues encountered in my actual case. Comments and discussion are welcome.

 

Kernel coredump analysis:

Using the crash command "foreach bt > bt.log", dump the stack of every process. Filtering the processes by the keyword jbd2, they fall into the following categories:

Category 1: the kernel journal thread blocked in jbd2_journal_commit_transaction

The journal thread shown below is stuck in the loop that waits for t_updates to drop to zero, which means some user process has started an update (holds an open handle) on the running transaction and has not yet finished it.

crash> bt 544
PID: 544    TASK: ffff95a25c44d230  CPU: 2   COMMAND: "jbd2/vda1-8"
 #0 [ffff95aa59f8fc18] __schedule at ffffffff9d780d4a
 #1 [ffff95aa59f8fca0] schedule at ffffffff9d7811f9
 #2 [ffff95aa59f8fcb0] jbd2_journal_commit_transaction at ffffffffc01e32dc  [jbd2]
 #3 [ffff95aa59f8fe48] kjournald2 at ffffffffc01e9ee9  [jbd2]
 #4 [ffff95aa59f8fec8] kthread at ffffffff9d0c6321
crash>
crash> ps | grep UN | grep 544
    544      2   2  ffff95a25c44d230  UN   0.0       0      0  [jbd2/vda1-8]
crash>
crash>

crash> dis -rl ffffffffc01e32dc | tail
0xffffffffc01e32be <jbd2_journal_commit_transaction+542>:       mov    -0x118(%rbp),%rdi
0xffffffffc01e32c5 <jbd2_journal_commit_transaction+549>:       callq  0xffffffff9d117054 <__raw_callee_save___pv_queued_spin_unlock>
0xffffffffc01e32ca <jbd2_journal_commit_transaction+554>:       xchg   %ax,%ax
/usr/src/debug/kernel-3.10.0-1062.18.1.el7/linux-3.10.0-1062.18.1.el7.x86_64/arch/x86/include/asm/spinlock.h: 274
0xffffffffc01e32cc <jbd2_journal_commit_transaction+556>:       mov    -0x130(%rbp),%rax
0xffffffffc01e32d3 <jbd2_journal_commit_transaction+563>:       lock incl 0x4(%rax)
/usr/src/debug/kernel-3.10.0-1062.18.1.el7/linux-3.10.0-1062.18.1.el7.x86_64/fs/jbd2/commit.c: 462
0xffffffffc01e32d7 <jbd2_journal_commit_transaction+567>:       callq  0xffffffff9d7811d0 <schedule>
/usr/src/debug/kernel-3.10.0-1062.18.1.el7/linux-3.10.0-1062.18.1.el7.x86_64/fs/jbd2/commit.c: 463
0xffffffffc01e32dc <jbd2_journal_commit_transaction+572>:       mov    -0x130(%rbp),%rdi
crash>

void jbd2_journal_commit_transaction(journal_t *journal)
{
.....
 372         struct transaction_stats_s stats;
 373         transaction_t *commit_transaction;
....
 432         commit_transaction = journal->j_running_transaction;
.....
 454         while (atomic_read(&commit_transaction->t_updates)) {//before committing, the kernel journal thread must wait for t_updates to reach 0;
                                                                   //jbd2/vda1-8 is blocked in this loop, i.e. some process still holds an open handle on the transaction
 455                 DEFINE_WAIT(wait);
 456
 457                 prepare_to_wait(&journal->j_wait_updates, &wait,
 458                                         TASK_UNINTERRUPTIBLE);//join the j_wait_updates wait queue and switch to the UNINTERRUPTIBLE (UN) state
 459                 if (atomic_read(&commit_transaction->t_updates)) {
 460                         spin_unlock(&commit_transaction->t_handle_lock);
 461                         write_unlock(&journal->j_state_lock);
 462                         schedule();//the disassembly shows that PID 544 gives up the CPU here
 463                         write_lock(&journal->j_state_lock); 
 464                         spin_lock(&commit_transaction->t_handle_lock);
 465                 }
 466                 finish_wait(&journal->j_wait_updates, &wait);
 467         }
             ......
 540         commit_transaction->t_state = T_FLUSH;
             ......
 545         wake_up(&journal->j_wait_transaction_locked);//wake up processes waiting on j_wait_transaction_locked
             ......  
 1137         commit_transaction->t_state = T_FINISHED;
 1146         wake_up(&journal->j_wait_done_commit);//wake up processes waiting on j_wait_done_commit
             ......
 }
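
To make the wait/wake protocol above concrete, here is a minimal userspace analogue of the t_updates / j_wait_updates pairing, written with pthreads. It is only an illustrative sketch: the function and variable names are mine, not kernel code. The commit thread plays the role of kjournald2 and cannot make progress until every started "handle" has been stopped, which is exactly the state jbd2/vda1-8 is stuck in.

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Analogue of transaction->t_updates plus journal->j_wait_updates. */
static int t_updates;                  /* number of handles still open */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  j_wait_updates = PTHREAD_COND_INITIALIZER;

static void journal_start(void)        /* like start_this_handle(): t_updates++ */
{
        pthread_mutex_lock(&lock);
        t_updates++;
        pthread_mutex_unlock(&lock);
}

static void journal_stop(void)         /* like jbd2_journal_stop(): t_updates-- + wake_up() */
{
        pthread_mutex_lock(&lock);
        if (--t_updates == 0)
                pthread_cond_broadcast(&j_wait_updates);
        pthread_mutex_unlock(&lock);
}

static void *commit_thread(void *arg)  /* like jbd2_journal_commit_transaction() */
{
        pthread_mutex_lock(&lock);
        while (t_updates)              /* the loop PID 544 is sleeping in */
                pthread_cond_wait(&j_wait_updates, &lock);
        pthread_mutex_unlock(&lock);
        printf("commit can proceed\n");
        return NULL;
}

int main(void)
{
        pthread_t commit;

        journal_start();               /* a writer opens a handle */
        pthread_create(&commit, NULL, commit_thread, NULL);
        sleep(1);                      /* commit thread is now blocked, like jbd2/vda1-8 */
        journal_stop();                /* if this call never happens, the commit thread sleeps forever */
        pthread_join(commit, NULL);
        return 0;
}

If journal_stop() is never called, the commit thread waits on the condition variable forever; that is the same shape of hang seen in this dump.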

Category 2: processes waiting for the transaction commit to complete

These processes are all waiting for the transaction commit to finish. jbd2_log_wait_commit() sleeps on the j_wait_done_commit queue; only when the category-1 journal thread finishes jbd2_journal_commit_transaction() does it call wake_up(&journal->j_wait_done_commit) to wake them. Threads blocked with this stack are therefore waiting on the category-1 journal thread.

PID: 32549  TASK: ffff95a1ae20a0e0  CPU: 12  COMMAND: "minio"
 #0 [ffff95a21c393bb8] __schedule at ffffffff9d780d4a
 #1 [ffff95a21c393c40] schedule at ffffffff9d7811f9
 #2 [ffff95a21c393c50] jbd2_log_wait_commit at ffffffffc01e9825  [jbd2]
 #3 [ffff95a21c393cc8] jbd2_complete_transaction at ffffffffc01eaeb2  [jbd2]
 #4 [ffff95a21c393ce8] ext4_sync_file at ffffffffc01ff7b2  [ext4]
 #5 [ffff95a21c393d30] generic_write_sync at ffffffff9d27fc8f
 #6 [ffff95a21c393d40] generic_file_aio_write at ffffffff9d1c0007
 #7 [ffff95a21c393d80] ext4_file_write at ffffffffc01ff352  [ext4]
 #8 [ffff95a21c393df0] do_sync_write at ffffffff9d249fa3
 #9 [ffff95a21c393ec8] vfs_write at ffffffff9d24aa90
#10 [ffff95a21c393f08] sys_write at ffffffff9d24b8af
#11 [ffff95a21c393f50] system_call_fastpath at ffffffff9d78dede
    RIP: 000000000047ec50  RSP: 000000c00fef1b78  RFLAGS: 00000246
    RAX: 0000000000000001  RBX: 000000c000066a00  RCX: 000000000000006d
    RDX: 0000000000000251  RSI: 000000c07f25e000  RDI: 0000000000000043
    RBP: 000000c0075b0428   R8: 0000000000000000   R9: 0000000000000000
    R10: 0000000000000000  R11: 0000000000000212  R12: 00000000000000f2
    R13: 0000000000000000  R14: 0000000001f6cace  R15: 0000000000000000
    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b


/*
 * Wait for a specified commit to complete.
 * The caller may not hold the journal lock.
 */
int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
{
     ......
        while (tid_gt(tid, journal->j_commit_sequence)) {
                ......
                wake_up(&journal->j_wait_commit);
                wait_event(journal->j_wait_done_commit,
                                !tid_gt(tid, journal->j_commit_sequence));//block on the j_wait_done_commit wait queue
                .....
        }
     .....   
        return err;
}
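
The loop condition uses tid_gt(), a wraparound-safe comparison of 32-bit transaction IDs (the commit sequence keeps increasing and may eventually wrap). Below is a small standalone illustration of the signed-difference idiom with simplified types — a sketch, not the kernel's source verbatim:

#include <stdio.h>

typedef unsigned int tid_t;

/* Wraparound-safe "x is newer than y": cast the difference to signed. */
static int tid_gt(tid_t x, tid_t y)
{
        int difference = (int)(x - y);
        return difference > 0;
}

int main(void)
{
        /* Normal case: tid 10 is newer than committed sequence 7. */
        printf("%d\n", tid_gt(10, 7));           /* 1 -> keep waiting */
        /* Wraparound case: 5 was issued after 0xfffffffb. */
        printf("%d\n", tid_gt(5, 0xfffffffbU));  /* 1 -> keep waiting */
        /* Commit sequence has caught up: stop waiting. */
        printf("%d\n", tid_gt(7, 7));            /* 0 -> wait is over */
        return 0;
}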

Category 3: user processes waiting for the committing transaction

These processes are stuck in wait_transaction_locked(); they are passively blocked until the category-1 journal thread calls wake_up(&journal->j_wait_transaction_locked) to wake them.

PID: 639    TASK: ffff95a25c890000  CPU: 1   COMMAND: "systemd-journal"
 #0 [ffff95a25df2f8f8] __schedule at ffffffff9d780d4a
 #1 [ffff95a25df2f988] schedule at ffffffff9d7811f9
 #2 [ffff95a25df2f998] wait_transaction_locked at ffffffffc01e0085  [jbd2]
 #3 [ffff95a25df2f9f0] add_transaction_credits at ffffffffc01e0368  [jbd2]
 #4 [ffff95a25df2fa50] start_this_handle at ffffffffc01e05e1  [jbd2]
 #5 [ffff95a25df2fae8] jbd2__journal_start at ffffffffc01e0a93  [jbd2]
 #6 [ffff95a25df2fb30] __ext4_journal_start_sb at ffffffffc0238139  [ext4]
 #7 [ffff95a25df2fb70] ext4_dirty_inode at ffffffffc020bcba  [ext4]
 #8 [ffff95a25df2fb90] __mark_inode_dirty at ffffffff9d27b44d
 #9 [ffff95a25df2fbc8] update_time at ffffffff9d267f99
#10 [ffff95a25df2fbf8] file_update_time at ffffffff9d268080
#11 [ffff95a25df2fc38] ext4_page_mkwrite at ffffffffc020bf0d  [ext4]
#12 [ffff95a25df2fc90] do_page_mkwrite at ffffffff9d1ebeba
#13 [ffff95a25df2fd10] do_wp_page at ffffffff9d1ef977
#14 [ffff95a25df2fdb8] handle_mm_fault at ffffffff9d1f3cd2
#15 [ffff95a25df2fe80] __do_page_fault at ffffffff9d788653
#16 [ffff95a25df2fef0] trace_do_page_fault at ffffffff9d788a26
#17 [ffff95a25df2ff30] do_async_page_fault at ffffffff9d787fa2
#18 [ffff95a25df2ff50] async_page_fault at ffffffff9d7847a8
    RIP: 0000564db2859c0e  RSP: 00007ffeea162cc0  RFLAGS: 00010202
    RAX: 00007f59ef4202f0  RBX: 0000000000000059  RCX: 0000000001111349
    RDX: 0000000000000000  RSI: 0000000000000001  RDI: 0000564db4911c10
    RBP: 0000000000000001   R8: 0000000000000059   R9: 0000564db4919580
    R10: 0000000000000000  R11: 0000000000000000  R12: 00000000011112f0
    R13: 0000564db49193c0  R14: 00007ffeea162df0  R15: 000005bad6c6dcce


    
crash> dis -rl ffffffffc01e0368
......    
/usr/src/debug/kernel-3.10.0-1062.18.1.el7/linux-3.10.0-1062.18.1.el7.x86_64/fs/jbd2/transaction.c: 190
0xffffffffc01e0360 <add_transaction_credits+608>:       mov    %rbx,%rdi
0xffffffffc01e0363 <add_transaction_credits+611>:       callq  0xffffffffc01e0000 <wait_transaction_locked>
/usr/src/debug/kernel-3.10.0-1062.18.1.el7/linux-3.10.0-1062.18.1.el7.x86_64/fs/jbd2/transaction.c: 191
0xffffffffc01e0368 <add_transaction_credits+616>:       mov    $0x1,%eax
    
    
/*
 * Wait until we can add credits for handle to the running transaction.  Called
 * with j_state_lock held for reading. Returns 0 if handle joined the running
 * transaction. Returns 1 if we had to wait, j_state_lock is dropped, and
 * caller must retry.
 */
static int add_transaction_credits(journal_t *journal, int blocks,
                                   int rsv_blocks)
{
        transaction_t *t = journal->j_running_transaction;
        int needed;
        int total = blocks + rsv_blocks;

        /*
         * If the current transaction is locked down for commit, wait
         * for the lock to be released.
         */
        if (t->t_state == T_LOCKED) {
                wait_transaction_locked(journal);//block on the journal->j_wait_transaction_locked wait queue
                return 1;
        }
        ......
}

So the stuck category-1 journal thread jbd2/vda1-8 is what blocks all the other processes, and at this point no file operation on the whole ext4 filesystem can make progress.

The following focuses on this thread's stack and the related code.

Stack analysis of the jbd2/vda1-8 thread:

By the time the system panicked, jbd2/vda1-8 had been in the D (uninterruptible) state for 182 seconds straight:

crash> ps -l |grep -w 544
[6300064763554793] [UN]  PID: 544    TASK: ffff95a25c44d230  CPU: 2   COMMAND: "jbd2/vda1-8"
crash>
crash> log | grep -w "Kernel panic"
[6300246.253028] Kernel panic - not syncing: hung_task: blocked tasks
crash>
crash>
crash> pd (6300246-6300064)
$1 = 182
crash>

The journal_t address can be found from the stack and the disassembly:

crash> bt 544
PID: 544    TASK: ffff95a25c44d230  CPU: 2   COMMAND: "jbd2/vda1-8"
 #0 [ffff95aa59f8fc18] __schedule at ffffffff9d780d4a
 #1 [ffff95aa59f8fca0] schedule at ffffffff9d7811f9
 #2 [ffff95aa59f8fcb0] jbd2_journal_commit_transaction at ffffffffc01e32dc  [jbd2]
 #3 [ffff95aa59f8fe48] kjournald2 at ffffffffc01e9ee9  [jbd2]
 #4 [ffff95aa59f8fec8] kthread at ffffffff9d0c6321
crash>
crash> journal_t.j_devname ffff95a25e74e800
  j_devname = "vda1-8\000"
crash>
Using the crash command "search -t" to search for the journal_t address (it scans the kernel stacks of all tasks) quickly shows which processes reference the same journal_t.

Walk the journal_t.j_wait_updates wait queue to see which processes are blocked on it:

crash> journal_t.j_wait_updates ffff95a25e74e800 -x
  j_wait_updates = {
    ....
    task_list = {
      next = 0xffff95aa59f8fda0,
      prev = 0xffff95aa59f8fda0
    }
  }
crash>

crash> list 0xffff95aa59f8fda0 -l wait_queue_t.task_list -s wait_queue_t.private
ffff95aa59f8fda0
  private = 0xffff95a25c44d230 //task_struct address: this is the jbd2/vda1-8 task
ffff95a25e74e8c8
  private = 0xffff95a25e74e8b0 //this entry is the wait-queue head embedded in journal_t itself (the circular list walk includes the head), not a real waiter
crash>

Fetch the journal_t's running transaction, 0xffff959aedbbb800. Its t_updates is 1, which is exactly the condition under which jbd2_journal_commit_transaction() blocks and waits:

crash> journal_t.j_running_transaction ffff95a25e74e800 -x
  j_running_transaction = 0xffff959aedbbb800
  
  
crash> journal_t.j_running_transaction ffff95a25e74e800 -xo
typedef struct journal_s {
  [ffff95a25e74e860] transaction_t *j_running_transaction;
} journal_t;
crash>


crash> transaction_t.t_journal,t_state,t_updates 0xffff959aedbbb800
  t_journal = 0xffff95a25e74e800
  t_state = T_LOCKED
  t_updates = {
    counter = 1 
  }
crash>

Before a transaction is committed, a user process first requests a new handle via jbd2__journal_start() and then calls start_this_handle() to initialize it. The handle is stored in the current process's task_struct.journal_info, and the transaction's t_updates is incremented by 1.

1107 static inline handle_t *journal_current_handle(void)
1108 {
1109         return current->journal_info; 
1110 }
 
 432 handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks,
 433                               gfp_t gfp_mask, unsigned int type,
 434                               unsigned int line_no)
 435 {
 436         handle_t *handle = journal_current_handle();//get the handle from the process's task_struct.journal_info
 439         if (!journal)
 440                 return ERR_PTR(-EROFS);
 441
 442         if (handle) {//if task_struct.journal_info is not NULL, return here
 443                 J_ASSERT(handle->h_transaction->t_journal == journal);
 444                 handle->h_ref++;
 445                 return handle;
 446         }
 447
 448         handle = new_handle(nblocks);//allocate a new handle
             ......
 464         err = start_this_handle(journal, handle, gfp_mask);
             ...... 
 476         return handle;
 477 }

In start_this_handle(), the handle records a pointer to the running transaction, and the transaction's t_updates is incremented.

271 static int start_this_handle(journal_t *journal, handle_t *handle,
272                              gfp_t gfp_mask)
273 {
274    transaction_t   *transaction, *new_transaction = NULL; 
        ......     
359    transaction = journal->j_running_transaction;//pick up the running transaction
       .....
375    /* OK, account for the buffers that this operation expects to
376    * use and add the handle to the running transaction.
377    */
378    update_t_max_wait(transaction, ts);
379    handle->h_transaction = transaction;//running Transaction
380    handle->h_requested_credits = blocks;
381    handle->h_start_jiffies = jiffies;
382    atomic_inc(&transaction->t_updates);//increment t_updates
383    atomic_inc(&transaction->t_handle_count);
384    jbd_debug(4, "Handle %p given %d credits (total %d, free %lu)\n",
385                   handle, blocks,
386                   atomic_read(&transaction->t_outstanding_credits),
387                   jbd2_log_space_left(journal));
388    read_unlock(&journal->j_state_lock);
389    current->journal_info = handle;//store the handle in the process's task_struct.journal_info
390
391    lock_map_acquire(&handle->h_lockdep_map);
392    jbd2_journal_free_transaction(new_transaction);
393    return 0;
394 }

How t_updates is decremented and how waiters on journal->j_wait_updates get woken:

A global search of the code shows that only two places call wake_up(&journal->j_wait_updates) to wake processes blocked on the j_wait_updates queue:

# grep -w "j_wait_updates" ./ -nr | grep wake_up
./fs/jbd2/transaction.c:670:            wake_up(&journal->j_wait_updates);
./fs/jbd2/transaction.c:1684:           wake_up(&journal->j_wait_updates);

In the normal case, a transaction's t_updates is decremented in jbd2_journal_stop(), which also wakes up the processes waiting on journal->j_wait_updates:

int jbd2_journal_stop(handle_t *handle)
{
        transaction_t *transaction = handle->h_transaction;
        journal_t *journal = transaction->t_journal;
        ....
        if (atomic_dec_and_test(&transaction->t_updates)) {
                wake_up(&journal->j_wait_updates);
                if (journal->j_barrier_count)
                        wake_up(&journal->j_wait_transaction_locked);
        }
        ....
        current->journal_info = NULL;//clear the handle stored in task_struct.journal_info
        ....
}

So in this case, why was jbd2/vda1-8 never woken up, and why did t_updates never drop to 0?

t_updates being non-zero means jbd2_journal_stop() has not been executed yet, so the handle must still be held by some process, stored in that process's task_struct.journal_info.

Since handle->h_transaction holds the address of the running transaction (i.e. journal_t.j_running_transaction), we can dump task_struct.journal_info for every process and keep the non-zero values; these are all the handles currently in use. Comparing each handle's h_transaction against the j_running_transaction of the journal we are waiting on then tells us which process has not yet called jbd2_journal_stop() to decrement t_updates and wake the waiters on journal->j_wait_updates.

Dump the task_struct information of all processes:

crash> foreach task > task.txt

Filter out the non-zero journal_info addresses, i.e. the handle_t addresses:

# cat task.txt  | grep journal_info | grep -v 0x0 | awk -F',' '{print $(NF-1)}' | awk '{print "handle_t.h_transaction "$3}' > handle_t.h_transaction.txt

Read h_transaction from every handle_t:

crash> <./handle_t.h_transaction.txt
crash> handle_t.h_transaction 0xffff95aa5a8a47b0
    h_transaction = 0xffff959acafd3100
crash> handle_t.h_transaction 0xffff95aa5aa75900
    h_transaction = 0xffff959acafd3100
crash> handle_t.h_transaction 0xffff95a25932b120
    h_transaction = 0xffff959aedbbb800
crash>

The handle at 0xffff95a25932b120 has h_transaction = 0xffff959aedbbb800, the same j_running_transaction found earlier from the stack of PID 544 (jbd2/vda1-8):

crash> journal_t.j_running_transaction ffff95a25e74e800 -x
  j_running_transaction = 0xffff959aedbbb800

So the process currently holding handle_t 0xffff95a25932b120 is the one we are looking for. Searching task.txt (the task_struct dump exported earlier) for this journal_info value gives the associated process: PID 11233.

At this point we know why jbd2/vda1-8 is stuck in jbd2_journal_commit_transaction: it is waiting for PID 11233 to come back and call jbd2_journal_stop(), which would decrement t_updates and wake jbd2/vda1-8 from the journal->j_wait_updates wait queue.

From its stack, at the time of the panic PID 11233 was trying to allocate memory and was in the RUNNING state. So why does that lead to the hang?

crash> ps | grep 11233
> 11233  10869   1  ffff959a4b470000  RU   7.7 11849084 5220708  VM Periodic Tas
crash>
crash> bt 11233
PID: 11233  TASK: ffff959a4b470000  CPU: 1   COMMAND: "VM Periodic Tas"
 #0 [ffff95a25fc48e48] crash_nmi_callback at ffffffff9d058017
 #1 [ffff95a25fc48e58] nmi_handle at ffffffff9d78593c
 #2 [ffff95a25fc48eb0] do_nmi at ffffffff9d785b5d
 #3 [ffff95a25fc48ef0] end_repeat_nmi at ffffffff9d784d9c
    [exception RIP: mem_cgroup_zone_lruvec+86]
    RIP: ffffffff9d23be66  RSP: ffff959b426478b0  RFLAGS: 00000246
    RAX: ffff959ab9b70410  RBX: ffff959b426479a0  RCX: 0000000000000400
    RDX: 0000000000000001  RSI: ffff959ae2104000  RDI: ffff95aa7ffda000
    RBP: ffff959b426478b0   R8: 0000000000000000   R9: 0000000000000000
    R10: ffff95aa7ffda000  R11: 0000000000000006  R12: 0000000000000000
    R13: ffff95aa7ffda000  R14: 0000000000000001  R15: ffff959ae2104000
    ORIG_RAX: ffffffffffffffff  CS: 0010  SS: 0018
--- <NMI exception stack> ---
 #4 [ffff959b426478b0] mem_cgroup_zone_lruvec at ffffffff9d23be66
 #5 [ffff959b426478b8] shrink_zone at ffffffff9d1d3c6b
 #6 [ffff959b42647910] do_try_to_free_pages at ffffffff9d1d4160
 #7 [ffff959b42647988] try_to_free_pages at ffffffff9d1d468c
 #8 [ffff959b42647a20] free_more_memory at ffffffff9d281ede
 #9 [ffff959b42647a58] __getblk at ffffffff9d28324b
#10 [ffff959b42647ab8] __ext4_get_inode_loc at ffffffffc0203dc3  [ext4]
#11 [ffff959b42647b20] ext4_get_inode_loc at ffffffffc02065fd  [ext4]
#12 [ffff959b42647b30] ext4_reserve_inode_write at ffffffffc02083a6  [ext4]
#13 [ffff959b42647b60] ext4_mark_inode_dirty at ffffffffc0208473  [ext4]
#14 [ffff959b42647bb8] ext4_dirty_inode at ffffffffc020bcd0  [ext4]
#15 [ffff959b42647bd8] __mark_inode_dirty at ffffffff9d27b44d
#16 [ffff959b42647c10] update_time at ffffffff9d267f99
#17 [ffff959b42647c40] file_update_time at ffffffff9d268080
#18 [ffff959b42647c80] ext4_page_mkwrite at ffffffffc020bf0d  [ext4]
#19 [ffff959b42647cd8] do_page_mkwrite at ffffffff9d1ebeba
#20 [ffff959b42647d58] do_shared_fault at ffffffff9d1ec556
#21 [ffff959b42647db8] handle_mm_fault at ffffffff9d1f3603
#22 [ffff959b42647e80] __do_page_fault at ffffffff9d788653
#23 [ffff959b42647ef0] trace_do_page_fault at ffffffff9d788a26
#24 [ffff959b42647f30] do_async_page_fault at ffffffff9d787fa2
#25 [ffff959b42647f50] async_page_fault at ffffffff9d7847a8
    RIP: 00007fb7fc0b060e  RSP: 00007fb7d48f8bf0  RFLAGS: 00010202
    RAX: 000000001238e540  RBX: 00007fb7faec4a10  RCX: 0000000000000032
    RDX: 00007fb7f40258a0  RSI: 0000000000000000  RDI: 00007fb7f4024960
    RBP: 00007fb7d48f8c00   R8: 0000000000000003   R9: 000000000060219e
    R10: 000000000000a382  R11: 00144c72b94d61b0  R12: 0000000000000008
    R13: 00007fb7f4283cf0  R14: 0000000000000032  R15: 0000000000000032
    ORIG_RAX: ffffffffffffffff  CS: 0033  SS: 002b
crash>

If __getblk() cannot get memory, it repeatedly calls free_more_memory() to release some memory and then retries. If it never succeeds, it loops forever inside __getblk_slow():

1347 struct buffer_head *
1348 __getblk(struct block_device *bdev, sector_t block, unsigned size)
1349 {
1350         struct buffer_head *bh = __find_get_block(bdev, block, size);
1351
1352         might_sleep();
1353         if (bh == NULL)
1354                 bh = __getblk_slow(bdev, block, size);
1355         return bh;
1356 }

1047 static struct buffer_head *
1048 __getblk_slow(struct block_device *bdev, sector_t block, int size)
1049 {
1050         /* Size must be multiple of hard sectorsize */
1051         if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
1052                         (size < 512 || size > PAGE_SIZE))) {
1053                 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1054                                         size);
1055                 printk(KERN_ERR "logical block size: %d\n",
1056                                         bdev_logical_block_size(bdev));
1057
1058                 dump_stack();
1059                 return NULL;
1060         }
1061
1062         for (;;) {//if memory can never be allocated, the process loops here forever
1063                 struct buffer_head *bh;
1064                 int ret;
1065
1066                 bh = __find_get_block(bdev, block, size);
1067                 if (bh)
1068                         return bh;
1069
1070                 ret = grow_buffers(bdev, block, size);
1071                 if (ret < 0)
1072                         return NULL;
1073                 if (ret == 0)
1074                         free_more_memory();
1075         }
1076 }

Because the process is stuck in __getblk_slow(), it can never return to ext4_dirty_inode() to call ext4_journal_stop(), which would decrement t_updates and wake jbd2/vda1-8 from the journal->j_wait_updates wait queue:

5362 void ext4_dirty_inode(struct inode *inode, int flags)
5363 {
5364         handle_t *handle;
5365
5366         handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
5367         if (IS_ERR(handle))
5368                 goto out;
5369
5370         ext4_mark_inode_dirty(handle, inode);
5371
5372         ext4_journal_stop(handle);
5373 out:
5374         return;
5375 }

So why can't __getblk() allocate memory?

The call stack shows __getblk() calling free_more_memory() to try to reclaim cache; let's look at why no usable memory can be freed.

Find the container memory cgroup path of PID 11233 (TASK: 0xffff959a4b470000):

crash> p ((struct task_struct*)0xffff959a4b470000)->cgroups.subsys[3].cgroup.dentry
$11 = (struct dentry *) 0xffff959a95097500
crash>
crash> files -d 0xffff959a95097500
     DENTRY           INODE           SUPERBLK     TYPE PATH
ffff959a95097500 ffff959af4f26500 ffff95aa7fc2d000 DIR  /sys/fs/cgroup/memory/kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/23936a5916df4d8ca6a52936e5d68adf68223cbb8499a4d89aac19a5dc079e7f
crash>

Check the container memory cgroup's usage and memory limit: only 5242880KB - 5236564KB = 6316KB is left:

crash>  p ((struct task_struct*)0xffff959a4b470000)->cgroups.subsys[3]
$9 = (struct cgroup_subsys_state *) 0xffff959adc811400    
    
crash> struct mem_cgroup.memory 0xffff959adc811400
  memory = {
    count = {
      counter = 1309141//5236564KB
    },
    limit = 1310720,//5242880KB
    parent = 0xffff959b40121c48,
    watermark = 1310606,
    failcnt = 0
  }
crash>      

Find the parent of the container's memory cgroup, i.e. the memory cgroup of the pod:

crash> struct mem_cgroup.memory -xo
struct mem_cgroup {
   [0x48] struct page_counter memory;
}
crash>

crash> px (0xffff959b40121c48-0x48)
$12 = 0xffff959b40121c00
crash>      

Memory left under the pod: 5242880KB - 5237400KB = 5480KB:

crash> struct mem_cgroup.css,memory 0xffff959b40121c00
  css = {
    cgroup = 0xffff959a432da800,
    ...
  }
  memory = {
    count = {
      counter = 1309350//in pages, i.e. 5237400KB
    },
    limit = 1310720,
    parent = 0xffff95aa55954848,
    watermark = 1310720,//in pages, i.e. 5242880KB
    failcnt = 39865632
  }
crash>

crash> struct cgroup.dentry 0xffff959a432da800
  dentry = 0xffff959a4a2e1b00
crash> files -d 0xffff959a4a2e1b00
     DENTRY           INODE           SUPERBLK     TYPE PATH
ffff959a4a2e1b00 ffff959add7e6bf0 ffff95aa7fc2d000 DIR  /sys/fs/cgroup/memory/kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74
crash>

As shown above, at the time of the problem the pod containing PID 11233 had a memory limit of 5242880KB, of which 5237400KB was already used, leaving only 5480KB. Let's see what kind of memory that 5237400KB is.

The following crash command lists all processes in the same cgroup as task 0xffff959a4b470000:

list task_struct.cg_list -s task_struct.pid,comm -h 0xffff959a4b470000

List the mm_struct of every process in the pod of PID 11233 (TASK: 0xffff959a4b470000):

crash>list task_struct.cg_list -s task_struct.pid -h 0xffff959a4b470000 | grep -v pid | awk '{print "task_struct.mm,comm "$1}' >cg_list_mm_comm.txt

crash> <./cg_list_mm_comm.txt | grep 0xffff959ad8a60c80
  mm = 0xffff959ad8a60c80
  mm = 0xffff959ad8a60c80
  mm = 0xffff959ad8a60c80
  mm = 0xffff959ad8a60c80
 ........

All processes in the cgroup point to the same mm_struct address:

 crash> <./cg_list_mm_comm.txt | grep 0xffff959ad8a60c80 | sort | uniq
  mm = 0xffff959ad8a60c80

The memory in use is almost entirely anonymous pages; swap is not enabled and there is nothing reclaimable, so the only way to free memory is an OOM kill.

crash> mm_struct.rss_stat 0xffff959ad8a60c80
  rss_stat = {
    count = {{
        counter = 5
      }, {
        counter = 1304200 //MM_ANONPAGES, in pages, i.e. 5216800KB
      }, {
        counter = 0
      }}
  }
  
366 enum {
367         MM_FILEPAGES,   /* Resident file mapping pages */
368         MM_ANONPAGES,   /* Resident anonymous pages */
369         MM_SWAPENTS,    /* Anonymous swap entries */
370         NR_MM_COUNTERS,
371         /*
372          * Resident shared memory pages
373          *
374          * We can't expand *_rss_stat without breaking kABI
375          * MM_SHMEMPAGES need to be set apart
376          */
377         MM_SHMEMPAGES = NR_MM_COUNTERS
378 };
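
As a quick sanity check of the page-counter arithmetic above (counters are in 4KB pages; the constants below are copied from the crash output, the rest is just a standalone calculation):

#include <stdio.h>

int main(void)
{
        const unsigned long page_kb = 4;        /* 4KB pages on x86_64            */
        const unsigned long limit   = 1310720;  /* pod memory limit (pages)        */
        const unsigned long usage   = 1309350;  /* pod memory usage (pages)        */
        const unsigned long anon    = 1304200;  /* mm_struct MM_ANONPAGES (pages)  */

        printf("limit = %lu KB\n", limit * page_kb);            /* 5242880 KB */
        printf("usage = %lu KB\n", usage * page_kb);            /* 5237400 KB */
        printf("free  = %lu KB\n", (limit - usage) * page_kb);  /* 5480 KB    */
        printf("anon  = %lu KB\n", anon * page_kb);             /* 5216800 KB */
        return 0;
}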

From the memory cgroup path of PID 11233 (TASK: 0xffff959a4b470000) we can see that its k8s pod id is 640e2729-2741-463f-a639-d068a97f6a74, and the dmesg history shows this pod has hit OOM kill before.

crash> log | grep 640e2729-2741-463f-a639-d068a97f6a74
[4828067.592843] Task in /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/210d097be54db64bb5ff54c7e81128a08d57264341dccbbd4d5390ed3d5104ae killed as a result of limit of /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74
[4828067.592851] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74: cache:0KB rss:0KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
[4828067.599341] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/53b295539d6879b833520d575e6d9a7815553942fe1d54bf4476a13514546e36: cache:0KB rss:40KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:40KB inactive_file:0KB active_file:0KB unevictable:0KB
[4828067.599354] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/210d097be54db64bb5ff54c7e81128a08d57264341dccbbd4d5390ed3d5104ae: cache:2528KB rss:5228628KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:5228628KB inactive_file:1168KB active_file:1204KB unevictable:0KB
crash>

The implementation of mem_cgroup_oom() shows that when an OOM is triggered, the process's task_struct.memcg_oom.memcg is set to the memory cgroup that triggered the OOM:

2241 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
2242 {
2243         if (!current->memcg_oom.may_oom)
2244                 return;
2245         /*
2246          * We are in the middle of the charge context here, so we
2247          * don't want to block when potentially sitting on a callstack
2248          * that holds all kinds of filesystem and mm locks.
2249          *
2250          * Also, the caller may handle a failed allocation gracefully
2251          * (like optional page cache readahead) and so an OOM killer
2252          * invocation might not even be necessary.
2253          *
2254          * That's why we don't do anything here except remember the
2255          * OOM context and then deal with it at the end of the page
2256          * fault when the stack is unwound, the locks are released,
2257          * and when we know whether the fault was overall successful.
2258          */
2259         css_get(&memcg->css);
2260         current->memcg_oom.memcg = memcg;
2261         current->memcg_oom.gfp_mask = mask;
2262         current->memcg_oom.order = order;
2263 }

The following shows that PID 11233 did trigger an OOM of its pod:

crash> task_struct.memcg_oom ffff959a4b470000
  memcg_oom = {
    memcg = 0xffff959b40121c00,//this process triggered an OOM in the memory cgroup it belongs to
    gfp_mask = 80,
    order = 0,
    may_oom = 1 
  }
crash>

crash> struct mem_cgroup,css,oom_kill_disable 0xffff959b40121c00
  memory = {
  ...
  css = {
    cgroup = 0xffff959a432da800,
    ....
  }
  oom_kill_disable = 0
crash>

crash> struct cgroup.dentry 0xffff959a432da800
  dentry = 0xffff959a4a2e1b00
crash> files -d 0xffff959a4a2e1b00
     DENTRY           INODE           SUPERBLK     TYPE PATH
ffff959a4a2e1b00 ffff959add7e6bf0 ffff95aa7fc2d000 DIR  /sys/fs/cgroup/memory/kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74
crash>

Given that PID 11233 (TASK: ffff959a4b470000) had already triggered an OOM this time, why did no OOM kill happen?

Comparing PID 11233's call stack with the code shows that the OOM kill is only invoked after the process leaves __getblk() and returns to handle_mm_fault(); and as analyzed above, the process cannot leave __getblk() because it cannot allocate memory.

handle_mm_fault
  ->mem_cgroup_oom_synchronize
    ->mem_cgroup_out_of_memory

The timestamps in the coredump log also show that the last OOM of pod 640e2729 happened 408 hours before the hung-task panic, which confirms that although the pod triggered an OOM at the time of the problem, no OOM kill actually ran to free memory:

crash> log | grep 640e2729
[4828067.592843] Task in /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/210d097be54db64bb5ff54c7e81128a08d57264341dccbbd4d5390ed3d5104ae killed as a result of limit of /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74
[4828067.592851] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74: cache:0KB rss:0KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:0KB inactive_file:0KB active_file:0KB unevictable:0KB
[4828067.599341] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/53b295539d6879b833520d575e6d9a7815553942fe1d54bf4476a13514546e36: cache:0KB rss:40KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:40KB inactive_file:0KB active_file:0KB unevictable:0KB
[4828067.599354] Memory cgroup stats for /kubepods/burstable/pod640e2729-2741-463f-a639-d068a97f6a74/210d097be54db64bb5ff54c7e81128a08d57264341dccbbd4d5390ed3d5104ae: cache:2528KB rss:5228628KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:5228628KB inactive_file:1168KB active_file:1204KB unevictable:0KB
crash>
    
crash> log | grep panic
[6300246.253028] Kernel panic - not syncing: hung_task: blocked tasks
[6300246.258090]  [<ffffffff9d774a0b>] panic+0xe8/0x21f
crash>
crash>    
crash> pd (6300246-4828067)
$18 = 1472179
crash> pd 1472179/3600
$19 = 408
crash>

To summarize the analysis so far:

A process (PID 11233) in a k8s pod on this node entered __getblk() during a write and waited for usable memory; only after getting enough memory could it reach ext4_journal_stop() to decrement t_updates, and only then could the corresponding kernel journal thread be woken. But the pod's memory usage had already hit its limit and nothing was reclaimable, so the process triggered an OOM of the pod's memory cgroup. However, because it could not get enough memory, the OOM-triggering process could never leave __getblk() and return to handle_mm_fault(), where the OOM kill that frees memory would have been invoked. The result is a deadlock.

Fixing the problem:

Raising the pod memory limit so that the container does not hit OOM works around the problem, but in container scenarios OOM can hardly be avoided completely, especially with mixed deployments: a single business pod that frequently hits OOM and triggers this kernel issue can make every business pod on the node unavailable. So we need to look at the kernel code to see how the problem is triggered and whether newer kernels have a fix.

The key functions on the __getblk() call chain are:

__getblk
->__getblk_slow
    ->grow_buffers
      ->grow_dev_page
        ->find_or_create_page
          ->add_to_page_cache_lru
            ->__add_to_page_cache_locked
              -> mem_cgroup_cache_charge
                -> mem_cgroup_charge_common
                  ->__mem_cgroup_try_charge
                   ->mem_cgroup_do_charge
                     ->mem_cgroup_oom
      ->free_more_memory    

Looking at the cgroup memory-charging code: if gfp_mask carries __GFP_NOFAIL, the allocation can still succeed even when the cgroup's usage exceeds its memory limit; the extra newly allocated pages are simply charged to the root memory cgroup:

1121 struct page *find_or_create_page(struct address_space *mapping,
1122                 pgoff_t index, gfp_t gfp_mask)
1123 {
1124         struct page *page;
1125         int err;
1126 repeat:
1127         page = find_lock_page(mapping, index);
1128         if (!page) {
1129                 page = __page_cache_alloc(gfp_mask);//allocate a new page
1130                 if (!page)
1131                         return NULL;
1132                 /*
1133                  * We want a regular kernel memory (not highmem or DMA etc)
1134                  * allocation for the radix tree nodes, but we need to honour
1135                  * the context-specific requirements the caller has asked for.
1136                  * GFP_RECLAIM_MASK collects those requirements.
1137                  */
1138                 err = add_to_page_cache_lru(page, mapping, index,
1139                         (gfp_mask & GFP_RECLAIM_MASK));
1140                 if (unlikely(err)) {//if add_to_page_cache_lru ends up returning non-zero, release the page: the page allocation fails
1141                         page_cache_release(page);
1142                         page = NULL;
1143                         if (err == -EEXIST)
1144                                 goto repeat;
1145                 }
1146         }
1147         return page;
1148 }

3856 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
3857                                 gfp_t gfp_mask, enum charge_type ctype)
3858 {
             ......
3874         ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);//checks whether charging the new pages would exceed the cgroup memory limit
3875         if (ret == -ENOMEM)//-ENOMEM means usage exceeds the cgroup memory limit; find_or_create_page will then release the page
3876                 return ret;
3877         __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);//on the __GFP_NOFAIL bypass path, memcg here is the root mem cgroup
3878         return 0;
3879 }



2728 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2729                                    gfp_t gfp_mask,
2730                                    unsigned int nr_pages,
2731                                    struct mem_cgroup **ptr,
2732                                    bool oom)
2733 {    
             ......
2760         if (gfp_mask & __GFP_NOFAIL)
2761                 oom = false;
             ......
2859 nomem:
2860         if (!(gfp_mask & __GFP_NOFAIL)) {//if gfp_mask carries __GFP_NOFAIL this branch is skipped,
2861                 *ptr = NULL;
2862                 return -ENOMEM;       //so even if the cgroup usage exceeds the memory limit, -ENOMEM is not returned
2863         }
2864 bypass: //with __GFP_NOFAIL set, when usage exceeds the cgroup memory limit we land here and return -EINTR
2865         *ptr = root_mem_cgroup;//return the root mem cgroup
2866         return -EINTR;
2867 }

However, grow_dev_page() already sets __GFP_NOFAIL before calling find_or_create_page() (introduced by commit 84235de394d9775bfaa7fa9762a59d91fef0c1fc), so in theory the subsequent allocation should no longer be constrained by the cgroup memory limit. Why, then, does __getblk() still not succeed?

30 #define ___GFP_NOFAIL           0x800u
84 #define __GFP_NOFAIL    ((__force gfp_t)___GFP_NOFAIL)  /* See above */    

950 static int
 951 grow_dev_page(struct block_device *bdev, sector_t block,
 952                 pgoff_t index, int size, int sizebits)
 953 {
 954         struct inode *inode = bdev->bd_inode;
 955         struct page *page;
 956         struct buffer_head *bh;
 957         sector_t end_block;
 958         int ret = 0;            /* Will call free_more_memory() */
 959         gfp_t gfp_mask;
 960
 961         gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS;
 962         gfp_mask |= __GFP_MOVABLE;
 963         /*
 964          * XXX: __getblk_slow() can not really deal with failure and
 965          * will endlessly loop on improvised global reclaim.  Prefer
 966          * looping in the allocator rather than here, at least that
 967          * code knows what it's doing.
 968          */
 969         gfp_mask |= __GFP_NOFAIL;         //__GFP_NOFAIL is already set here
 970
 971         page = find_or_create_page(inode->i_mapping, index, gfp_mask);
             ......
1010 }             

The coredump shows that the gfp_mask indeed does not carry ___GFP_NOFAIL:

crash> task_struct.memcg_oom ffff959a4b470000 -x
  memcg_oom = {
    memcg = 0xffff959b40121c00,
    gfp_mask = 0x50,
    order = 0x0,
    may_oom = 0x1
  }
crash>
crash> px (0x50&0x800) //gfp_mask & ___GFP_NOFAIL is 0, i.e. the ___GFP_NOFAIL flag has been cleared
$14 = 0x0
crash>    

Where the __GFP_NOFAIL flag gets cleared:

mapping_gfp_constraint() ANDs the gfp_mask passed in with the flags of inode->i_mapping. The coredump confirms that the flags of inode->i_mapping do not allow __GFP_NOFAIL, so the __GFP_NOFAIL passed down by grow_dev_page() is lost!

79 static inline gfp_t mapping_gfp_mask(struct address_space * mapping)
 80 {
 81         return (__force gfp_t)mapping->flags & __GFP_BITS_MASK;
 82 }
 83
 84 /* Restricts the given gfp_mask to what the mapping allows. */
 85 static inline gfp_t mapping_gfp_constraint(struct address_space *mapping,
 86                 gfp_t gfp_mask)
 87 {
 88         return mapping_gfp_mask(mapping) & gfp_mask;
 89 }


584 static int __add_to_page_cache_locked(struct page *page,
 585                                       struct address_space *mapping,
 586                                       pgoff_t offset, gfp_t gfp_mask,
 587                                       void **shadowp)
 588 {
 589         int error;
 590
 591         VM_BUG_ON(!PageLocked(page));
 592         VM_BUG_ON(PageSwapBacked(page));
 593
 594         gfp_mask = mapping_gfp_constraint(mapping, gfp_mask);//___GFP_NOFAIL gets dropped here
 595
 596         error = mem_cgroup_cache_charge(page, current->mm,
 597                                         gfp_mask & GFP_RECLAIM_MASK);
             ......
627 }             
    
110 #define __GFP_BITS_SHIFT 25     /* Room for N __GFP_FOO bits */
111 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))  //this evaluates to 0x1ffffff
    
crash> eval 1<<25
hexadecimal: 2000000  (32MB)
    decimal: 33554432
      octal: 200000000
     binary: 0000000000000000000000000000000000000010000000000000000000000000
crash> px (0x2000000-1)
$17 = 0x1ffffff
crash>
    
    
    
crash> struct block_device.bd_inode ffff959bbf810680
  bd_inode = 0xffff959bbf810770
crash> struct inode.i_mapping 0xffff959bbf810770
  i_mapping = 0xffff959bbf8108c0
crash> struct address_space.flags 0xffff959bbf8108c0 -x
  flags = 0x200d0
crash>    
    
crash> eval 0x200d0 & 0x1ffffff
hexadecimal: 200d0
    decimal: 131280
      octal: 400320
     binary: 0000000000000000000000000000000000000000000000100000000011010000
crash>
    
0x200d0 & ___GFP_NOFAIL=0x200d0&0x800 = 0
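
The same bit arithmetic as a standalone check; the mask values are taken from the kernel headers and the crash output above, everything else is only illustrative:

#include <stdio.h>

#define ___GFP_NOFAIL     0x800u
#define __GFP_BITS_MASK   ((1u << 25) - 1)   /* 0x1ffffff */

int main(void)
{
        unsigned int mapping_flags = 0x200d0;  /* address_space.flags from the dump */
        unsigned int recorded_gfp  = 0x50;     /* task_struct.memcg_oom.gfp_mask    */

        /* The mapping itself does not allow __GFP_NOFAIL ... */
        printf("mapping allows NOFAIL: %s\n",
               (mapping_flags & __GFP_BITS_MASK & ___GFP_NOFAIL) ? "yes" : "no");  /* no */

        /* ... so mapping_gfp_constraint() (a bitwise AND) strips it from any caller's
         * mask, and the gfp_mask recorded at OOM time indeed has it cleared. */
        printf("recorded gfp has NOFAIL: %s\n",
               (recorded_gfp & ___GFP_NOFAIL) ? "yes" : "no");                     /* no */
        return 0;
}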

Looking at the mainline Linux code, __add_to_page_cache_locked() does not have this mapping_gfp_constraint() call; it is not clear how Red Hat introduced it, but it is what introduces the present problem.

In addition, mainline Linux added the following three patches to ensure that allocations via grow_dev_page() always use __GFP_NOFAIL:

bc48f001de12 buffer: eliminate the need to call free_more_memory() in getblk_slow()
94dc24c0c59a buffer: grow_dev_page() should use GFP_NOFAIL for all cases
640ab98fb362 buffer: have alloc_page_buffers() use __GFP_NOFAIL

A CentOS 7 user already reported this issue to CentOS upstream in December 2020; as of 3.10.0-1160.36.2.el7.x86_64, CentOS 7 still has not fixed it.

https://bugs.centos.org/view.php?id=17982

Tencent Cloud users are advised to use TencentOS, which resolves this issue. For more on TencentOS, see:

https://cloud.tencent.com/document/product/457/50124

https://github.com/Tencent/TencentOS-kernel
