Python: 通过预置后门(调试接口)的方式调试运行中的进程

前言

每当运行中的程序/服务出现问题的时候我们都希望能够尽快定位问题的原因,能够复现问题和解决问题。 一般都会想如果能知道程序正在进行什么操作就好了,如果能马上复现问题就好了,如果能知道当前程序内变量的值就好了。知道这些信息的方法有很多种,本文介绍一种通过预置后门的方法来调试运行中进程的思路和方法。

常用的预置后门的方法是:通过定义信号处理程序,在接收到相应的信号时,进行特定的,便于调试和查找问题的操作。 比如下面将要举的两个例子:

  • 接收到信号时输出便于调试的特定信息
  • 接收到信号时开启远程调试服务

接收到信号时输出便于调试的特定信息

比如输出当前程序的 traceback 信息,这样就可以知道当前程序运行到哪里了,甚至知道代码执行位置的局部变量和全局变量的值。

在下面的示例程序中,程序在接收到 USR1 信号时输出程序的 traceback 信息:

# -*- coding: utf-8 -*-
from queue import Queue
import signal
import sys
import threading
import time
import traceback


def output_tracebacks(signum, frame):
    """Print the current call stack of every thread in the process.

    The signature matches what ``signal.signal`` expects of a handler, so
    the function can be registered directly.

    Args:
        signum: Number of the signal being handled (unused).
        frame: Frame that was executing when the signal arrived (unused).
    """
    id2thread = {thread.ident: thread for thread in threading.enumerate()}
    for thread_id, stack in sys._current_frames().items():
        # sys._current_frames() can report threads the threading module
        # has no record of (started through _thread, created by a C
        # extension, or started after the enumerate() call above), so
        # fall back to a placeholder instead of raising KeyError.
        thread = id2thread.get(
            thread_id, '<unknown thread {}>'.format(thread_id))
        stack_list = traceback.format_list(traceback.extract_stack(stack))
        print('thread {}:'.format(thread))
        print(''.join(stack_list))


def setup_backdoor():
    """Register the traceback dump as the process's SIGUSR1 handler."""
    handler = output_tracebacks
    signal.signal(signal.SIGUSR1, handler)


def worker(q):
    """Consume tasks from ``q`` until a ``None`` sentinel is received.

    Args:
        q: Queue to read tasks from; ``q.get()`` is called for each task.
    """
    running = True
    while running:
        item = q.get()
        if item is not None:
            # Placeholder for the real work done on each task.
            time.sleep(1.2)
        else:
            running = False


def producer(q, count=100, interval=1):
    """Put ``count`` integer tasks on ``q``, then a ``None`` sentinel.

    Args:
        q: Queue that receives the tasks ``0 .. count - 1``.
        count: Number of tasks to produce. Defaults to 100.
        interval: Seconds to sleep after each task is queued. Defaults to 1.
    """
    for x in range(count):
        q.put(x)
        time.sleep(interval)
    # The sentinel tells the consumer that no more tasks will arrive.
    q.put(None)


# Install the signal handler first, then run one producer and one worker
# thread over a shared queue and wait for both to finish.
setup_backdoor()
q = Queue()
t1 = threading.Thread(target=producer, args=(q,))
t2 = threading.Thread(target=worker, args=(q,))
t1.start()
t2.start()
for t in (t1, t2):
    t.join()

运行程序并通过 USR1 信号激活后门,获取程序的 traceback 信息:

$ python testa.py &
[1] 79163
$ kill -s USR1 79163
thread <Thread(Thread-2, started 123145565609984)>:
  File "/xxx/lib/python3.6/threading.py", line 884, in _bootstrap
    self._bootstrap_inner()
  File "/xxx/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/xxx/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "testa.py", line 30, in worker
    time.sleep(1.2)

thread <Thread(Thread-1, started 123145560354816)>:
  File "/xxx/lib/python3.6/threading.py", line 884, in _bootstrap
    self._bootstrap_inner()
  File "/xxx/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/xxx/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "testa.py", line 36, in producer
    time.sleep(1)

thread <_MainThread(MainThread, started 140736812057536)>:
  File "testa.py", line 47, in <module>
    t.join()
  File "/xxx/lib/python3.6/threading.py", line 1056, in join
    self._wait_for_tstate_lock()
  File "/xxx/lib/python3.6/threading.py", line 1072, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "testa.py", line 15, in output_tracebacks
    stack_list = traceback.format_list(traceback.extract_stack(stack))

关于获取 traceback 信息的更多内容详见 Python: 获取并发程序的 traceback 信息(threading/gevent/asyncio)

接收到信号时开启远程调试服务

比如开启一个使用当前进程中运行环境的远程调试器(Python Shell), 可以在这个调试器中访问运行时变化的全局变量,使用进程当前的运行时环境执行代码:

# -*- coding: utf-8 -*-
from code import InteractiveConsole
from queue import Queue
import signal
import socketserver
import sys
import threading
import time
import traceback


class FileLikeObject(object):
    """Single stream object backed by a separate read file and write file.

    ``write`` encodes ``str`` data to UTF-8 before handing it to the write
    file, and ``readline`` decodes what it reads from the read file, so the
    object can be used where a text stream is expected. Any attribute not
    defined here is looked up on the read file first and then on the write
    file.
    """

    def __init__(self, rfile, wfile):
        """Store the file used for reading and the file used for writing."""
        self._rfile = rfile
        self._wfile = wfile

    def __getattr__(self, name):
        # Only called when normal lookup fails. If the backing files are
        # themselves missing (e.g. the instance was built without running
        # __init__), stop here rather than recursing through self._rfile.
        if name in ('_rfile', '_wfile'):
            raise AttributeError(name)
        try:
            return getattr(self._rfile, name)
        except AttributeError:
            return getattr(self._wfile, name)

    def write(self, data):
        """Write ``data`` to the write file, encoding ``str`` as UTF-8.

        Returns:
            The length of ``data`` as passed in.
        """
        size = len(data)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        self._wfile.write(data)
        return size

    def isatty(self):
        """Always report that the stream is a terminal."""
        return True

    def flush(self):
        """Flush the write file so buffered output is actually sent."""
        self._wfile.flush()

    def readline(self, *args):
        """Read one line as ``str`` with ``\\r\\n`` converted to ``\\n``.

        Returns:
            The decoded line, or ``''`` when the data is not valid UTF-8.
        """
        try:
            data = self._rfile.readline(*args)
            if isinstance(data, str):
                # A text-mode read file needs no decoding.
                return data.replace('\r\n', '\n')
            return data.replace(b'\r\n', b'\n').decode('utf-8')
        except UnicodeError:
            return ''


class DebuggerTCPHandler(socketserver.StreamRequestHandler):
    """Request handler that gives the connected client a Python console.

    The console runs with this module's globals as its namespace.
    """

    def handle(self):
        """Run an interactive console over the client connection.

        ``sys.stdin``, ``sys.stdout`` and ``sys.stderr`` are pointed at the
        connection for the duration of the session and put back afterwards.
        NOTE: these are process-wide names, so output printed by any other
        thread during the session also goes to the client.
        """
        fileobj = FileLikeObject(self.rfile, self.wfile)
        # Remember the streams that were installed before the session so
        # that any redirection set up by the host program is preserved,
        # instead of resetting to the interpreter's original streams.
        saved_streams = (sys.stdin, sys.stdout, sys.stderr)
        sys.stdin = sys.stdout = sys.stderr = fileobj

        try:
            console = InteractiveConsole(locals=globals())
            console.interact(banner='== debug server ==', exitmsg='')
        except SystemExit:
            # exit()/quit() typed in the console only ends this session.
            pass
        finally:
            sys.stdin, sys.stdout, sys.stderr = saved_streams


def output_tracebacks():
    """Print the current call stack of every thread in the process."""
    id2thread = {thread.ident: thread for thread in threading.enumerate()}
    for thread_id, stack in sys._current_frames().items():
        # sys._current_frames() can report threads the threading module
        # has no record of (started through _thread, created by a C
        # extension, or started after the enumerate() call above), so
        # fall back to a placeholder instead of raising KeyError.
        thread = id2thread.get(
            thread_id, '<unknown thread {}>'.format(thread_id))
        stack_list = traceback.format_list(traceback.extract_stack(stack))
        print('thread {}:'.format(thread))
        print(''.join(stack_list))


# (server, thread) pair of the debug server, or None if none was started.
debugger = None


def start_debugger(signum, frame):
    """Signal handler: start the debug console server on localhost:9999.

    Does nothing except print a message when a server started earlier is
    still running or when the port cannot be bound, so that a repeated or
    unlucky signal never raises into the interrupted main thread.

    Args:
        signum: Number of the signal being handled (unused).
        frame: Frame that was executing when the signal arrived (unused).
    """
    global debugger
    print('start debugger...')
    if debugger is not None and debugger[1].is_alive():
        print('debugger is already running')
        return

    # Bind manually so SO_REUSEADDR can be enabled first; otherwise a
    # restart shortly after closing may fail with "address already in use".
    server = socketserver.TCPServer(
        ('localhost', 9999), DebuggerTCPHandler, bind_and_activate=False)
    server.allow_reuse_address = True
    try:
        server.server_bind()
        server.server_activate()
    except OSError as exc:
        server.server_close()
        print('failed to start debugger: {}'.format(exc))
        return

    t = threading.Thread(target=server.serve_forever)
    t.start()
    debugger = (server, t)
    print('started debugger')


def close_debugger(signum, frame):
    """Signal handler: stop the debug console server if one was started.

    Args:
        signum: Number of the signal being handled (unused).
        frame: Frame that was executing when the signal arrived (unused).
    """
    global debugger
    print('close debugger...')
    if debugger is None:
        print('closed debugger')
        return

    server, t = debugger
    # Forget the server before tearing it down so the module state no
    # longer points at a closed server and a repeated signal is a no-op.
    debugger = None
    server.shutdown()
    server.server_close()
    t.join()
    print('closed debugger')


def setup_backdoor():
    """Register the debugger start/stop handlers for SIGUSR1 and SIGUSR2."""
    bindings = (
        (signal.SIGUSR1, start_debugger),
        (signal.SIGUSR2, close_debugger),
    )
    for signum, handler in bindings:
        signal.signal(signum, handler)


def worker(q):
    """Consume tasks from ``q`` until a ``None`` sentinel is received.

    Args:
        q: Queue to read tasks from; ``q.get()`` is called for each task.
    """
    running = True
    while running:
        item = q.get()
        if item is not None:
            # Placeholder for the real work done on each task.
            time.sleep(1.2)
        else:
            running = False


def producer(q, count=100, interval=1):
    """Put ``count`` integer tasks on ``q``, then a ``None`` sentinel.

    Args:
        q: Queue that receives the tasks ``0 .. count - 1``.
        count: Number of tasks to produce. Defaults to 100.
        interval: Seconds to sleep after each task is queued. Defaults to 1.
    """
    for x in range(count):
        q.put(x)
        time.sleep(interval)
    # The sentinel tells the consumer that no more tasks will arrive.
    q.put(None)


# Install the signal handlers first, then run one producer and one worker
# thread over a shared queue and wait for both to finish. ``q`` stays a
# module-level name, so it is part of the globals given to the console.
setup_backdoor()
q = Queue()
t1 = threading.Thread(target=producer, args=(q,))
t2 = threading.Thread(target=worker, args=(q,))
t1.start()
t2.start()
for t in (t1, t2):
    t.join()

运行程序,通过 USR1 激活远程调试器,调试完后通过 USR2 关闭远程调试服务:

$ python testb.py &
[1] 87173
$ kill -s USR1 87173
start debugger...
started debugger
$
$ telnet 127.0.0.1 9999
Trying 127.0.0.1...
Connected to localhost.
Escape character is '^]'.
== debug server ==
>>> output_tracebacks()
thread <Thread(Thread-3, started 123145482240000)>:
  File "/xxx/lib/python3.6/threading.py", line 884, in _bootstrap
    self._bootstrap_inner()
  File "/xxx/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/xxx/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "/xxx/lib/python3.6/socketserver.py", line 238, in serve_forever
    self._handle_request_noblock()
  File "/xxx/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/xxx/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/xxx/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/xxx/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "testb.py", line 52, in handle
    console.interact(banner='== debug server ==', exitmsg='')
  File "/xxx/lib/python3.6/code.py", line 233, in interact
    more = self.push(line)
  File "/xxx/lib/python3.6/code.py", line 259, in push
    more = self.runsource(source, self.filename)
  File "/xxx/lib/python3.6/code.py", line 75, in runsource
    self.runcode(code)
  File "/xxx/lib/python3.6/code.py", line 91, in runcode
    exec(code, self.locals)
  File "<console>", line 1, in <module>
  File "testb.py", line 66, in output_tracebacks
    stack_list = traceback.format_list(traceback.extract_stack(stack))

thread <Thread(Thread-2, started 123145476984832)>:
  File "/xxx/lib/python3.6/threading.py", line 884, in _bootstrap
    self._bootstrap_inner()
  File "/xxx/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/xxx/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "testb.py", line 108, in worker
    time.sleep(1.2)

thread <Thread(Thread-1, started 123145471729664)>:
  File "/xxx/lib/python3.6/threading.py", line 884, in _bootstrap
    self._bootstrap_inner()
  File "/xxx/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/xxx/lib/python3.6/threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "testb.py", line 114, in producer
    time.sleep(1)

thread <_MainThread(MainThread, started 140736812057536)>:
  File "testb.py", line 125, in <module>
    t.join()
  File "/xxx/lib/python3.6/threading.py", line 1056, in join
    self._wait_for_tstate_lock()
  File "/xxx/lib/python3.6/threading.py", line 1072, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):

>>> q
<queue.Queue object at 0x10a2e8fd0>
>>> q.qsize()
13
>>> q.qsize()
14
>>> exit()
Connection closed by foreign host.

$ jobs
[1]+  Running                 python testb.py &
$ kill -s USR2 87173
close debugger...
closed debugger

$ telnet 127.0.0.1 9999
Trying 127.0.0.1...
telnet: connect to address 127.0.0.1: Connection refused
telnet: Unable to connect to remote host

上面的代码只是粗略地演示如何实现一个远程调试器,正式的远程调试器可以参考和使用 gevent.backdoor、twisted.conch.manhole、ionelmc/python-manhole 等功能完善的第三方模块。

总结

上面两个例子只是常见的预置后门,实际上还可以预置其他的功能(比如,预置一个 HTTP server ,通过访问不同的 URL 来获取不同的运行时信息或者做一些辅助调试的操作),一切都是为了调试,都是为了尽快定位和解决问题。 需要预置哪些后门要根据实际情况来定,一方面要考虑是否会影响服务的正常运行,另一方面也要考虑哪种方式、哪些信息能更快地帮助我们定位和解决问题,最重要的是要考虑安全问题,要做好安全防护,不要把端口暴露到外网。

虽然标题以及文章中的例子都与 Python 相关,但是这种思想并不仅限于 Python ,同样也可以应用于其他语言编写的服务。欢迎大家和我一起交流和探讨调试技巧以及解决问题的方法。


Comments