Overview

Several tips to level up your reverse engineering with Python3.

Tip 1 - Use Jupyter Notebooks and a Git Repo

Remember to steal our .gitignore file to filter out the Jupyter junk files.

Tip 2 - Remember Byte Strings Are Not Strings

import binascii

# Text (str) and raw binary data (bytes) are distinct types;
# a b"..." literal creates bytes, a plain "..." literal creates str.
string_example, byte_array_example = "test", b"test"

# str -> bytes: encode the text with an explicit codec
print(string_example.encode(encoding='utf-8'))

# bytes -> str: decode the binary data with the same codec
print(byte_array_example.decode(encoding='utf-8'))

b'test'
test

Example for Tips 3-5

# Serialized sample used by Tips 3-5; adjacent literals are joined by Python
# into one bytes object, one literal per field of the structure.
example_data = (
    b'\x02\x00\x00\x00'        # number_of_strings = 2 (little-endian DWORD)
    b'\x00'                    # 1st entry: is_wide_string = 0
    b'\x04\x00\x00\x00'        # 1st entry: string_length = 4
    b'test'                    # 1st entry: 4 bytes of narrow text
    b'\x01'                    # 2nd entry: is_wide_string = 1
    b'\x04\x00\x00\x00'        # 2nd entry: string_length = 4
    b't\x00e\x00s\x00t\x00'    # 2nd entry: same text, 2 bytes per character
)

Example Data Structure

struct strings{
DWORD number_of_strings;
string* string;
}

struct string{
BOOL is_wide_string;
DWORD string_length;
chr* string;
}

Tip 3 - Hex Encoding Binary Data

def unhex(hex_string):
    """Decode hexadecimal text into the raw bytes it represents.

    Args:
        hex_string: Hex digits, given either as ``str`` (e.g. ``"7465"``)
            or as a bytes-like object (e.g. ``b"7465"``).

    Returns:
        bytes: The decoded binary data.

    Raises:
        binascii.Error: If the input has an odd number of digits or
            contains non-hexadecimal characters.
    """
    import binascii
    # isinstance (not an exact type comparison) so str subclasses take the
    # same path as plain str.
    if isinstance(hex_string, str):
        hex_string = hex_string.encode('utf-8')
    return binascii.unhexlify(hex_string)

    
def tohex(data):
    """Hex-encode binary data (or text) into an ASCII bytes string.

    Args:
        data: A bytes-like object, or a ``str`` which is first encoded
            as UTF-8.

    Returns:
        bytes: Two lowercase hex digits per input byte.
    """
    import binascii
    # isinstance (not an exact type comparison): a str subclass must be
    # encoded too, otherwise hexlify rejects it with TypeError.
    if isinstance(data, str):
        data = data.encode('utf-8')
    return binascii.hexlify(data)

# Show the sample blob as hex; the one-element tuple keeps %-formatting
# unambiguous whatever type the argument has.
print("This is hex encoded data: %r" % (tohex(example_data),))

This is hex encoded data: b'0200000000040000007465737401040000007400650073007400'

Tip 4 - Use Struct To Convert Between Data and Types

https://docs.python.org/3/library/struct.html

import struct
# '<I' reads one little-endian unsigned 32-bit integer; unpack() always
# returns a tuple, so destructure its single element directly.
(number_of_strings,) = struct.unpack('<I', example_data[:4])
print("Number of strings: %d" % (number_of_strings,))

Number of strings: 2

Tip 5 - Use Custom Struct Class To Parse Binary Data

import struct 

# One serialized record, one literal per field (joined by Python).
example_string = (
    b'\x00'                # is_wide_string = 0
    b'\x04\x00\x00\x00'    # string_length = 4 (little-endian DWORD)
    b'test'                # the text itself
)

class EXAMPLE_STRING:
    """Parser/serializer for one ``string`` record of the example format.

    Record layout: a 1-byte wide flag, a little-endian DWORD length, then
    the text (UTF-16LE when the flag is set, UTF-8 otherwise).

    Attributes:
        is_wide_string (bool): True when the text is stored as UTF-16LE.
        string_length (int): Length field of the record. It counts bytes
            for narrow text and 2-byte units for wide text.
        string (str): The decoded text.
    """

    def __init__(self):
        self.is_wide_string = False
        self.string_length = 0
        # Must be str, not bytes: pack() calls .encode() on it, so a bytes
        # default would make pack() fail on a freshly created instance.
        self.string = ''

    def from_buffer_copy(self, data):
        """Fill this record from the bytes at the start of *data*.

        Args:
            data (bytes): Buffer that begins with a serialized record.
                Bytes after the record are ignored.

        Raises:
            struct.error: If *data* is too short to hold the 5-byte header.
            UnicodeDecodeError: If the payload is not valid in the
                encoding selected by the wide flag.
        """
        self.is_wide_string = struct.unpack('?', data[0:1])[0]
        self.string_length = struct.unpack('<I', data[1:5])[0]
        # Wide text occupies two bytes per counted unit.
        if self.is_wide_string:
            unit_size, codec = 2, 'utf-16le'
        else:
            unit_size, codec = 1, 'utf-8'
        payload = data[5:5 + self.string_length * unit_size]
        self.string = payload.decode(codec)

    def pack(self):
        """Serialize the record back to bytes.

        The stored ``string_length`` is written as-is (it is not
        recomputed from ``string``), and the text is encoded according to
        the current ``is_wide_string`` flag.

        Returns:
            bytes: flag byte + DWORD length + encoded text.
        """
        # '<' disables padding, so this is exactly 1 + 4 header bytes.
        header = struct.pack('<?I', self.is_wide_string, self.string_length)
        codec = 'utf-16le' if self.is_wide_string else 'utf-8'
        return header + self.string.encode(codec)

# Parse the single-record sample and report each decoded field.
# One-element tuples keep %-formatting unambiguous for any argument type.
print("Example string data: %r" % (example_string,))
es = EXAMPLE_STRING()
es.from_buffer_copy(example_string)
print("Example is wide: %s" % (es.is_wide_string,))
print("Example string length: %d" % (es.string_length,))
print("Example string: %s" % (es.string,))

Example string data: b'\x00\x04\x00\x00\x00test'
Example is wide: False
Example string length: 4
Example string: test

# Flip the flag on the parsed record; pack() then serializes the same text
# using the wide encoding.
es.is_wide_string = True
print("Example string data converted to wide: %r" % (es.pack(),))

Example string data converted to wide: b'\x01\x04\x00\x00\x00t\x00e\x00s\x00t\x00'

class EXAMPLE_STRINGS:
    """Parser/serializer for the top-level ``strings`` container.

    Layout: a little-endian DWORD record count followed by that many
    serialized EXAMPLE_STRING records.

    Attributes:
        length (int): Record count read from (or written to) the header.
        strings (list): The parsed EXAMPLE_STRING records, in order.
    """

    def __init__(self):
        self.length = 0
        self.strings = []

    def from_buffer_copy(self, data):
        """Parse the container from the start of *data*.

        Any records held from a previous parse are discarded first, so
        ``strings`` always matches the buffer parsed last.

        Args:
            data (bytes): Buffer beginning with the serialized container.

        Raises:
            struct.error: If the count or a record header is truncated.
        """
        self.length = struct.unpack('<I', data[0:4])[0]
        # Start from an empty list: appending to the old one would mix
        # stale records into the result when an instance is reused.
        self.strings = []
        ptr = 4
        for _ in range(self.length):
            tmp_string = EXAMPLE_STRING()
            tmp_string.from_buffer_copy(data[ptr:])
            # Records are variable-length; the size of the re-serialized
            # record is the number of bytes it consumed.
            ptr += len(tmp_string.pack())
            self.strings.append(tmp_string)

    def pack(self):
        """Serialize the container back to bytes.

        The stored ``length`` is written as-is, followed by every record
        in ``strings``.

        Returns:
            bytes: DWORD count + concatenated packed records.
        """
        parts = [struct.pack('<I', self.length)]
        parts.extend(s.pack() for s in self.strings)
        return b''.join(parts)
    
# Parse the two-record sample blob and report what each record decoded to.
# One-element tuples keep %-formatting unambiguous for any argument type.
es = EXAMPLE_STRINGS()
es.from_buffer_copy(example_data)
print("Number of strings: %d" % (es.length,))
print("First string is wide: %s" % (es.strings[0].is_wide_string,))
print("First string: %s" % (es.strings[0].string,))
print("Second string is wide: %s" % (es.strings[1].is_wide_string,))
print("Second string: %s" % (es.strings[1].string,))

Number of strings: 2
First string is wide: False
First string: test
Second string is wide: True
Second string: test

Tip 6 - Parse Stream With BytesIO

Read the fields sequentially from a stream, without tracking a position pointer by hand.

https://docs.python.org/3/library/io.html#binary-i-o

import io
import struct

example_string = b'\x00\x04\x00\x00\x00test'
bstream = io.BytesIO(example_string)  # wrap the bytes in a file-like stream

# Each read() advances the stream position, so no manual offset is needed.
is_wide_string, *_ = struct.unpack('?', bstream.read(1))
string_length, *_ = struct.unpack('<I', bstream.read(4))
# Keep string_length as the value stored in the header and compute the
# payload size separately, so the printed length is not doubled for wide text.
byte_length = string_length * 2 if is_wide_string else string_length
strings = bstream.read(byte_length).decode('utf-16le' if is_wide_string else 'utf-8')
print(f'is_wide_string: {is_wide_string}')
print(f'string_length: {string_length}')
print(f'string: {strings}', end="\n\n")

# Tip 6.1 - Using lambda
bstream.seek(0,0) # beginning of the stream
read_byte = lambda bs: bs.read(1)[0]
read_dword = lambda bs: struct.unpack("<I", bs.read(4))[0]
read_utf16le = lambda bs, s: bs.read(s).decode("utf-16le")
read_utf8 = lambda bs, s: bs.read(s).decode("utf-8")
# Use the values these helpers return, rather than variables left over
# from the first parse, so this half stands on its own.
wide_flag = read_byte(bstream)
char_count = read_dword(bstream)
print(f'is_wide_string: {wide_flag}')
print(f'string_length: {char_count}')
if wide_flag:
    # the helpers take a size in bytes: two per wide character
    print(f'string: {read_utf16le(bstream, char_count * 2)}')
else:
    print(f'string: {read_utf8(bstream, char_count)}')

is_wide_string: False
string_length: 4
string: test

is_wide_string: 0
string_length: 4
string: test

Tip 7 - Using Fundamental Data Types With ctypes

For example, this is useful when re-implementing a C pseudocode algorithm in Python.

Reading the ctypes documentation is highly recommended for reverse engineers: https://docs.python.org/3/library/ctypes.html

# Deliberately wrong: a plain Python int is not limited to 32 bits, so
# arithmetic on it does not wrap the way a C DWORD does.
print("WRONG :")
DWORD = 0xFFFFFFFF
print(f"DWORD = {hex(DWORD)}")
DWORD += 1  # becomes 0x100000000 instead of wrapping to 0
print(f"DWORD+1 = {hex(DWORD)} ; WRONG! DWORD size is 4 bytes")
DWORD = 0x0
print(f"DWORD = {hex(DWORD)}")
DWORD -= 1  # becomes -1 instead of wrapping to 0xffffffff
print(f"DWORD-1 = {hex(DWORD)} ; HUM ?!", end="\n\n")

import ctypes

# Same arithmetic as the "WRONG" demo, but on a ctypes 32-bit unsigned
# integer, whose .value wraps around like the C type.
print("GOOD :")
DWORD = ctypes.c_uint32(0xFFFFFFFF)
print(f"DWORD = {hex(DWORD.value)}")
DWORD.value += 1  # wraps to 0x0
print(f"DWORD+1 = {hex(DWORD.value)} ; GOOD!")
DWORD = ctypes.c_uint32(0x0)
print(f"DWORD = {hex(DWORD.value)}")
DWORD.value -= 1  # wraps to 0xffffffff
print(f"DWORD-1 = {hex(DWORD.value)}", end="\n\n")

# CAST DATA
# Reinterpret the memory holding EAX as a 16-bit unsigned integer.
EAX = ctypes.c_uint32(0xAAAABBBB)
AX = ctypes.cast(ctypes.addressof(EAX), ctypes.POINTER(ctypes.c_uint16)) # pointer to the first two bytes of EAX
print("EAX =", hex(EAX.value))
# The recorded output shows 0xbbbb (the low word).
# NOTE(review): presumably this depends on the host being little-endian - confirm.
print("AX =", hex(AX.contents.value))

WRONG :
DWORD = 0xffffffff
DWORD+1 = 0x100000000 ; WRONG! DWORD size is 4 bytes
DWORD = 0x0
DWORD-1 = -0x1 ; HUM ?!

GOOD :
DWORD = 0xffffffff
DWORD+1 = 0x0 ; GOOD!
DWORD = 0x0
DWORD-1 = 0xffffffff

EAX = 0xaaaabbbb
AX = 0xbbbb

Tip 8 - Parse Binary Data Straight To Python Structure

For example, we parse the UNICODE_STRING C structure (from the Windows API):

typedef struct _UNICODE_STRING {
  USHORT Length;
  USHORT MaximumLength;
  PWSTR  Buffer;
} UNICODE_STRING, *PUNICODE_STRING;

from ctypes import Structure
from ctypes import wintypes
import io

class UNICODE_STRING(Structure):
    """ctypes layout used by this example to deserialize a UNICODE_STRING.

    Unlike the WinAPI declaration, where Buffer is a PWSTR pointer, Buffer
    is declared here as an inline fixed-size WCHAR array, so the
    characters are expected to follow the two USHORT fields directly.
    """
    _fields_ = [
        ('Length', wintypes.USHORT),
        ('MaximumLength', wintypes.USHORT),
        ('Buffer', wintypes.WCHAR*1024) # Fixed capacity of 1024 WCHARs; the original author warns that longer input crashes
        # To keep the example concise, the Buffer length is not managed dynamically
    ]

# Raw UNICODE_STRING bytes as extracted from memory or a raw file, written
# one literal per field (adjacent literals are joined by Python).
data = (
    b'\x0F\x00'                                            # Length = 15
    b'\x11\x00'                                            # MaximumLength = 17
    b'\x48\x00\x65\x00\x6c\x00\x6c\x00\x6f\x00\x20\x00'    # "Hello "
    b'\x74\x00\x68\x00\x65\x00\x20\x00'                    # "the "
    b'\x77\x00\x6f\x00\x72\x00\x6c\x00\x64\x00'            # "world"
    b'\x00\x00\x00\x00'                                    # trailing NUL bytes
)
stream = io.BytesIO(data)

# readinto() copies the stream bytes straight into the structure's memory
# (binary -> struct deserialization).
unicode_stru = UNICODE_STRING()
stream.readinto(unicode_stru)
print(f"L:{unicode_stru.Length} Max:{unicode_stru.MaximumLength} Str:{unicode_stru.Buffer}")

L:15 Max:17 Str:Hello the world

Tip 9 - Call C Function from DLL

For example, we want to analyze the data returned by GetLogicalDriveStringsW.

DWORD GetLogicalDriveStringsW(
  DWORD  nBufferLength,
  LPWSTR lpBuffer
);

from ctypes import wintypes, windll, byref

# First call with no buffer: the API reports the buffer size it needs,
# counted in WCHARs. None is passed to C as a NULL pointer.
length = windll.kernel32.GetLogicalDriveStringsW(0, None)
# Allocate a byte array (2 bytes per WCHAR) so the raw bytes stay inspectable.
data_string = (wintypes.CHAR*(length*2))()
windll.kernel32.GetLogicalDriveStringsW(length, byref(data_string)) # call api
print("raw: " + str(data_string.raw))
# Decode the whole buffer once and split on the NUL *character*.
# Splitting the UTF-16LE bytes on b'\x00\x00' can match across two
# neighbouring code units and also yields empty trailing entries.
for drive in data_string.raw.decode("utf-16le").split('\x00'):
    if drive:  # skip the empty pieces produced by the terminating NULs
        print(drive)

raw: b'C\x00:\x00\\\x00\x00\x00E\x00:\x00\\\x00\x00\x00F\x00:\x00\\\x00\x00\x00\x00\x00'
C:\
E:\
F:\