import io
import os
import struct
import zipfile
import zlib
from pathlib import Path

# NOTE(review): not referenced by any function in this chunk -- presumably
# consumed by callers elsewhere in the project.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")


def create_empty_txt(filename: str):
    """Write a zero-byte text file."""
    Path(filename).write_text("")


# ============================================================================
# Simple files
# ============================================================================


def create_simple_txt(filename: str):
    """Write a minimal, well-formed plain-text file."""
    Path(filename).write_text("Hello, this is a simple text file.")


def create_simple_pdf(filename: str):
    """Write a minimal single-page PDF (catalog -> pages -> one empty page)."""
    content = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
190
%%EOF"""
    Path(filename).write_bytes(content)


def create_simple_docx(filename: str):
    """Write a minimal DOCX (a ZIP archive of OOXML parts).

    NOTE(review): the XML part payloads appear to have been stripped of their
    angle-bracketed markup in this copy of the file (they are empty strings);
    the intended [Content_Types].xml / .rels / document markup should be
    restored before relying on this fixture.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        z.writestr("[Content_Types].xml", "")
        z.writestr("_rels/.rels", "")
        z.writestr("word/document.xml", "Hello, this is a simple docx file.")
    Path(filename).write_bytes(buffer.getvalue())


def chunk(name: bytes, data: bytes) -> bytes:
    """Build one PNG chunk: 4-byte length, type, payload, CRC-32 of type+payload.

    Shared by every PNG builder below.
    """
    crc = zlib.crc32(name + data)
    return struct.pack(">I", len(data)) + name + data + struct.pack(">I", crc)


def create_simple_png(filename: str):
    """Write a valid 1x1 8-bit RGB PNG."""
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0))
    # One scanline: filter byte 0x00 followed by a single red RGB pixel.
    raw = b"\x00\xff\x00\x00"
    idat = chunk(b"IDAT", zlib.compress(raw))
    iend = chunk(b"IEND", b"")
    Path(filename).write_bytes(b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend)


def create_simple_jpeg(filename: str):
    """Write a tiny but structurally complete baseline JPEG."""
    jpeg_bytes = bytes.fromhex(
        "FFD8 FFE0 0010 4A46 4946 0001 0100 0001 0001 0000 "  # SOI & JFIF
        "FFDB 0043 0008 0606 0706 0508 0707 0709 0908 0A0C "  # DQT (Quantization)
        "140D 0C0B 0B0C 1912 130F 141D 1A1F 1E1D 1A1C 1C20 "
        "242E 2720 222C 231C 1C28 3729 2C30 3134 3434 1F27 "
        "393D 3832 3C2E 3334 32"
        "FFC0 000B 0800 0100 0101 0111 00"  # SOF0 (Start of Frame)
        "FFC4 001F 0000 0001 0501 0101 0101 0101 0000 0000 "  # DHT (Huffman Tables)
        "0000 0000 0102 0304 0506 0708 090A 0B"
        "FFC4 00B5 1000 0201 0303 0204 0305 0504 0400 0001 7D"
        "FFDA 0008 0101 0000 3F00 FBD3"  # SOS (Start of Scan)
        "FFD9"  # EOI (End of Image)
    )
    Path(filename).write_bytes(jpeg_bytes)


# ============================================================================
# DOCX Security Tests
# ============================================================================


def _write_repetitive_docx(filename: str, target_size: int):
    """Write a DOCX whose document.xml repeats a 10k-char paragraph until the
    uncompressed part is roughly *target_size* bytes.

    Shared implementation for create_deeply_nested_docx and
    create_deeply_nested_docx_bomb, which differ only in target size.

    NOTE(review): the OOXML markup in the payload strings appears to have been
    stripped in this copy of the file (paragraph/document/content-type tags
    are empty strings), so the generated part is currently plain repeated 'A'
    characters; restore the markup before relying on this fixture.
    """
    text_run = "A"
    runs_per_paragraph = 10000
    # Paragraph body: markup stripped in this copy -- effectively "A" * 10000.
    paragraph_template = "" + text_run * runs_per_paragraph + ""
    paragraph_size = len(paragraph_template.encode("utf-8"))
    num_paragraphs = (target_size // paragraph_size) + 1

    # Document wrapper markup also stripped; only the repeated body remains.
    document_xml = paragraph_template * num_paragraphs

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr("[Content_Types].xml", "")
        z.writestr("_rels/.rels", "")
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())


def create_deeply_nested_docx(filename: str):
    """Parser-stress DOCX sized just BELOW a ~10MB threshold.

    Passes naive size checks while still being dangerous: the node count of
    the (intended) markup would be very high.
    """
    _write_repetitive_docx(filename, target_size=9 * 1024 * 1024)


def create_deeply_nested_docx_bomb(filename: str):
    """Parser-stress DOCX that decompresses to 100MB+ (105MB target)."""
    # 105MB ensures we're over 100MB after compression/extraction.
    _write_repetitive_docx(filename, target_size=105 * 1024 * 1024)


def create_zip_bomb_docx(filename: str):
    """Test zip bomb protection - highly compressed repetitive content."""
    # Highly compressible content - expands significantly when decompressed.
    # This is NOT a real zip bomb, just a moderately compressed file to test
    # a safe_unzip check.
    repetitive_content = (
        "A" * 100_000_000
    )  # 100MB of repetitive data, compresses very well
    # NOTE(review): surrounding document markup appears stripped in this copy.
    document_xml = f"{repetitive_content}"

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr("[Content_Types].xml", "")
        z.writestr("_rels/.rels", "")
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())


def create_xxe_docx(filename: str):
    """Test XXE (XML External Entity) injection protection.

    Attempts to read a local file via XXE - the parser under test should
    ignore this.

    NOTE(review): the DOCTYPE / entity-declaration markup appears stripped in
    this copy of the file; only the closing bracket and the entity reference
    survive in the payload below.
    """
    document_xml = """ ]> &xxe; """

    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        z.writestr("[Content_Types].xml", "")
        z.writestr("_rels/.rels", "")
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())


# ============================================================================
# PDF Security Tests
# ============================================================================


def create_malformed_pdf(filename: str):
    """PDF with deliberately broken xref offsets, padding, and fake startxref."""
    header = b"%PDF-1.7\n"
    binary_marker = b"%\xff\xff\xff\xff\n"

    # 3 0 obj is the stream containing the actual page text.
    text_content = (
        b"BT /F1 12 Tf 100 700 Td (This is a test of your PDF sanitizer.) Tj ET"
    )
    body = (
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\n"
        b"endobj\n"
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n"
        b"endobj\n"
        b"3 0 obj\n<< /Length " + str(len(text_content)).encode() + b" >>\n"
        b"stream\n" + text_content + b"\nendstream\n"
        b"endobj\n"
    )
    # Intentional 'Gibberish' padding to break the XREF math.
    padding = b"STUFFING" * 100
    footer = (
        b"xref\n"
        b"0 4\n"
        b"0000000000 65535 f\n"
        b"0000000015 00000 n\n"  # Intentional wrong offset
        b"trailer\n"
        b"<< /Size 4 /Root 1 0 R >>\n"
        b"startxref\n"
        b"999999\n"  # Points to a completely fake location
        b"%%EOF"
    )
    Path(filename).write_bytes(header + binary_marker + body + padding + footer)


# ============================================================================
# PNG Security Tests
# ============================================================================


def create_fake_large_png(filename: str):
    """Create a PNG that claims huge dimensions."""
    # Claim to be 100000x100000 pixels (would be ~40GB uncompressed),
    # while carrying only a single pixel's worth of data.
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 100000, 100000, 8, 2, 0, 0, 0))
    raw = b"\x00\xff\x00\x00"
    idat = chunk(b"IDAT", zlib.compress(raw))
    iend = chunk(b"IEND", b"")
    Path(filename).write_bytes(b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend)


def create_fake_small_png(filename: str):
    """Create PNG that's huge but claims to be small."""
    width, height = 10000, 10000  # actual pixel data: 100 megapixels
    # Header lies the other way: IHDR claims a tiny 10x10 image.
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 10, 10, 8, 2, 0, 0, 0))
    # Full 10000x10000 white-pixel data; compresses extremely well.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    Path(filename).write_bytes(b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend)


def create_png_decompression_bomb(filename: str):
    """Create PNG that's small but decompresses to huge size."""
    # Claim HUGE dimensions -- and actually carry matching pixel data.
    width, height = 20000, 20000
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0))
    # Full scanline data; highly repetitive, so it compresses very well.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    Path(filename).write_bytes(b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend)


def create_malformed_png(filename: str):
    """Create PNG with invalid structure."""
    # 1. Signature
    sig = b"\x89PNG\r\n\x1a\n"
    # 2. A valid-looking IHDR chunk:
    #    Length (13), Name (IHDR), Width/Height, then CRC.
    ihdr_data = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
    ihdr_name = b"IHDR"
    crc = struct.pack(">I", zlib.crc32(ihdr_name + ihdr_data))
    ihdr_chunk = struct.pack(">I", 13) + ihdr_name + ihdr_data + crc
    # 3. Followed by total gibberish instead of IDAT/IEND chunks.
    content = sig + ihdr_chunk + b"GIBBERISH" * 100
    Path(filename).write_bytes(content)


# ============================================================================
# JPEG Security Tests
# ============================================================================


def create_fake_large_jpeg(filename: str):
    """Create JPEG that claims huge dimensions."""
    # Claims 65535x65535 (max allowed by standard JPEG) -- ~4.3 gigapixels.
    # In RGB this would be ~12GB of RAM: a decompression-bomb header.
    header = bytes.fromhex(
        "FFD8"  # SOI (Start of Image)
        "FFDB 0043 00"  # DQT Marker + Length
        "08080808080808080808080808080808"  # Quantization Table (Dummy 8s)
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "FFC0 0011 08 FFFF FFFF"  # SOF0: 8-bit, 65535x65535 px
        "03 01 11 00 02 11 01 03 11 01"  # Components (YCrCb)
        "FFDA 0008 01 01 00 00 3F 00"  # SOS (Start of Scan)
    )
    # A tiny bit of compressed "black" data so it's not immediately truncated.
    pixel_data = b"\x00" * 100
    footer = b"\xff\xd9"  # EOI
    Path(filename).write_bytes(header + pixel_data + footer)


def create_malformed_jpeg(filename: str):
    """Create JPEG with invalid structure."""
    # JPEG markers without proper data.
    jpeg_bytes = bytes([0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x00])
    Path(filename).write_bytes(jpeg_bytes)


def create_jpeg_with_excessive_markers(filename: str):
    """JPEG stuffed with ~9MB of comment markers before a valid 1x1 skeleton."""
    content = bytearray([0xFF, 0xD8])  # SOI
    # 1. Metadata 'stuffing': one 5-byte COM marker repeated ~1.9M times.
    #    Built with a single bytes multiplication instead of a Python-level
    #    loop of extend() calls -- identical bytes, far faster.
    marker = bytes([0xFF, 0xFE, 0x00, 0x03, 0x41])
    content += marker * 1_887_000  # adds up to ~9MB
    # 2. A tiny 1x1 valid JPEG skeleton at the end so parsers don't give up
    #    early.
    skeleton = bytes.fromhex(
        "FFDB 0043 00"  # DQT Marker
        "08080808080808080808080808080808"  # Quantization Table (64 bytes of 0x08)
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "FFC0 0011 08 0001 0001"  # SOF0 (1x1 pixels)
        "03 01 11 00 02 11 01 03 11 01"
        "FFDA 0008 01 01 00 00 3F 00"  # SOS (Start of Scan)
        "003F FF00"  # Dummy Scan Data
    )
    content += skeleton
    content += bytes([0xFF, 0xD9])  # EOI
    Path(filename).write_bytes(content)


def create_jpeg_pixel_bomb(filename: str):
    """
    Valid JPEG under 10MB that expands to ~12GB in RAM.
    Uses a minimal scan to satisfy the parser while claiming 64k dimensions.
    """
    # Claims 65535x65535 pixels:
    # 65535 * 65535 * 3 (RGB) = 12,884,508,675 bytes (~12 GB).
    header = (
        bytes.fromhex("FFD8")  # SOI
        + bytes.fromhex("FFDB 0043 00")  # DQT Header
        + bytes([0x08] * 64)  # DQT Table Data
        + bytes.fromhex("FFC0 0011 08 FFFF FFFF")  # SOF0 (Height/Width 65535)
        + bytes.fromhex("03 01 11 00 02 11 01 03 11 01")
        + bytes.fromhex("FFDA 0008 01 01 00 00 3F 00")  # SOS
    )
    # Fill with 9MB of highly compressible null data to hit the size limit
    # and ensure the file is not 'truncated'.
    pixel_data = b"\x00" * (9 * 1024 * 1024)
    footer = b"\xff\xd9"  # EOI
    Path(filename).write_bytes(header + pixel_data + footer)


def create_jpeg_cpu_scan_bomb(filename: str):
    """
    A 9MB JPEG containing thousands of SOS (Start of Scan) markers.
    Forces the CPU to perform redundant decoding passes.
    """
    # Base frame designed to precede the repeating SOS markers.
    content = bytearray(
        bytes.fromhex("FFD8")  # SOI
        + bytes.fromhex("FFDB 0043 00")  # DQT Header
        + bytes([0x08] * 64)  # DQT Table Data
        + bytes.fromhex("FFC0 0011 08 0100 0100")  # SOF (256x256)
        + bytes.fromhex("03 01 11 00 02 11 01 03 11 01")
    )
    # ~9MB of SOS markers; each one forces a new decoding pass in some
    # parsers.  One-shot multiplication instead of an 800k-iteration loop.
    sos_marker = bytes(
        [0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00]
    )
    content += sos_marker * 800000
    content += bytes([0xFF, 0xD9])  # EOI
    Path(filename).write_bytes(content)