Spaces:
Paused
Paused
import os
import struct
import zlib
import zipfile
import io
from pathlib import Path

# Default output directory for the generated fixture files: a "data" folder
# located next to this module.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def create_empty_txt(filename: str):
    """Write a zero-byte text file at *filename*."""
    with open(filename, "w") as handle:
        handle.write("")
| # ============================================================================ | |
| # Simple files | |
| # ============================================================================ | |
def create_simple_txt(filename: str):
    """Write a short plain-text sentence to *filename*."""
    message = "Hello, this is a simple text file."
    with open(filename, "w") as handle:
        handle.write(message)
def create_simple_pdf(filename: str):
    """Write a minimal single-page PDF 1.4 document to *filename*."""
    # Kept byte-for-byte: a catalog, a page tree with one empty page, an
    # xref table and trailer — the smallest structure most readers accept.
    pdf_bytes = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
190
%%EOF"""
    Path(filename).write_bytes(pdf_bytes)
def create_simple_docx(filename: str):
    """Write a minimal but valid DOCX (OOXML zip package) to *filename*.

    The package contains the three mandatory parts: the content-types
    manifest, the package relationships, and a one-paragraph document.
    """
    # NOTE: the redundant function-local `import zipfile` / `import io`
    # were removed; both modules are already imported at module level.
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        # Content-types manifest: declares the MIME type of the main part.
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        # Package-level relationship pointing at the main document part.
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        # Document body: a single paragraph holding one text run.
        z.writestr(
            "word/document.xml",
            '<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Hello, this is a simple docx file.</w:t></w:r></w:p></w:body></w:document>',
        )
    Path(filename).write_bytes(buffer.getvalue())
def create_simple_png(filename: str):
    """Write a tiny valid 1x1 RGB PNG to *filename*."""

    def _chunk(tag: bytes, payload: bytes) -> bytes:
        # PNG chunk layout: 4-byte BE length, tag, payload, CRC32(tag+payload).
        crc = zlib.crc32(tag + payload)
        return struct.pack(">I", len(payload)) + tag + payload + struct.pack(">I", crc)

    header = _chunk(b"IHDR", struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0))
    scanline = b"\x00\xff\x00\x00"  # filter byte 0 + one pixel
    body = _chunk(b"IDAT", zlib.compress(scanline))
    trailer = _chunk(b"IEND", b"")
    Path(filename).write_bytes(b"\x89PNG\r\n\x1a\n" + header + body + trailer)
def create_simple_jpeg(filename: str):
    """Write a minimal valid baseline JPEG to *filename*."""
    # Hex payload is preserved exactly; bytes.fromhex ignores whitespace.
    segments = [
        "FFD8 FFE0 0010 4A46 4946 0001 0100 0001 0001 0000 ",  # SOI + JFIF APP0
        "FFDB 0043 0008 0606 0706 0508 0707 0709 0908 0A0C ",  # DQT (quantization)
        "140D 0C0B 0B0C 1912 130F 141D 1A1F 1E1D 1A1C 1C20 ",
        "242E 2720 222C 231C 1C28 3729 2C30 3134 3434 1F27 ",
        "393D 3832 3C2E 3334 32",
        "FFC0 000B 0800 0100 0101 0111 00",  # SOF0 (start of frame)
        "FFC4 001F 0000 0001 0501 0101 0101 0101 0000 0000 ",  # DHT (Huffman)
        "0000 0000 0102 0304 0506 0708 090A 0B",
        "FFC4 00B5 1000 0201 0303 0204 0305 0504 0400 0001 7D",
        "FFDA 0008 0101 0000 3F00 FBD3",  # SOS (start of scan)
        "FFD9",  # EOI (end of image)
    ]
    Path(filename).write_bytes(bytes.fromhex("".join(segments)))
| # ============================================================================ | |
| # DOCX Security Tests | |
| # ============================================================================ | |
def create_deeply_nested_docx(filename: str):
    """Parser-stress DOCX: ~9MB of XML packed with a very high node count.

    The old docstring claimed a "100MB+ file", but ``target_size`` is
    deliberately 9MiB — just below a 10MB threshold — so the file passes
    size-based checks while the sheer number of elements can still stress
    a DOM-style XML parser.
    """
    # One text run is ~23 bytes of XML markup.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10,000 runs per paragraph keeps paragraph count small while the
    # element count explodes.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded size of one paragraph to size the document.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Just below the max threshold so the file passes size checks; it can
    # still be a dangerous file since its node count is very high.
    target_size = 9 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Assemble the main document part.
    document_xml = (
        '<?xml version="1.0"?>'
        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        "</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_deeply_nested_docx_bomb(filename: str):
    """Parser-stress DOCX whose main part decompresses to 100MB+ of XML."""
    # One text run is ~23 bytes of XML markup.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10,000 runs per paragraph keeps paragraph count small while the
    # element count explodes.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded size of one paragraph to size the document.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Target size: 105MB so the decompressed part is safely over 100MB.
    target_size = 105 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Assemble the oversized main document part.
    document_xml = (
        f'<?xml version="1.0"?>'
        f'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        f"</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_zip_bomb_docx(filename: str):
    """Test zip bomb protection - highly compressed repetitive content."""
    # 100MB of a single repeated character: deflates to a tiny archive but
    # inflates back to 100MB+.  This is NOT a real zip bomb, just a
    # moderately compressed file to exercise a safe-unzip size check.
    repetitive_content = (
        "A" * 100_000_000
    )  # 100MB of repetitive data, compresses very well
    document_xml = f'<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>{repetitive_content}</w:t></w:r></w:p></w:body></w:document>'
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_xxe_docx(filename: str):
    """Write a DOCX whose document part carries an XXE payload.

    The DOCTYPE declares an external entity pointing at /etc/passwd; a
    hardened XML parser must refuse to resolve it.
    """
    document_xml = """<?xml version="1.0"?>
<!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>&xxe;</w:t></w:r></w:p></w:body>
</w:document>"""
    # Mandatory OOXML parts, written in the same order as before
    # (dict preserves insertion order).
    parts = {
        "[Content_Types].xml": '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        "_rels/.rels": '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        "word/document.xml": document_xml,
    }
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        for part_name, payload in parts.items():
            z.writestr(part_name, payload)
    Path(filename).write_bytes(buffer.getvalue())
| # ============================================================================ | |
| # PDF Security Tests | |
| # ============================================================================ | |
def create_malformed_pdf(filename: str):
    """Write a deliberately broken PDF.

    The file opens with valid-looking objects, then inserts padding that
    invalidates every xref offset and a startxref pointing nowhere.
    """
    stream_text = (
        b"BT /F1 12 Tf 100 700 Td (This is a test of your PDF sanitizer.) Tj ET"
    )
    pieces = [
        b"%PDF-1.7\n",
        b"%\xff\xff\xff\xff\n",  # binary-comment marker
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\n",
        b"endobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\n",
        b"endobj\n",
        # 3 0 obj holds the content stream with the actual text.
        b"3 0 obj\n<< /Length " + str(len(stream_text)).encode() + b" >>\n",
        b"stream\n" + stream_text + b"\nendstream\n",
        b"endobj\n",
        b"STUFFING" * 100,  # gibberish padding that breaks the xref math
        b"xref\n",
        b"0 4\n",
        b"0000000000 65535 f\n",
        b"0000000015 00000 n\n",  # deliberately wrong offset
        b"trailer\n",
        b"<< /Size 4 /Root 1 0 R >>\n",
        b"startxref\n",
        b"999999\n",  # points at a completely fake location
        b"%%EOF",
    ]
    Path(filename).write_bytes(b"".join(pieces))
| # ============================================================================ | |
| # PNG Security Tests | |
| # ============================================================================ | |
def chunk(name: bytes, data: bytes) -> bytes:
    """Serialize one PNG chunk: BE length, tag, payload, CRC32(tag+payload)."""
    length = struct.pack(">I", len(data))
    crc = struct.pack(">I", zlib.crc32(name + data))
    return length + name + data + crc
def create_fake_large_png(filename: str):
    """Write a tiny PNG whose IHDR claims 100000x100000 pixels.

    The header advertises roughly 40GB of uncompressed pixel data while
    the IDAT payload is only a few bytes — a header/content mismatch test.
    """
    signature = b"\x89PNG\r\n\x1a\n"
    header = chunk(b"IHDR", struct.pack(">IIBBBBB", 100000, 100000, 8, 2, 0, 0, 0))
    payload = chunk(b"IDAT", zlib.compress(b"\x00\xff\x00\x00"))
    trailer = chunk(b"IEND", b"")
    Path(filename).write_bytes(signature + header + payload + trailer)
def create_fake_small_png(filename: str):
    """Create PNG that's huge but claims to be small."""
    # IHDR claims a tiny 10x10 image, while the IDAT below actually encodes
    # 10000x10000 worth of scanlines — a header/content mismatch.
    width, height = 10000, 10000  # 100 megapixels
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 10, 10, 8, 2, 0, 0, 0))
    # Full-size pixel data (filter byte + white RGB pixels per row); very
    # repetitive, so it compresses to a comparatively small IDAT.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_png_decompression_bomb(filename: str):
    """Create PNG that's small but decompresses to huge size."""
    # 20000x20000 RGB is ~1.2GB of raw scanline data once inflated.
    width, height = 20000, 20000
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0))
    # Highly repetitive scanlines (filter byte + white pixels) compress to a
    # file that is tiny relative to its decompressed size.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_malformed_png(filename: str):
    """Write a PNG with a valid signature and IHDR followed by garbage."""
    signature = b"\x89PNG\r\n\x1a\n"
    # Well-formed 1x1 IHDR chunk so the file initially parses:
    # length (13), tag, payload, then CRC over tag+payload.
    ihdr_payload = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
    ihdr_crc = struct.pack(">I", zlib.crc32(b"IHDR" + ihdr_payload))
    ihdr_chunk = struct.pack(">I", 13) + b"IHDR" + ihdr_payload + ihdr_crc
    # Everything after the header is non-chunk garbage.
    garbage = b"GIBBERISH" * 100
    Path(filename).write_bytes(signature + ihdr_chunk + garbage)
| # ============================================================================ | |
| # JPEG Security Tests | |
| # ============================================================================ | |
def create_fake_large_jpeg(filename: str):
    """Write a small JPEG whose SOF0 claims 65535x65535 pixels.

    65535x65535 is the maximum baseline JPEG allows (~4.3 gigapixels);
    decoded as RGB that would be roughly 12GB of RAM, while the file
    itself is only a few hundred bytes.
    """
    header = bytes.fromhex(
        "FFD8"  # SOI (start of image)
        "FFDB 0043 00"  # DQT marker + length
        "08080808080808080808080808080808"  # dummy quantization table (8s)
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "FFC0 0011 08 FFFF FFFF"  # SOF0: 8-bit, 65535x65535 px
        "03 01 11 00 02 11 01 03 11 01"  # components (YCrCb)
        "FFDA 0008 01 01 00 00 3F 00"  # SOS (start of scan)
    )
    # A little compressed "black" scan data so decoders do not flag the
    # file as immediately truncated.
    scan_data = b"\x00" * 100
    Path(filename).write_bytes(header + scan_data + b"\xff\xd9")
def create_malformed_jpeg(filename: str):
    """Write a JPEG that stops right after a broken APP0 marker."""
    # SOI, then an APP0 marker whose declared segment length is 0 and
    # which carries no data at all.
    truncated = b"\xff\xd8\xff\xe0\x00\x00"
    Path(filename).write_bytes(truncated)
| def create_jpeg_with_excessive_markers(filename: str): | |
| # Use bytearray for performance; += on 'bytes' is O(n^2) | |
| content = bytearray([0xFF, 0xD8]) # SOI | |
| # 1. Add the metadata 'stuffing' | |
| marker = bytes([0xFF, 0xFE, 0x00, 0x03, 0x41]) | |
| for _ in range(1_887_000): # Adds up to 9MB | |
| content.extend(marker) | |
| # 2. Add a tiny 1x1 valid JPEG skeleton at the end | |
| # This ensures the parser doesn't give up early | |
| skeleton = bytes.fromhex( | |
| "FFDB 0043 00" # DQT Marker | |
| "08080808080808080808080808080808" # Quantization Table (64 bytes of 0x08) | |
| "08080808080808080808080808080808" | |
| "08080808080808080808080808080808" | |
| "08080808080808080808080808080808" | |
| "FFC0 0011 08 0001 0001" # SOF0 (1x1 pixels) | |
| "03 01 11 00 02 11 01 03 11 01" | |
| "FFDA 0008 01 01 00 00 3F 00" # SOS (Start of Scan) | |
| "003F FF00" # Dummy Scan Data | |
| ) | |
| content.extend(skeleton) | |
| content.extend([0xFF, 0xD9]) # EOI | |
| Path(filename).write_bytes(content) | |
| def create_jpeg_pixel_bomb(filename: str): | |
| """ | |
| Valid JPEG under 10MB that expands to ~12GB in RAM. | |
| Uses a 1x1 minimal scan to satisfy the parser while claiming 64k dimensions. | |
| """ | |
| # Claims 65535x65535 pixels | |
| # 65535 * 65535 * 3 (RGB) = 12,884,508,675 bytes (~12 GB) | |
| # Pixel Bomb Header: Claims 65535x65535 resolution | |
| header = ( | |
| bytes.fromhex("FFD8") # SOI | |
| + bytes.fromhex("FFDB 0043 00") # DQT Header | |
| + bytes([0x08] * 64) # DQT Table Data | |
| + bytes.fromhex("FFC0 0011 08 FFFF FFFF") # SOF0 (Height/Width 65535) | |
| + bytes.fromhex("03 01 11 00 02 11 01 03 11 01") | |
| + bytes.fromhex("FFDA 0008 01 01 00 00 3F 00") # SOS | |
| ) | |
| # Fill with 9MB of highly compressible null data to hit your size limit | |
| # and ensure the file is not 'truncated' | |
| pixel_data = b"\x00" * (9 * 1024 * 1024) | |
| footer = b"\xff\xd9" # EOI | |
| Path(filename).write_bytes(header + pixel_data + footer) | |
def create_jpeg_cpu_scan_bomb(filename: str):
    """
    A ~9MB JPEG containing hundreds of thousands of SOS markers.

    Each Start-of-Scan marker can force another decoding pass in naive
    parsers, burning CPU time out of proportion to the file size.
    """
    preamble = b"".join(
        (
            bytes.fromhex("FFD8"),  # SOI
            bytes.fromhex("FFDB 0043 00"),  # DQT header
            bytes([0x08] * 64),  # DQT table data
            bytes.fromhex("FFC0 0011 08 0100 0100"),  # SOF (256x256)
            bytes.fromhex("03 01 11 00 02 11 01 03 11 01"),
        )
    )
    sos_marker = bytes(
        [0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00]
    )
    payload = sos_marker * 800000  # fills the file up to ~9MB
    Path(filename).write_bytes(preamble + payload + b"\xff\xd9")