# champ-chatbot / tests / file_factory.py
# (Hugging Face page header: uploaded by qyle, commit f80f41e "deployment", verified)
import os
import struct
import zlib
import zipfile
import io
from pathlib import Path
# Directory of static test fixtures, resolved relative to this module.
# NOTE(review): not referenced by the factories below — presumably consumed
# by sibling test modules; confirm before removing.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def create_empty_txt(filename: str):
    """Create (or truncate) *filename* as a zero-byte text file."""
    with open(filename, "w") as handle:
        handle.write("")
# ============================================================================
# Simple files
# ============================================================================
def create_simple_txt(filename: str):
    """Write a small single-line plain-text file."""
    with open(filename, "w") as handle:
        handle.write("Hello, this is a simple text file.")
def create_simple_pdf(filename: str):
    """Write a minimal single-page PDF skeleton (catalog, pages, one page).

    NOTE(review): the xref byte offsets (9/58/115) and the startxref value
    (190) look hand-written — lenient readers rebuild the table, but confirm
    against whatever parser these fixtures target.
    """
    content = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
190
%%EOF"""
    Path(filename).write_bytes(content)
def create_simple_docx(filename: str):
    """Write a minimal valid DOCX (OPC zip) containing one paragraph of text.

    The archive holds only the three parts most parsers require:
    ``[Content_Types].xml``, the package relationships, and
    ``word/document.xml``.

    Fix: removed the redundant function-local ``import zipfile`` /
    ``import io`` that shadowed the module-level imports at the top of the
    file.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        z.writestr(
            "word/document.xml",
            '<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Hello, this is a simple docx file.</w:t></w:r></w:p></w:body></w:document>',
        )
    Path(filename).write_bytes(buffer.getvalue())
def create_simple_png(filename: str):
    """Write a valid 1x1 8-bit RGB PNG (one red pixel)."""

    def _chunk(tag: bytes, payload: bytes) -> bytes:
        # Chunk layout: big-endian length, 4-byte tag, payload, CRC32(tag+payload).
        crc = struct.pack(">I", zlib.crc32(tag + payload))
        return struct.pack(">I", len(payload)) + tag + payload + crc

    header = _chunk(b"IHDR", struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0))
    # One scanline: filter byte 0x00 followed by a single RGB pixel.
    pixels = _chunk(b"IDAT", zlib.compress(b"\x00\xff\x00\x00"))
    trailer = _chunk(b"IEND", b"")
    with open(filename, "wb") as handle:
        handle.write(b"\x89PNG\r\n\x1a\n" + header + pixels + trailer)
def create_simple_jpeg(filename: str):
    """Write a tiny hand-assembled JPEG stream (SOI/JFIF/DQT/SOF0/DHT/SOS/EOI)."""
    segments = [
        "FFD8 FFE0 0010 4A46 4946 0001 0100 0001 0001 0000",  # SOI + JFIF APP0
        "FFDB 0043 0008 0606 0706 0508 0707 0709 0908 0A0C",  # DQT (quantization)
        "140D 0C0B 0B0C 1912 130F 141D 1A1F 1E1D 1A1C 1C20",
        "242E 2720 222C 231C 1C28 3729 2C30 3134 3434 1F27",
        "393D 3832 3C2E 3334 32",
        "FFC0 000B 0800 0100 0101 0111 00",  # SOF0 (start of frame)
        "FFC4 001F 0000 0001 0501 0101 0101 0101 0000 0000",  # DHT (Huffman)
        "0000 0000 0102 0304 0506 0708 090A 0B",
        "FFC4 00B5 1000 0201 0303 0204 0305 0504 0400 0001 7D",
        "FFDA 0008 0101 0000 3F00 FBD3",  # SOS (start of scan) + scan data
        "FFD9",  # EOI (end of image)
    ]
    payload = b"".join(bytes.fromhex(segment) for segment in segments)
    Path(filename).write_bytes(payload)
# ============================================================================
# DOCX Security Tests
# ============================================================================
def create_deeply_nested_docx(filename: str):
    """Stress a DOCX parser with a huge number of XML nodes.

    Despite the (historical) name, the XML is wide rather than deep: the
    document part is kept to ~9MB — below a 10MB-style size threshold — but
    contains millions of ``<w:r>/<w:t>`` elements, so node-count limits are
    exercised rather than raw size checks.
    """
    # One minimal text run is 23 bytes of XML.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10k runs per paragraph keeps the number of paragraph objects manageable.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded paragraph size to hit the byte target precisely.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Target size: just below the max threshold so that it passes the size checks.
    # It could still be a dangerous file since its number of nodes would be very high.
    target_size = 9 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Build the document XML
    document_xml = (
        f'<?xml version="1.0"?>'
        f'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        f"</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_deeply_nested_docx_bomb(filename: str):
    """Parser stress test: DOCX whose document part is ~105MB of XML runs.

    Like :func:`create_deeply_nested_docx` but deliberately OVER a 100MB
    limit, to verify that size checks on the decompressed part fire.
    The XML is wide (millions of sibling runs), not deeply nested.
    """
    # One minimal text run is 23 bytes of XML.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10k runs per paragraph keeps the number of paragraph objects manageable.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded paragraph size to hit the byte target precisely.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Target size: 105MB to ensure we're over 100MB after compression
    target_size = 105 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Build the document XML
    document_xml = (
        f'<?xml version="1.0"?>'
        f'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        f"</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_zip_bomb_docx(filename: str):
    """Test zip bomb protection - highly compressed repetitive content.

    NOTE(review): this materializes the full 100MB string in memory before
    compression; fine for a test fixture, but expect a spike in RSS.
    """
    # Highly compressible content - expands significantly when decompressed
    # This is NOT a real zip bomb, just a moderately compressed file to test your safe_unzip_check
    repetitive_content = (
        "A" * 100_000_000
    )  # 100MB of repetitive data, compresses very well
    document_xml = f'<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>{repetitive_content}</w:t></w:r></w:p></w:body></w:document>'
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_xxe_docx(filename: str):
    """Test XXE (XML External Entity) injection protection."""
    # The DOCTYPE declares an external entity pointing at a local file; a
    # hardened XML parser must refuse to resolve it.
    document_xml = """<?xml version="1.0"?>
<!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>&xxe;</w:t></w:r></w:p></w:body>
</w:document>"""
    # Package parts in the order they must appear in the archive.
    parts = {
        "[Content_Types].xml": '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        "_rels/.rels": '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        "word/document.xml": document_xml,
    }
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        for arcname, payload in parts.items():
            archive.writestr(arcname, payload)
    Path(filename).write_bytes(buffer.getvalue())
# ============================================================================
# PDF Security Tests
# ============================================================================
def create_malformed_pdf(filename: str):
    """Write a structurally broken PDF.

    The objects themselves look plausible, but junk padding is injected
    before the xref table and the xref offsets / startxref value are
    deliberately wrong, so strict cross-reference parsing must fail.
    """
    text_content = (
        b"BT /F1 12 Tf 100 700 Td (This is a test of your PDF sanitizer.) Tj ET"
    )
    pieces = [
        b"%PDF-1.7\n",  # version header
        b"%\xff\xff\xff\xff\n",  # binary-detection marker line
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
        b"3 0 obj\n<< /Length %d >>\nstream\n" % len(text_content),
        text_content,
        b"\nendstream\nendobj\n",
        b"STUFFING" * 100,  # gibberish padding that breaks the xref math
        b"xref\n0 4\n0000000000 65535 f\n0000000015 00000 n\n",  # wrong offset
        b"trailer\n<< /Size 4 /Root 1 0 R >>\n",
        b"startxref\n999999\n%%EOF",  # startxref points at a fake location
    ]
    Path(filename).write_bytes(b"".join(pieces))
# ============================================================================
# PNG Security Tests
# ============================================================================
def chunk(name: bytes, data: bytes) -> bytes:
    """Assemble a PNG chunk: big-endian length, 4-byte tag, payload, CRC32."""
    # CRC covers the tag and the payload, never the length field.
    checksum = zlib.crc32(name + data)
    parts = (struct.pack(">I", len(data)), name, data, struct.pack(">I", checksum))
    return b"".join(parts)
def create_fake_large_png(filename: str):
    """Write a PNG whose IHDR claims 100000x100000 pixels (~40GB raw RGB)
    while shipping only a few bytes of actual IDAT data."""
    claimed_dims = struct.pack(">IIBBBBB", 100000, 100000, 8, 2, 0, 0, 0)
    body = chunk(b"IHDR", claimed_dims)
    # A single filtered "scanline" — nowhere near the declared dimensions.
    body += chunk(b"IDAT", zlib.compress(b"\x00\xff\x00\x00"))
    body += chunk(b"IEND", b"")
    with open(filename, "wb") as handle:
        handle.write(b"\x89PNG\r\n\x1a\n" + body)
def create_fake_small_png(filename: str):
    """Create PNG that's huge but claims to be small."""
    # The IHDR lies in the small direction: it declares 10x10 pixels while
    # the IDAT payload below encodes a full 10000x10000 RGB image.
    width, height = 10000, 10000  # 100 megapixels of real pixel data
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 10, 10, 8, 2, 0, 0, 0))
    # Create the full data but it compresses well
    # NOTE(review): this builds ~300MB of raw scanlines in memory before
    # compressing — expect a large transient RSS spike when generating.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_png_decompression_bomb(filename: str):
    """Create PNG that's small but decompresses to huge size."""
    # IHDR honestly declares 20000x20000, and the IDAT really contains that
    # much data — ~1.2GB of raw scanlines that deflate to a small file.
    width, height = 20000, 20000
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0))
    # Create the full data but it compresses well
    # NOTE(review): this materializes ~1.2GB in memory and compresses it at
    # level 9 — generating this fixture is slow and memory-hungry by design.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_malformed_png(filename: str):
    """Write a PNG with invalid structure: a correct signature and IHDR
    chunk followed by 900 bytes of non-chunk garbage."""
    signature = b"\x89PNG\r\n\x1a\n"
    # Valid-looking 1x1 8-bit RGB IHDR with a correct CRC.
    header_fields = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
    header_crc = struct.pack(">I", zlib.crc32(b"IHDR" + header_fields))
    ihdr = struct.pack(">I", 13) + b"IHDR" + header_fields + header_crc
    # Everything after the header is total gibberish, not chunk-framed.
    garbage = b"GIBBERISH" * 100
    with open(filename, "wb") as handle:
        handle.write(signature + ihdr + garbage)
# ============================================================================
# JPEG Security Tests
# ============================================================================
def create_fake_large_jpeg(filename: str):
    """Write a JPEG whose SOF0 claims 65535x65535 pixels.

    65535 x 65535 is the maximum baseline JPEG frame size; decoded as RGB it
    would be roughly 12GB, yet the file carries only ~100 bytes of scan data.
    """
    soi = bytes.fromhex("FFD8")
    # DQT segment: marker, length 0x0043, table id, then 64 dummy 0x08 entries.
    dqt = bytes.fromhex("FFDB 0043 00") + bytes([0x08] * 64)
    # SOF0: 8-bit precision, height/width 0xFFFF, three components (YCbCr).
    sof0 = bytes.fromhex("FFC0 0011 08 FFFF FFFF 03 01 11 00 02 11 01 03 11 01")
    sos = bytes.fromhex("FFDA 0008 01 01 00 00 3F 00")
    # A token amount of scan data so the file isn't immediately truncated.
    scan = b"\x00" * 100
    Path(filename).write_bytes(soi + dqt + sof0 + sos + scan + b"\xff\xd9")
def create_malformed_jpeg(filename: str):
    """Write a broken JPEG fragment: SOI then an APP0 marker whose 2-byte
    length field is zero, which cannot even cover the field itself."""
    Path(filename).write_bytes(b"\xff\xd8\xff\xe0\x00\x00")
def create_jpeg_with_excessive_markers(filename: str):
    """Write a ~9MB JPEG stuffed with COM (comment) markers before a tiny
    valid skeleton, to stress marker-scanning loops in parsers.

    Fix: the original appended the constant 5-byte marker in a Python-level
    loop (1,887,000 iterations); ``marker * count`` builds the identical
    payload in a single C-level allocation.
    """
    content = bytearray(b"\xff\xd8")  # SOI
    # 1. Metadata "stuffing": COM marker (FFFE), declared length 3 => one
    #    payload byte ("A"). Repeated ~1.9M times for ~9MB of markers.
    marker = bytes([0xFF, 0xFE, 0x00, 0x03, 0x41])
    content += marker * 1_887_000
    # 2. Tiny 1x1 valid JPEG skeleton at the end so parsers don't give up early.
    skeleton = bytes.fromhex(
        "FFDB 0043 00"  # DQT marker
        "08080808080808080808080808080808"  # quantization table (64 x 0x08)
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "FFC0 0011 08 0001 0001"  # SOF0 (1x1 pixels)
        "03 01 11 00 02 11 01 03 11 01"
        "FFDA 0008 01 01 00 00 3F 00"  # SOS (start of scan)
        "003F FF00"  # dummy scan data
    )
    content += skeleton
    content += b"\xff\xd9"  # EOI
    Path(filename).write_bytes(content)
def create_jpeg_pixel_bomb(filename: str):
    """
    Valid JPEG under 10MB that expands to ~12GB in RAM if naively decoded.

    The SOF0 frame header claims 65535x65535 pixels
    (65535 * 65535 * 3 bytes RGB ~= 12GB) while the scan section holds 9MB
    of highly compressible null bytes.
    """
    pieces = [
        bytes.fromhex("FFD8"),  # SOI
        bytes.fromhex("FFDB 0043 00"),  # DQT header
        bytes([0x08] * 64),  # DQT table data
        bytes.fromhex("FFC0 0011 08 FFFF FFFF"),  # SOF0: height/width 65535
        bytes.fromhex("03 01 11 00 02 11 01 03 11 01"),
        bytes.fromhex("FFDA 0008 01 01 00 00 3F 00"),  # SOS
        # 9MB of null scan data — hits size limits without looking truncated.
        b"\x00" * (9 * 1024 * 1024),
        b"\xff\xd9",  # EOI
    ]
    Path(filename).write_bytes(b"".join(pieces))
def create_jpeg_cpu_scan_bomb(filename: str):
    """
    A ~9MB JPEG containing 800,000 repeated SOS (Start of Scan) markers,
    forcing redundant decoding passes in parsers that honor each one.

    Fix: the original appended the constant 11-byte marker in a Python-level
    loop (800,000 iterations); ``sos_marker * count`` builds the identical
    payload in a single C-level allocation.
    """
    # Fixed preamble the repeated scans hang off of.
    preamble = (
        bytes.fromhex("FFD8")  # SOI
        + bytes.fromhex("FFDB 0043 00")  # DQT header
        + bytes([0x08] * 64)  # DQT table data
        + bytes.fromhex("FFC0 0011 08 0100 0100")  # SOF0 (256x256)
        + bytes.fromhex("03 01 11 00 02 11 01 03 11 01")
    )
    # One SOS marker plus a single byte of scan data.
    sos_marker = bytes(
        [0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00]
    )
    Path(filename).write_bytes(preamble + sos_marker * 800000 + b"\xff\xd9")