# champ-chatbot / tests / file_factory.py
# (Hugging Face page header: uploaded by qyle, commit f80f41e "deployment", verified)
import os
import struct
import zlib
import zipfile
import io
from pathlib import Path
# Directory of static test fixtures, resolved relative to this module.
# NOTE(review): not referenced by the factories below — presumably consumed
# by sibling test modules; confirm before removing.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
def create_empty_txt(filename: str):
    """Create (or truncate) *filename* as a zero-byte text file."""
    with open(filename, "w") as handle:
        handle.write("")
# ============================================================================
# Simple files
# ============================================================================
def create_simple_txt(filename: str):
    """Write a small single-line plain-text file."""
    with open(filename, "w") as handle:
        handle.write("Hello, this is a simple text file.")
def create_simple_pdf(filename: str):
    """Write a minimal single-page PDF skeleton (catalog, pages, one page).

    NOTE(review): the xref byte offsets (9/58/115) and the startxref value
    (190) look hand-written — lenient readers rebuild the table, but confirm
    against whatever parser these fixtures target.
    """
    content = b"""%PDF-1.4
1 0 obj << /Type /Catalog /Pages 2 0 R >> endobj
2 0 obj << /Type /Pages /Kids [3 0 R] /Count 1 >> endobj
3 0 obj << /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >> endobj
xref
0 4
0000000000 65535 f
0000000009 00000 n
0000000058 00000 n
0000000115 00000 n
trailer << /Size 4 /Root 1 0 R >>
startxref
190
%%EOF"""
    Path(filename).write_bytes(content)
def create_simple_docx(filename: str):
    """Write a minimal valid DOCX (OPC zip) containing one paragraph of text.

    The archive holds only the three parts most parsers require:
    ``[Content_Types].xml``, the package relationships, and
    ``word/document.xml``.

    Fix: removed the redundant function-local ``import zipfile`` /
    ``import io`` that shadowed the module-level imports at the top of the
    file.
    """
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        z.writestr(
            "word/document.xml",
            '<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Hello, this is a simple docx file.</w:t></w:r></w:p></w:body></w:document>',
        )
    Path(filename).write_bytes(buffer.getvalue())
def create_simple_png(filename: str):
    """Write a valid 1x1 8-bit RGB PNG (one red pixel)."""

    def _chunk(tag: bytes, payload: bytes) -> bytes:
        # Chunk layout: big-endian length, 4-byte tag, payload, CRC32(tag+payload).
        crc = struct.pack(">I", zlib.crc32(tag + payload))
        return struct.pack(">I", len(payload)) + tag + payload + crc

    header = _chunk(b"IHDR", struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0))
    # One scanline: filter byte 0x00 followed by a single RGB pixel.
    pixels = _chunk(b"IDAT", zlib.compress(b"\x00\xff\x00\x00"))
    trailer = _chunk(b"IEND", b"")
    with open(filename, "wb") as handle:
        handle.write(b"\x89PNG\r\n\x1a\n" + header + pixels + trailer)
def create_simple_jpeg(filename: str):
    """Write a tiny hand-assembled JPEG stream (SOI/JFIF/DQT/SOF0/DHT/SOS/EOI)."""
    segments = [
        "FFD8 FFE0 0010 4A46 4946 0001 0100 0001 0001 0000",  # SOI + JFIF APP0
        "FFDB 0043 0008 0606 0706 0508 0707 0709 0908 0A0C",  # DQT (quantization)
        "140D 0C0B 0B0C 1912 130F 141D 1A1F 1E1D 1A1C 1C20",
        "242E 2720 222C 231C 1C28 3729 2C30 3134 3434 1F27",
        "393D 3832 3C2E 3334 32",
        "FFC0 000B 0800 0100 0101 0111 00",  # SOF0 (start of frame)
        "FFC4 001F 0000 0001 0501 0101 0101 0101 0000 0000",  # DHT (Huffman)
        "0000 0000 0102 0304 0506 0708 090A 0B",
        "FFC4 00B5 1000 0201 0303 0204 0305 0504 0400 0001 7D",
        "FFDA 0008 0101 0000 3F00 FBD3",  # SOS (start of scan) + scan data
        "FFD9",  # EOI (end of image)
    ]
    payload = b"".join(bytes.fromhex(segment) for segment in segments)
    Path(filename).write_bytes(payload)
# ============================================================================
# DOCX Security Tests
# ============================================================================
def create_deeply_nested_docx(filename: str):
    """Stress a DOCX parser with a huge number of XML nodes.

    Despite the (historical) name, the XML is wide rather than deep: the
    document part is kept to ~9MB — below a 10MB-style size threshold — but
    contains millions of ``<w:r>/<w:t>`` elements, so node-count limits are
    exercised rather than raw size checks.
    """
    # One minimal text run is 23 bytes of XML.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10k runs per paragraph keeps the number of paragraph objects manageable.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded paragraph size to hit the byte target precisely.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Target size: just below the max threshold so that it passes the size checks.
    # It could still be a dangerous file since its number of nodes would be very high.
    target_size = 9 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Build the document XML
    document_xml = (
        f'<?xml version="1.0"?>'
        f'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        f"</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_deeply_nested_docx_bomb(filename: str):
    """Parser stress test: DOCX whose document part is ~105MB of XML runs.

    Like :func:`create_deeply_nested_docx` but deliberately OVER a 100MB
    limit, to verify that size checks on the decompressed part fire.
    The XML is wide (millions of sibling runs), not deeply nested.
    """
    # One minimal text run is 23 bytes of XML.
    text_run = "<w:r><w:t>A</w:t></w:r>"
    # 10k runs per paragraph keeps the number of paragraph objects manageable.
    runs_per_paragraph = 10000
    paragraph_template = "<w:p>" + text_run * runs_per_paragraph + "</w:p>"
    # Measure the real encoded paragraph size to hit the byte target precisely.
    paragraph_size = len(paragraph_template.encode("utf-8"))
    # Target size: 105MB to ensure we're over 100MB after compression
    target_size = 105 * 1024 * 1024
    num_paragraphs = (target_size // paragraph_size) + 1
    # Build the document XML
    document_xml = (
        f'<?xml version="1.0"?>'
        f'<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
        f"<w:body>{paragraph_template * num_paragraphs}</w:body>"
        f"</w:document>"
    )
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
            '<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
            '<Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
            "</Types>",
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            '<Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/>'
            "</Relationships>",
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_zip_bomb_docx(filename: str):
    """Test zip bomb protection - highly compressed repetitive content.

    NOTE(review): this materializes the full 100MB string in memory before
    compression; fine for a test fixture, but expect a spike in RSS.
    """
    # Highly compressible content - expands significantly when decompressed
    # This is NOT a real zip bomb, just a moderately compressed file to test your safe_unzip_check
    repetitive_content = (
        "A" * 100_000_000
    )  # 100MB of repetitive data, compresses very well
    document_xml = f'<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>{repetitive_content}</w:t></w:r></w:p></w:body></w:document>'
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w", compression=zipfile.ZIP_DEFLATED) as z:
        z.writestr(
            "[Content_Types].xml",
            '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        )
        z.writestr(
            "_rels/.rels",
            '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        )
        z.writestr("word/document.xml", document_xml)
    Path(filename).write_bytes(buffer.getvalue())
def create_xxe_docx(filename: str):
    """Test XXE (XML External Entity) injection protection."""
    # The DOCTYPE declares an external entity pointing at a local file; a
    # hardened XML parser must refuse to resolve it.
    document_xml = """<?xml version="1.0"?>
<!DOCTYPE foo [<!ENTITY xxe SYSTEM "file:///etc/passwd">]>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>&xxe;</w:t></w:r></w:p></w:body>
</w:document>"""
    # Package parts in the order they must appear in the archive.
    parts = {
        "[Content_Types].xml": '<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"><Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/><Override PartName="/word/document.xml" ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/></Types>',
        "_rels/.rels": '<?xml version="1.0"?><Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"><Relationship Id="rId1" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" Target="word/document.xml"/></Relationships>',
        "word/document.xml": document_xml,
    }
    buffer = io.BytesIO()
    with zipfile.ZipFile(buffer, "w") as archive:
        for arcname, payload in parts.items():
            archive.writestr(arcname, payload)
    Path(filename).write_bytes(buffer.getvalue())
# ============================================================================
# PDF Security Tests
# ============================================================================
def create_malformed_pdf(filename: str):
    """Write a structurally broken PDF.

    The objects themselves look plausible, but junk padding is injected
    before the xref table and the xref offsets / startxref value are
    deliberately wrong, so strict cross-reference parsing must fail.
    """
    text_content = (
        b"BT /F1 12 Tf 100 700 Td (This is a test of your PDF sanitizer.) Tj ET"
    )
    pieces = [
        b"%PDF-1.7\n",  # version header
        b"%\xff\xff\xff\xff\n",  # binary-detection marker line
        b"1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n",
        b"2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n",
        b"3 0 obj\n<< /Length %d >>\nstream\n" % len(text_content),
        text_content,
        b"\nendstream\nendobj\n",
        b"STUFFING" * 100,  # gibberish padding that breaks the xref math
        b"xref\n0 4\n0000000000 65535 f\n0000000015 00000 n\n",  # wrong offset
        b"trailer\n<< /Size 4 /Root 1 0 R >>\n",
        b"startxref\n999999\n%%EOF",  # startxref points at a fake location
    ]
    Path(filename).write_bytes(b"".join(pieces))
# ============================================================================
# PNG Security Tests
# ============================================================================
def chunk(name: bytes, data: bytes) -> bytes:
    """Assemble a PNG chunk: big-endian length, 4-byte tag, payload, CRC32."""
    # CRC covers the tag and the payload, never the length field.
    checksum = zlib.crc32(name + data)
    parts = (struct.pack(">I", len(data)), name, data, struct.pack(">I", checksum))
    return b"".join(parts)
def create_fake_large_png(filename: str):
    """Write a PNG whose IHDR claims 100000x100000 pixels (~40GB raw RGB)
    while shipping only a few bytes of actual IDAT data."""
    claimed_dims = struct.pack(">IIBBBBB", 100000, 100000, 8, 2, 0, 0, 0)
    body = chunk(b"IHDR", claimed_dims)
    # A single filtered "scanline" — nowhere near the declared dimensions.
    body += chunk(b"IDAT", zlib.compress(b"\x00\xff\x00\x00"))
    body += chunk(b"IEND", b"")
    with open(filename, "wb") as handle:
        handle.write(b"\x89PNG\r\n\x1a\n" + body)
def create_fake_small_png(filename: str):
    """Create PNG that's huge but claims to be small."""
    # The IHDR lies in the small direction: it declares 10x10 pixels while
    # the IDAT payload below encodes a full 10000x10000 RGB image.
    width, height = 10000, 10000  # 100 megapixels of real pixel data
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", 10, 10, 8, 2, 0, 0, 0))
    # Create the full data but it compresses well
    # NOTE(review): this builds ~300MB of raw scanlines in memory before
    # compressing — expect a large transient RSS spike when generating.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_png_decompression_bomb(filename: str):
    """Create PNG that's small but decompresses to huge size."""
    # IHDR honestly declares 20000x20000, and the IDAT really contains that
    # much data — ~1.2GB of raw scanlines that deflate to a small file.
    width, height = 20000, 20000
    ihdr = chunk(b"IHDR", struct.pack(">IIBBBBB", width, height, 8, 2, 0, 0, 0))
    # Create the full data but it compresses well
    # NOTE(review): this materializes ~1.2GB in memory and compresses it at
    # level 9 — generating this fixture is slow and memory-hungry by design.
    raw_data = (b"\x00" + b"\xff\xff\xff" * width) * height
    compressed = zlib.compress(raw_data, level=9)
    idat = chunk(b"IDAT", compressed)
    iend = chunk(b"IEND", b"")
    final = b"\x89PNG\r\n\x1a\n" + ihdr + idat + iend
    Path(filename).write_bytes(final)
def create_malformed_png(filename: str):
    """Write a PNG with invalid structure: a correct signature and IHDR
    chunk followed by 900 bytes of non-chunk garbage."""
    signature = b"\x89PNG\r\n\x1a\n"
    # Valid-looking 1x1 8-bit RGB IHDR with a correct CRC.
    header_fields = struct.pack(">IIBBBBB", 1, 1, 8, 2, 0, 0, 0)
    header_crc = struct.pack(">I", zlib.crc32(b"IHDR" + header_fields))
    ihdr = struct.pack(">I", 13) + b"IHDR" + header_fields + header_crc
    # Everything after the header is total gibberish, not chunk-framed.
    garbage = b"GIBBERISH" * 100
    with open(filename, "wb") as handle:
        handle.write(signature + ihdr + garbage)
# ============================================================================
# JPEG Security Tests
# ============================================================================
def create_fake_large_jpeg(filename: str):
    """Write a JPEG whose SOF0 claims 65535x65535 pixels.

    65535 x 65535 is the maximum baseline JPEG frame size; decoded as RGB it
    would be roughly 12GB, yet the file carries only ~100 bytes of scan data.
    """
    soi = bytes.fromhex("FFD8")
    # DQT segment: marker, length 0x0043, table id, then 64 dummy 0x08 entries.
    dqt = bytes.fromhex("FFDB 0043 00") + bytes([0x08] * 64)
    # SOF0: 8-bit precision, height/width 0xFFFF, three components (YCbCr).
    sof0 = bytes.fromhex("FFC0 0011 08 FFFF FFFF 03 01 11 00 02 11 01 03 11 01")
    sos = bytes.fromhex("FFDA 0008 01 01 00 00 3F 00")
    # A token amount of scan data so the file isn't immediately truncated.
    scan = b"\x00" * 100
    Path(filename).write_bytes(soi + dqt + sof0 + sos + scan + b"\xff\xd9")
def create_malformed_jpeg(filename: str):
    """Write a broken JPEG fragment: SOI then an APP0 marker whose 2-byte
    length field is zero, which cannot even cover the field itself."""
    Path(filename).write_bytes(b"\xff\xd8\xff\xe0\x00\x00")
def create_jpeg_with_excessive_markers(filename: str):
    """Write a ~9MB JPEG stuffed with COM (comment) markers before a tiny
    valid skeleton, to stress marker-scanning loops in parsers.

    Fix: the original appended the constant 5-byte marker in a Python-level
    loop (1,887,000 iterations); ``marker * count`` builds the identical
    payload in a single C-level allocation.
    """
    content = bytearray(b"\xff\xd8")  # SOI
    # 1. Metadata "stuffing": COM marker (FFFE), declared length 3 => one
    #    payload byte ("A"). Repeated ~1.9M times for ~9MB of markers.
    marker = bytes([0xFF, 0xFE, 0x00, 0x03, 0x41])
    content += marker * 1_887_000
    # 2. Tiny 1x1 valid JPEG skeleton at the end so parsers don't give up early.
    skeleton = bytes.fromhex(
        "FFDB 0043 00"  # DQT marker
        "08080808080808080808080808080808"  # quantization table (64 x 0x08)
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "08080808080808080808080808080808"
        "FFC0 0011 08 0001 0001"  # SOF0 (1x1 pixels)
        "03 01 11 00 02 11 01 03 11 01"
        "FFDA 0008 01 01 00 00 3F 00"  # SOS (start of scan)
        "003F FF00"  # dummy scan data
    )
    content += skeleton
    content += b"\xff\xd9"  # EOI
    Path(filename).write_bytes(content)
def create_jpeg_pixel_bomb(filename: str):
    """
    Valid JPEG under 10MB that expands to ~12GB in RAM if naively decoded.

    The SOF0 frame header claims 65535x65535 pixels
    (65535 * 65535 * 3 bytes RGB ~= 12GB) while the scan section holds 9MB
    of highly compressible null bytes.
    """
    pieces = [
        bytes.fromhex("FFD8"),  # SOI
        bytes.fromhex("FFDB 0043 00"),  # DQT header
        bytes([0x08] * 64),  # DQT table data
        bytes.fromhex("FFC0 0011 08 FFFF FFFF"),  # SOF0: height/width 65535
        bytes.fromhex("03 01 11 00 02 11 01 03 11 01"),
        bytes.fromhex("FFDA 0008 01 01 00 00 3F 00"),  # SOS
        # 9MB of null scan data — hits size limits without looking truncated.
        b"\x00" * (9 * 1024 * 1024),
        b"\xff\xd9",  # EOI
    ]
    Path(filename).write_bytes(b"".join(pieces))
def create_jpeg_cpu_scan_bomb(filename: str):
    """
    A ~9MB JPEG containing 800,000 repeated SOS (Start of Scan) markers,
    forcing redundant decoding passes in parsers that honor each one.

    Fix: the original appended the constant 11-byte marker in a Python-level
    loop (800,000 iterations); ``sos_marker * count`` builds the identical
    payload in a single C-level allocation.
    """
    # Fixed preamble the repeated scans hang off of.
    preamble = (
        bytes.fromhex("FFD8")  # SOI
        + bytes.fromhex("FFDB 0043 00")  # DQT header
        + bytes([0x08] * 64)  # DQT table data
        + bytes.fromhex("FFC0 0011 08 0100 0100")  # SOF0 (256x256)
        + bytes.fromhex("03 01 11 00 02 11 01 03 11 01")
    )
    # One SOS marker plus a single byte of scan data.
    sos_marker = bytes(
        [0xFF, 0xDA, 0x00, 0x08, 0x01, 0x01, 0x00, 0x00, 0x3F, 0x00, 0x00]
    )
    Path(filename).write_bytes(preamble + sos_marker * 800000 + b"\xff\xd9")