"""
📦 Document Parser - Production-Grade ZIP Document Extraction Tool

Features:
- Upload ZIP files and parse all supported document formats
- Supports 40+ text/code formats, PDF, DOCX, XLSX
- Zip bomb protection (decompression ratio + size limits)
- Per-file error isolation: one corrupt file won't crash the whole parse
- Progress bars for real-time feedback
- Concurrency-limited to prevent resource exhaustion
- Full structured JSON export + file detail drill-down
"""
from __future__ import annotations

import io
import logging
import os
import traceback
import zipfile
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional

import gradio as gr

# ──────────────────────────────────────────────────────────────────────────────
# Configuration constants
# ──────────────────────────────────────────────────────────────────────────────
MAX_ZIP_SIZE_MB = 200
MAX_FILES_IN_ZIP = 500
MAX_SINGLE_FILE_MB = 50
MAX_DECOMPRESSION_RATIO = 100  # zip bomb guard: reject if total > ratio × compressed
MAX_PREVIEW_CHARS = 5_000
MAX_FULL_TEXT_CHARS = 500_000
MAX_XLSX_ROWS = 100
CONCURRENCY_LIMIT = 3

# ──────────────────────────────────────────────────────────────────────────────
# Logging
# ──────────────────────────────────────────────────────────────────────────────
logger = logging.getLogger("document_parser")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)

# ──────────────────────────────────────────────────────────────────────────────
# File classification
# ──────────────────────────────────────────────────────────────────────────────
class FileCategory(str, Enum):
    TEXT = "text"
    PDF = "pdf"
    DOCX = "docx"
    XLSX = "xlsx"
    IMAGE = "image"
    BINARY = "binary"

TEXT_EXTENSIONS = frozenset({
    ".txt", ".md", ".rst", ".py", ".js", ".ts", ".jsx", ".tsx", ".html",
    ".htm", ".css", ".scss", ".less", ".json", ".jsonl", ".yaml", ".yml",
    ".csv", ".tsv", ".xml", ".toml", ".cfg", ".ini", ".conf", ".properties",
    ".sh", ".bash", ".zsh", ".fish", ".bat", ".ps1", ".cmd",
    ".r", ".rmd", ".java", ".c", ".cpp", ".h", ".hpp", ".cc", ".cxx",
    ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".kts", ".scala", ".clj",
    ".sql", ".graphql", ".gql", ".proto", ".thrift",
    ".dockerfile", ".makefile", ".cmake",
    ".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
    ".env", ".env.example", ".log", ".tex", ".bib", ".sty",
    ".lua", ".vim", ".el", ".lisp", ".hs", ".ml", ".mli", ".ex", ".exs",
    ".erl", ".hrl", ".dart", ".v", ".sv", ".vhd", ".vhdl",
    ".tf", ".tfvars", ".hcl", ".nix", ".dhall",
    ".ipynb",
})

KNOWN_TEXT_FILENAMES = frozenset({
    "Makefile", "Dockerfile", "Procfile", "Vagrantfile", "Gemfile",
    "Rakefile", "Brewfile", "Justfile", "Taskfile",
    ".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
    ".eslintrc", ".prettierrc", ".babelrc", ".browserslistrc",
    "LICENSE", "LICENCE", "COPYING", "AUTHORS", "CONTRIBUTORS",
    "CHANGELOG", "CHANGES", "HISTORY", "NEWS",
    "README", "INSTALL", "TODO", "HACKING",
    "requirements.txt",
})

IMAGE_EXTENSIONS = frozenset({
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico",
    ".tiff", ".tif", ".avif", ".heic", ".heif",
})

CATEGORY_EMOJI = {
    FileCategory.TEXT: "📄",
    FileCategory.PDF: "📕",
    FileCategory.DOCX: "📘",
    FileCategory.XLSX: "📊",
    FileCategory.IMAGE: "🖼️",
    FileCategory.BINARY: "📦",
}

def classify_file(filename: str) -> tuple[FileCategory, str]:
    """Classify a file by its extension and known filename patterns."""
    basename = filename.rsplit("/", 1)[-1] if "/" in filename else filename
    ext = os.path.splitext(basename)[1].lower()
    # Case-insensitive match also covers exact matches of known extensionless names.
    if not ext and basename.upper() in {n.upper() for n in KNOWN_TEXT_FILENAMES}:
        return FileCategory.TEXT, ""
    if ext in TEXT_EXTENSIONS:
        return FileCategory.TEXT, ext
    if ext == ".pdf":
        return FileCategory.PDF, ext
    if ext == ".docx":
        return FileCategory.DOCX, ext
    if ext in {".xlsx", ".xls"}:
        return FileCategory.XLSX, ext
    if ext in IMAGE_EXTENSIONS:
        return FileCategory.IMAGE, ext
    return FileCategory.BINARY, ext
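
# Illustrative classifications (a sketch; the paths are hypothetical):
#   classify_file("src/app.py")      -> (FileCategory.TEXT, ".py")
#   classify_file("docs/report.pdf") -> (FileCategory.PDF, ".pdf")
#   classify_file("Makefile")        -> (FileCategory.TEXT, "")
#   classify_file("photo.JPG")       -> (FileCategory.IMAGE, ".jpg")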

# ──────────────────────────────────────────────────────────────────────────────
# Data classes
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class ParsedFile:
    filename: str
    category: str
    extension: str
    size_bytes: int
    size_display: str
    content: str = ""
    preview: str = ""
    error: Optional[str] = None
    warnings: list[str] = field(default_factory=list)

    def to_table_row(self) -> list:
        status = "⚠️" if self.warnings else ("❌" if self.error else "✅")
        preview_text = self.error or self.preview[:200].replace("\n", " ")
        return [
            status,
            self.filename,
            self.extension or "(none)",
            self.category,
            self.size_display,
            preview_text,
        ]


@dataclass
class ParseStats:
    total_files: int = 0
    parsed_ok: int = 0
    parse_warnings: int = 0
    parse_errors: int = 0
    skipped_dirs: int = 0
    total_compressed_bytes: int = 0
    total_uncompressed_bytes: int = 0
    by_category: dict = field(default_factory=lambda: {c.value: 0 for c in FileCategory})

# ──────────────────────────────────────────────────────────────────────────────
# Size formatting
# ──────────────────────────────────────────────────────────────────────────────
def format_size(size_bytes: int) -> str:
    if size_bytes < 0:
        return "0 B"
    if size_bytes < 1024:
        return f"{size_bytes} B"
    elif size_bytes < 1024 ** 2:
        return f"{size_bytes / 1024:.1f} KB"
    elif size_bytes < 1024 ** 3:
        return f"{size_bytes / (1024 ** 2):.1f} MB"
    else:
        return f"{size_bytes / (1024 ** 3):.2f} GB"

# ──────────────────────────────────────────────────────────────────────────────
# Document parsers - each returns (content, warnings) or raises
# ──────────────────────────────────────────────────────────────────────────────
def parse_text_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse plain text / code files."""
    warnings = []
    try:
        content = data.decode("utf-8")
    except UnicodeDecodeError:
        try:
            # latin-1 maps all 256 byte values, so this decode cannot fail;
            # the errors="replace" branch below is kept as a final safety net.
            content = data.decode("latin-1")
            warnings.append("Decoded with latin-1 fallback (not valid UTF-8)")
        except Exception:
            content = data.decode("utf-8", errors="replace")
            warnings.append("Contains invalid bytes; replaced with placeholders")
    if len(content) > MAX_FULL_TEXT_CHARS:
        warnings.append(f"Content truncated to {MAX_FULL_TEXT_CHARS:,} characters (original: {len(content):,})")
        content = content[:MAX_FULL_TEXT_CHARS] + "\n\n... [TRUNCATED]"
    return content, warnings
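
# Illustrative behaviour (the byte strings are hypothetical):
#   parse_text_content(b"hello", "a.txt")   -> ("hello", [])
#   parse_text_content(b"caf\xe9", "b.txt") -> ("café", ["Decoded with latin-1 fallback (not valid UTF-8)"])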

def parse_pdf_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse PDF bytes to text using PyMuPDF."""
    warnings = []
    try:
        import fitz  # PyMuPDF
    except ImportError:
        return "[PDF library not available]", ["PyMuPDF not installed; install with: pip install PyMuPDF"]
    doc = None
    try:
        doc = fitz.open(stream=data, filetype="pdf")
        if doc.is_encrypted:
            return "", ["PDF is password-protected and cannot be parsed"]
        page_count = len(doc)
        if page_count == 0:
            return "", ["PDF has 0 pages"]
        text_parts = []
        empty_pages = 0
        for page_num in range(page_count):
            try:
                page = doc[page_num]
                page_text = page.get_text().strip()
                if page_text:
                    text_parts.append(f"\n--- Page {page_num + 1}/{page_count} ---\n{page_text}")
                else:
                    empty_pages += 1
            except Exception as e:
                warnings.append(f"Page {page_num + 1} failed: {type(e).__name__}: {e}")
        if empty_pages > 0:
            warnings.append(f"{empty_pages}/{page_count} pages had no extractable text (may be scanned/image-based)")
        content = "\n".join(text_parts) if text_parts else "[No extractable text found]"
        if not text_parts and empty_pages == page_count:
            warnings.append("PDF appears to be entirely image-based; OCR would be needed to extract text")
        return content, warnings
    except Exception as e:
        logger.error(f"PDF parse error for {filename}: {e}")
        return "", [f"PDF parse failed: {type(e).__name__}: {e}"]
    finally:
        if doc:
            try:
                doc.close()
            except Exception:
                pass

def parse_docx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse DOCX bytes to text."""
    warnings = []
    try:
        from docx import Document
    except ImportError:
        return "[DOCX library not available]", ["python-docx not installed"]
    try:
        doc = Document(io.BytesIO(data))
        parts = []
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        if paragraphs:
            parts.extend(paragraphs)
        for i, table in enumerate(doc.tables):
            try:
                table_text = f"\n--- Table {i + 1} ---\n"
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    table_text += " | ".join(cells) + "\n"
                parts.append(table_text)
            except Exception as e:
                warnings.append(f"Table {i + 1} extraction failed: {e}")
        content = "\n".join(parts) if parts else "[DOCX: empty document]"
        if not parts:
            warnings.append("Document contains no paragraphs or tables")
        return content, warnings
    except Exception as e:
        logger.error(f"DOCX parse error for {filename}: {e}")
        return "", [f"DOCX parse failed: {type(e).__name__}: {e}"]

def parse_xlsx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse XLSX bytes to text summary."""
    warnings = []
    try:
        import openpyxl
    except ImportError:
        return "[XLSX library not available]", ["openpyxl not installed"]
    wb = None
    try:
        wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
        parts = []
        for sheet_name in wb.sheetnames:
            try:
                ws = wb[sheet_name]
                sheet_text = f"\n--- Sheet: {sheet_name} ---\n"
                row_count = 0
                for row in ws.iter_rows(values_only=True):
                    if row_count >= MAX_XLSX_ROWS:
                        sheet_text += f"\n... (truncated at {MAX_XLSX_ROWS} rows)\n"
                        warnings.append(f"Sheet '{sheet_name}' truncated at {MAX_XLSX_ROWS} rows")
                        break
                    cells = [str(cell) if cell is not None else "" for cell in row]
                    sheet_text += " | ".join(cells) + "\n"
                    row_count += 1
                if row_count == 0:
                    sheet_text += "(empty sheet)\n"
                parts.append(sheet_text)
            except Exception as e:
                warnings.append(f"Sheet '{sheet_name}' failed: {type(e).__name__}: {e}")
        content = "\n".join(parts) if parts else "[XLSX: empty workbook]"
        return content, warnings
    except Exception as e:
        logger.error(f"XLSX parse error for {filename}: {e}")
        return "", [f"XLSX parse failed: {type(e).__name__}: {e}"]
    finally:
        if wb:
            try:
                wb.close()
            except Exception:
                pass

# ──────────────────────────────────────────────────────────────────────────────
# Validation layer
# ──────────────────────────────────────────────────────────────────────────────
def validate_upload(file_path: str | None) -> str:
    """Validate the uploaded file. Returns the resolved file path. Raises gr.Error on failure."""
    if file_path is None:
        raise gr.Error("⚠️ Please upload a ZIP file first.")
    if not os.path.isfile(file_path):
        raise gr.Error("❌ Upload failed - file not found on server. Please try again.")
    file_size = os.path.getsize(file_path)
    if file_size == 0:
        raise gr.Error("❌ The uploaded file is empty (0 bytes).")
    size_mb = file_size / (1024 ** 2)
    if size_mb > MAX_ZIP_SIZE_MB:
        raise gr.Error(
            f"❌ File too large: {size_mb:.1f} MB. "
            f"Maximum allowed is {MAX_ZIP_SIZE_MB} MB."
        )
    if not zipfile.is_zipfile(file_path):
        raise gr.Error(
            "❌ Not a valid ZIP archive. The file may be corrupted, "
            "or it may be a different archive format (tar, rar, 7z)."
        )
    return file_path

def check_zip_bomb(zf: zipfile.ZipFile, compressed_size: int) -> list[str]:
    """Check for zip bomb indicators. Returns warnings. Raises gr.Error if malicious."""
    warnings = []
    total_uncompressed = sum(info.file_size for info in zf.infolist() if not info.is_dir())
    if compressed_size > 0:
        ratio = total_uncompressed / compressed_size
        if ratio > MAX_DECOMPRESSION_RATIO:
            raise gr.Error(
                f"🛡️ Zip bomb detected! Decompression ratio is {ratio:.0f}x "
                f"(compressed: {format_size(compressed_size)}, "
                f"uncompressed: {format_size(total_uncompressed)}). "
                f"Maximum allowed ratio is {MAX_DECOMPRESSION_RATIO}x."
            )
        if ratio > MAX_DECOMPRESSION_RATIO / 2:
            warnings.append(
                f"High decompression ratio ({ratio:.0f}x) - approaching the "
                f"{MAX_DECOMPRESSION_RATIO}x safety limit"
            )
    total_uncompressed_mb = total_uncompressed / (1024 ** 2)
    if total_uncompressed_mb > MAX_ZIP_SIZE_MB * 5:
        raise gr.Error(
            f"🛡️ Uncompressed content too large: {total_uncompressed_mb:.0f} MB. "
            f"Maximum is {MAX_ZIP_SIZE_MB * 5} MB."
        )
    return warnings
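
# Worked example (hypothetical sizes): a 1 MB upload whose entries total
# 150 MB uncompressed has a 150x ratio, above MAX_DECOMPRESSION_RATIO (100x),
# so it is rejected outright; 60 MB gives 60x, which passes but earns a
# warning because it exceeds the 50x (= 100 / 2) soft threshold.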

# ──────────────────────────────────────────────────────────────────────────────
# Core parsing engine
# ──────────────────────────────────────────────────────────────────────────────
def parse_zip(file_path: str, progress: gr.Progress) -> tuple[list[ParsedFile], ParseStats]:
    """Parse all files in a ZIP archive with per-file error isolation."""
    file_size = os.path.getsize(file_path)
    stats = ParseStats()
    try:
        zf = zipfile.ZipFile(file_path, "r")
    except zipfile.BadZipFile:
        raise gr.Error("❌ ZIP file is corrupted and cannot be opened.")
    except Exception as e:
        raise gr.Error(f"❌ Failed to open ZIP: {type(e).__name__}: {e}")
    try:
        bomb_warnings = check_zip_bomb(zf, file_size)
        entries = [info for info in zf.infolist() if not info.is_dir()]
        stats.skipped_dirs = len(zf.infolist()) - len(entries)
        stats.total_files = len(entries)
        stats.total_compressed_bytes = file_size
        if stats.total_files == 0:
            raise gr.Error("❌ ZIP archive contains no files (only directories).")
        truncated = False
        if stats.total_files > MAX_FILES_IN_ZIP:
            gr.Warning(
                f"ZIP contains {stats.total_files} files - "
                f"processing first {MAX_FILES_IN_ZIP} only."
            )
            entries = entries[:MAX_FILES_IN_ZIP]
            truncated = True
        parsed_files: list[ParsedFile] = []
        for info in progress.tqdm(entries, desc="Parsing documents"):
            category, ext = classify_file(info.filename)
            stats.by_category[category.value] += 1
            stats.total_uncompressed_bytes += info.file_size
            pf = ParsedFile(
                filename=info.filename,
                category=category.value,
                extension=ext or "(none)",
                size_bytes=info.file_size,
                size_display=format_size(info.file_size),
            )
            file_mb = info.file_size / (1024 ** 2)
            if file_mb > MAX_SINGLE_FILE_MB:
                pf.error = f"Skipped: file too large ({file_mb:.1f} MB > {MAX_SINGLE_FILE_MB} MB limit)"
                pf.warnings.append(pf.error)
                stats.parse_warnings += 1
                parsed_files.append(pf)
                continue
            try:
                raw_data = zf.read(info)
            except RuntimeError as e:
                pf.error = f"Cannot read: {e}"
                if "password" in str(e).lower():
                    pf.error = "File is password-protected"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue
            except Exception as e:
                pf.error = f"Read failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue
            try:
                if category == FileCategory.TEXT:
                    content, warnings = parse_text_content(raw_data, info.filename)
                elif category == FileCategory.PDF:
                    content, warnings = parse_pdf_content(raw_data, info.filename)
                elif category == FileCategory.DOCX:
                    content, warnings = parse_docx_content(raw_data, info.filename)
                elif category == FileCategory.XLSX:
                    content, warnings = parse_xlsx_content(raw_data, info.filename)
                elif category == FileCategory.IMAGE:
                    content = ""
                    warnings = []
                    pf.preview = f"[Image: {ext}, {pf.size_display}]"
                else:
                    content = ""
                    warnings = []
                    pf.preview = f"[Binary: {ext}, {pf.size_display}]"
                pf.content = content
                pf.preview = content[:MAX_PREVIEW_CHARS] if content else pf.preview
                pf.warnings = warnings
                if warnings:
                    stats.parse_warnings += 1
                else:
                    stats.parsed_ok += 1
            except MemoryError:
                pf.error = "Out of memory while parsing this file"
                stats.parse_errors += 1
                logger.error(f"MemoryError parsing {info.filename}")
            except Exception as e:
                pf.error = f"Parse failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                logger.error(f"Parse error for {info.filename}: {e}")
                traceback.print_exc()
            parsed_files.append(pf)
        if bomb_warnings:
            for w in bomb_warnings:
                gr.Warning(w)
        if truncated:
            stats.parse_warnings += 1
        return parsed_files, stats
    finally:
        try:
            zf.close()
        except Exception:
            pass

# ──────────────────────────────────────────────────────────────────────────────
# Output formatters
# ──────────────────────────────────────────────────────────────────────────────
def build_summary(stats: ParseStats, parsed_files: list[ParsedFile]) -> str:
    """Build a rich markdown summary."""
    alerts = []
    if stats.parse_errors > 0:
        alerts.append(f"⚠️ **{stats.parse_errors} file(s) failed to parse** - see ❌ markers in the file listing")
    if stats.parse_warnings > 0:
        alerts.append(f"ℹ️ **{stats.parse_warnings} file(s) had warnings** - see ⚠️ markers in the file listing")
    alert_block = "\n".join(alerts) + "\n\n" if alerts else ""
    error_files = [pf for pf in parsed_files if pf.error]
    error_block = ""
    if error_files:
        error_lines = []
        for pf in error_files[:10]:
            error_lines.append(f"- `{pf.filename}`: {pf.error}")
        if len(error_files) > 10:
            error_lines.append(f"- ... and {len(error_files) - 10} more")
        error_block = "\n### ❌ Failed Files\n" + "\n".join(error_lines) + "\n\n"
    return f"""## 📦 ZIP Archive Summary

{alert_block}| Metric | Value |
|--------|-------|
| **Total files** | {stats.total_files} |
| **Parsed successfully** | {stats.parsed_ok} |
| **With warnings** | {stats.parse_warnings} |
| **Failed** | {stats.parse_errors} |
| **Compressed size** | {format_size(stats.total_compressed_bytes)} |
| **Uncompressed size** | {format_size(stats.total_uncompressed_bytes)} |
| **Directories skipped** | {stats.skipped_dirs} |

### 📊 File Types

| Category | Count |
|----------|-------|
| Text/Code | {stats.by_category.get('text', 0)} |
| PDF | {stats.by_category.get('pdf', 0)} |
| DOCX | {stats.by_category.get('docx', 0)} |
| XLSX | {stats.by_category.get('xlsx', 0)} |
| Image | {stats.by_category.get('image', 0)} |
| Binary | {stats.by_category.get('binary', 0)} |
{error_block}"""

def build_full_text(parsed_files: list[ParsedFile]) -> str:
    """Build concatenated text output from all parsed files."""
    parts = []
    for pf in parsed_files:
        if pf.content:
            emoji = CATEGORY_EMOJI.get(FileCategory(pf.category), "📄")
            parts.append(
                f"\n{'=' * 70}\n"
                f"{emoji} {pf.filename}"
                f"{' ⚠️ ' + ', '.join(pf.warnings) if pf.warnings else ''}\n"
                f"{'=' * 70}\n"
                f"{pf.content}"
            )
        elif pf.error:
            parts.append(
                f"\n{'=' * 70}\n"
                f"❌ {pf.filename} - ERROR: {pf.error}\n"
                f"{'=' * 70}"
            )
    if not parts:
        return "(No text content was extracted from any file in the archive.)"
    full = "\n".join(parts)
    if len(full) > MAX_FULL_TEXT_CHARS:
        full = full[:MAX_FULL_TEXT_CHARS] + "\n\n... [OUTPUT TRUNCATED - too large to display fully]"
    return full
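
# Shape of the concatenated output for one parsed file (illustrative sketch;
# the filename is hypothetical and the rule is 70 "=" characters wide):
#   ======================================================================
#   📄 src/app.py
#   ======================================================================
#   <file content>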

def build_json(parsed_files: list[ParsedFile]) -> list[dict]:
    """Build structured JSON output."""
    output = []
    for pf in parsed_files:
        entry = {
            "filename": pf.filename,
            "category": pf.category,
            "extension": pf.extension,
            "size_bytes": pf.size_bytes,
            "size_display": pf.size_display,
            "preview": pf.preview[:1000],
            "status": "error" if pf.error else ("warning" if pf.warnings else "ok"),
        }
        if pf.error:
            entry["error"] = pf.error
        if pf.warnings:
            entry["warnings"] = pf.warnings
        output.append(entry)
    return output
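
# Shape of one exported entry (illustrative values):
#   {
#       "filename": "src/app.py",
#       "category": "text",
#       "extension": ".py",
#       "size_bytes": 1234,
#       "size_display": "1.2 KB",
#       "preview": "import io...",
#       "status": "ok",  # one of "ok" | "warning" | "error"
#   }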

def build_detail(file_data: list[dict], evt: gr.SelectData) -> str:
    """Build detail view when user clicks a table row."""
    if not file_data or not isinstance(file_data, list):
        return "ℹ️ Select a file from the **File Listing** tab to see its full preview here."
    try:
        row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    except (TypeError, IndexError):
        return "⚠️ Could not determine selected row. Please click a row in the file listing."
    if not (0 <= row_idx < len(file_data)):
        return f"⚠️ Row index {row_idx} is out of range (0-{len(file_data) - 1})."
    item = file_data[row_idx]
    header = f"## {CATEGORY_EMOJI.get(item.get('category', ''), '📄')} {item['filename']}\n"
    meta = f"**Category:** {item.get('category', 'unknown')} | **Size:** {item.get('size_display', 'unknown')}\n\n"
    sections = [header, meta]
    if item.get("error"):
        sections.append(f"### ❌ Error\n```\n{item['error']}\n```\n")
    if item.get("warnings"):
        sections.append("### ⚠️ Warnings\n" + "\n".join(f"- {w}" for w in item["warnings"]) + "\n\n")
    preview = item.get("preview", "")
    if preview and not preview.startswith("["):
        ext = item.get("extension", "").lstrip(".")
        lang_map = {
            "py": "python", "js": "javascript", "ts": "typescript",
            "json": "json", "yaml": "yaml", "yml": "yaml",
            "html": "html", "htm": "html", "css": "css",
            "sql": "sql", "sh": "bash", "bash": "bash",
            "java": "java", "c": "c", "cpp": "cpp", "go": "go",
            "rs": "rust", "rb": "ruby", "php": "php", "xml": "xml",
            "md": "markdown", "toml": "toml", "csv": "csv",
        }
        lang = lang_map.get(ext, "")
        sections.append(f"### 📄 Content Preview\n```{lang}\n{preview}\n```")
    elif preview:
        sections.append(f"### 📄 Info\n{preview}")
    else:
        sections.append("*(No content to preview for this file type.)*")
    return "\n".join(sections)

# ──────────────────────────────────────────────────────────────────────────────
# Main entry point
# ──────────────────────────────────────────────────────────────────────────────
def run_parse(file_obj, progress=gr.Progress()):
    """Top-level handler: validate -> parse -> format outputs."""
    try:
        file_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
        progress(0.0, desc="Validating upload...")
        file_path = validate_upload(file_path)
        gr.Info(f"📦 Processing ZIP file ({format_size(os.path.getsize(file_path))})...")
        parsed_files, stats = parse_zip(file_path, progress)
        progress(0.95, desc="Building output...")
        summary = build_summary(stats, parsed_files)
        table_rows = [pf.to_table_row() for pf in parsed_files]
        full_text = build_full_text(parsed_files)
        json_data = build_json(parsed_files)
        progress(1.0, desc="Done!")
        if stats.parse_errors > 0:
            gr.Warning(f"{stats.parse_errors} file(s) failed to parse. See details below.")
        elif stats.parse_warnings > 0:
            gr.Info(f"✅ Parsed {stats.parsed_ok} files with {stats.parse_warnings} warning(s).")
        else:
            gr.Info(f"✅ Successfully parsed all {stats.parsed_ok} files!")
        return summary, table_rows, full_text, json_data, json_data
    except gr.Error:
        raise
    except MemoryError:
        logger.error("MemoryError during ZIP processing")
        raise gr.Error(
            "💥 Out of memory! The ZIP file contents are too large to process. "
            "Try a smaller archive or one with fewer/smaller files."
        )
    except Exception as e:
        logger.error(f"Unexpected error: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise gr.Error(
            f"💥 An unexpected error occurred: {type(e).__name__}: {e}\n\n"
            "If this persists, please report it as a bug."
        )

# ──────────────────────────────────────────────────────────────────────────────
# Gradio UI
# ──────────────────────────────────────────────────────────────────────────────
with gr.Blocks(
    title="📦 Document Parser",
    # theme and css belong on gr.Blocks(), not on demo.launch().
    theme=gr.themes.Soft(),
    css="""
    .file-table { font-size: 0.9em; }
    footer { display: none !important; }
    """,
) as demo:
    gr.Markdown("""
    # 📦 Document Parser

    Upload a **ZIP file** and this tool extracts & parses text from every supported document inside it.

    **Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.csv`, `.html`, `.xml`,
    `.pdf`, `.docx`, `.xlsx`, and **40+ more** text/code formats, including `Makefile`, `Dockerfile`, `LICENSE`, etc.

    **Limits:** Max ZIP size: {max_zip} MB · Max files: {max_files} · Max single file: {max_file} MB · Zip bomb protection enabled
    """.format(max_zip=MAX_ZIP_SIZE_MB, max_files=MAX_FILES_IN_ZIP, max_file=MAX_SINGLE_FILE_MB))
    with gr.Row():
        with gr.Column(scale=1):
            zip_input = gr.File(
                label="Upload ZIP File",
                file_types=[".zip"],
                type="filepath",
            )
            parse_btn = gr.Button(
                "🚀 Parse Documents",
                variant="primary",
                size="lg",
            )
    summary_output = gr.Markdown(label="Summary", value="*Upload a ZIP file to get started.*")
    with gr.Tabs():
        with gr.Tab("📋 File Listing"):
            file_table = gr.Dataframe(
                headers=["Status", "Filename", "Extension", "Type", "Size", "Preview"],
                label="Files in Archive",
                interactive=False,
                wrap=True,
            )
        with gr.Tab("📄 Extracted Text"):
            text_output = gr.Textbox(
                label="Full Extracted Text (all parseable files concatenated)",
                lines=30,
                max_lines=100,
                buttons=["copy"],
            )
        with gr.Tab("🔍 File Detail"):
            gr.Markdown("*Click a row in the **File Listing** tab, then switch here to see the full preview.*")
            detail_output = gr.Markdown(
                "ℹ️ Select a file from the **File Listing** tab to see its full preview here."
            )
        with gr.Tab("📊 JSON Export"):
            json_output = gr.JSON(label="Structured Parse Results")
    file_data_state = gr.State([])

    parse_btn.click(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )
    zip_input.upload(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )
    file_table.select(
        fn=build_detail,
        inputs=file_data_state,
        outputs=detail_output,
    )

demo.queue(default_concurrency_limit=CONCURRENCY_LIMIT, max_size=20)

if __name__ == "__main__":
    # launch() does not accept theme= or css=; those are set on gr.Blocks() above.
    demo.launch(show_error=True)
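
# To run locally (a sketch; adjust the module name if it is not saved as app.py):
#   pip install gradio PyMuPDF python-docx openpyxl
#   python app.py
# Gradio serves the UI at http://127.0.0.1:7860 by default.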