Spaces:
Sleeping
Sleeping
Refactor: production-grade error handling, progress bars, zip bomb protection, per-file isolation, Gradio 6 compat
Browse files
app.py
CHANGED
|
@@ -1,233 +1,744 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import io
|
| 5 |
-
import
|
| 6 |
-
import
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
}
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""Parse PDF bytes to text using PyMuPDF."""
|
|
|
|
| 45 |
try:
|
| 46 |
import fitz
|
| 47 |
-
doc = fitz.open(stream=data, filetype="pdf")
|
| 48 |
-
text = ""
|
| 49 |
-
for page_num, page in enumerate(doc):
|
| 50 |
-
text += f"\n--- Page {page_num + 1} ---\n"
|
| 51 |
-
text += page.get_text()
|
| 52 |
-
doc.close()
|
| 53 |
-
return text.strip() if text.strip() else "[PDF: no extractable text]"
|
| 54 |
except ImportError:
|
| 55 |
-
return "[PDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
except Exception as e:
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
-
def parse_docx_content(data):
|
| 61 |
"""Parse DOCX bytes to text."""
|
|
|
|
| 62 |
try:
|
| 63 |
from docx import Document
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
doc = Document(io.BytesIO(data))
|
|
|
|
|
|
|
| 65 |
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
-
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
-
def parse_xlsx_content(data):
|
| 74 |
"""Parse XLSX bytes to text summary."""
|
|
|
|
| 75 |
try:
|
| 76 |
import openpyxl
|
| 77 |
-
wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
|
| 78 |
-
text = ""
|
| 79 |
-
for sheet_name in wb.sheetnames:
|
| 80 |
-
ws = wb[sheet_name]
|
| 81 |
-
text += f"\n--- Sheet: {sheet_name} ---\n"
|
| 82 |
-
row_count = 0
|
| 83 |
-
for row in ws.iter_rows(values_only=True):
|
| 84 |
-
if row_count >= 50: # Limit rows shown
|
| 85 |
-
text += f"\n... (more rows exist)\n"
|
| 86 |
-
break
|
| 87 |
-
text += " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
| 88 |
-
row_count += 1
|
| 89 |
-
wb.close()
|
| 90 |
-
return text.strip() if text.strip() else "[XLSX: empty workbook]"
|
| 91 |
except ImportError:
|
| 92 |
-
return "[XLSX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
except Exception as e:
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
return f"{size_bytes / (1024 * 1024):.1f} MB"
|
| 105 |
|
|
|
|
|
|
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
return "β οΈ Please upload a ZIP file.", [], "", []
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
if not zipfile.is_zipfile(file_path):
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
try:
|
| 134 |
raw_data = zf.read(info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
elif
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
content =
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
stats
|
| 165 |
-
content_preview = f"[Image: {ext}]"
|
| 166 |
else:
|
| 167 |
-
stats
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
"
|
| 173 |
-
|
| 174 |
-
"
|
| 175 |
-
|
| 176 |
-
"
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|--------|-------|
|
| 192 |
-
| **Total files** | {stats
|
| 193 |
-
| **
|
| 194 |
-
| **
|
| 195 |
-
| **
|
| 196 |
-
| **
|
| 197 |
-
| **
|
| 198 |
-
| **
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
-
|
| 203 |
|
| 204 |
-
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
-
|
| 208 |
-
"""When user clicks a row in the table, show that file's full preview."""
|
| 209 |
-
if not file_data_json or not isinstance(file_data_json, list):
|
| 210 |
-
return "Select a file from the table above."
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
| 217 |
|
|
|
|
| 218 |
|
| 219 |
-
|
|
|
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
with gr.Blocks(
|
| 222 |
title="π¦ Document Parser",
|
| 223 |
-
theme=gr.themes.Soft(),
|
| 224 |
) as demo:
|
| 225 |
gr.Markdown("""
|
| 226 |
# π¦ Document Parser
|
| 227 |
-
Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.
|
| 228 |
|
| 229 |
-
**
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
with gr.Row():
|
| 233 |
with gr.Column(scale=1):
|
|
@@ -236,54 +747,71 @@ Upload a **ZIP file** containing documents and this tool will parse and extract
|
|
| 236 |
file_types=[".zip"],
|
| 237 |
type="filepath",
|
| 238 |
)
|
| 239 |
-
parse_btn = gr.Button(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
summary_output = gr.Markdown(label="Summary")
|
| 242 |
|
| 243 |
with gr.Tabs():
|
| 244 |
with gr.Tab("π File Listing"):
|
| 245 |
file_table = gr.Dataframe(
|
| 246 |
-
headers=["Filename", "Extension", "Type", "Size", "Preview"],
|
| 247 |
label="Files in Archive",
|
| 248 |
interactive=False,
|
| 249 |
wrap=True,
|
| 250 |
)
|
| 251 |
with gr.Tab("π Extracted Text"):
|
| 252 |
text_output = gr.Textbox(
|
| 253 |
-
label="Full Extracted Text",
|
| 254 |
lines=30,
|
| 255 |
max_lines=100,
|
| 256 |
-
|
| 257 |
)
|
| 258 |
with gr.Tab("π File Detail"):
|
| 259 |
-
gr.Markdown("*Click a row in the File Listing tab to see
|
| 260 |
-
detail_output = gr.Markdown(
|
| 261 |
-
|
|
|
|
|
|
|
| 262 |
json_output = gr.JSON(label="Structured Parse Results")
|
| 263 |
|
| 264 |
-
# Hidden state for file data
|
| 265 |
file_data_state = gr.State([])
|
| 266 |
|
| 267 |
-
def run_parse(file_obj):
|
| 268 |
-
summary, table, text, data = parse_zip(file_obj)
|
| 269 |
-
return summary, table, text, data, data
|
| 270 |
-
|
| 271 |
parse_btn.click(
|
| 272 |
fn=run_parse,
|
| 273 |
inputs=zip_input,
|
| 274 |
outputs=[summary_output, file_table, text_output, json_output, file_data_state],
|
|
|
|
|
|
|
|
|
|
| 275 |
)
|
|
|
|
| 276 |
zip_input.upload(
|
| 277 |
fn=run_parse,
|
| 278 |
inputs=zip_input,
|
| 279 |
outputs=[summary_output, file_table, text_output, json_output, file_data_state],
|
|
|
|
|
|
|
|
|
|
| 280 |
)
|
| 281 |
|
| 282 |
file_table.select(
|
| 283 |
-
fn=
|
| 284 |
inputs=file_data_state,
|
| 285 |
outputs=detail_output,
|
| 286 |
)
|
| 287 |
|
|
|
|
|
|
|
| 288 |
if __name__ == "__main__":
|
| 289 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
π¦ Document Parser β Production-Grade ZIP Document Extraction Tool
|
| 3 |
+
|
| 4 |
+
Features:
|
| 5 |
+
- Upload ZIP files and parse all supported document formats
|
| 6 |
+
- Supports 40+ text/code formats, PDF, DOCX, XLSX
|
| 7 |
+
- Zip bomb protection (decompression ratio + size limits)
|
| 8 |
+
- Per-file error isolation β one corrupt file won't crash the whole parse
|
| 9 |
+
- Progress bars for real-time feedback
|
| 10 |
+
- Concurrency-limited to prevent resource exhaustion
|
| 11 |
+
- Full structured JSON export + file detail drill-down
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
import io
|
| 17 |
+
import logging
|
| 18 |
+
import os
|
| 19 |
+
import traceback
|
| 20 |
+
import zipfile
|
| 21 |
+
from dataclasses import dataclass, field
|
| 22 |
+
from enum import Enum
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
import gradio as gr
|
| 26 |
+
|
| 27 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# Configuration constants
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
MAX_ZIP_SIZE_MB = 200
|
| 31 |
+
MAX_FILES_IN_ZIP = 500
|
| 32 |
+
MAX_SINGLE_FILE_MB = 50
|
| 33 |
+
MAX_DECOMPRESSION_RATIO = 100 # zip bomb guard: reject if total > ratio Γ compressed
|
| 34 |
+
MAX_PREVIEW_CHARS = 5_000
|
| 35 |
+
MAX_FULL_TEXT_CHARS = 500_000
|
| 36 |
+
MAX_XLSX_ROWS = 100
|
| 37 |
+
CONCURRENCY_LIMIT = 3
|
| 38 |
+
|
| 39 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
# Logging
|
| 41 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
logger = logging.getLogger("document_parser")
|
| 43 |
+
logging.basicConfig(
|
| 44 |
+
level=logging.INFO,
|
| 45 |
+
format="%(asctime)s | %(levelname)s | %(message)s",
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
# File classification
|
| 51 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
class FileCategory(str, Enum):
    """Coarse document categories used to route archive entries to a parser.

    Subclassing ``str`` lets members compare equal to their string values,
    which is relied on when category counts are keyed by ``c.value``.
    """
    TEXT = "text"      # plain text / source code, decoded directly
    PDF = "pdf"        # routed to the PyMuPDF-based parser
    DOCX = "docx"      # routed to the python-docx-based parser
    XLSX = "xlsx"      # routed to the openpyxl-based parser
    IMAGE = "image"    # not parsed; listed with a placeholder preview
    BINARY = "binary"  # unknown/unsupported; listed with a placeholder preview
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
TEXT_EXTENSIONS = frozenset({
|
| 62 |
+
".txt", ".md", ".rst", ".py", ".js", ".ts", ".jsx", ".tsx", ".html",
|
| 63 |
+
".htm", ".css", ".scss", ".less", ".json", ".jsonl", ".yaml", ".yml",
|
| 64 |
+
".csv", ".tsv", ".xml", ".toml", ".cfg", ".ini", ".conf", ".properties",
|
| 65 |
+
".sh", ".bash", ".zsh", ".fish", ".bat", ".ps1", ".cmd",
|
| 66 |
+
".r", ".rmd", ".java", ".c", ".cpp", ".h", ".hpp", ".cc", ".cxx",
|
| 67 |
+
".go", ".rs", ".rb", ".php", ".swift", ".kt", ".kts", ".scala", ".clj",
|
| 68 |
+
".sql", ".graphql", ".gql", ".proto", ".thrift",
|
| 69 |
+
".dockerfile", ".makefile", ".cmake",
|
| 70 |
+
".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
|
| 71 |
+
".env", ".env.example", ".log", ".tex", ".bib", ".sty",
|
| 72 |
+
".lua", ".vim", ".el", ".lisp", ".hs", ".ml", ".mli", ".ex", ".exs",
|
| 73 |
+
".erl", ".hrl", ".dart", ".v", ".sv", ".vhd", ".vhdl",
|
| 74 |
+
".tf", ".tfvars", ".hcl", ".nix", ".dhall",
|
| 75 |
+
".ipynb",
|
| 76 |
+
})
|
| 77 |
+
|
| 78 |
+
KNOWN_TEXT_FILENAMES = frozenset({
|
| 79 |
+
"Makefile", "Dockerfile", "Procfile", "Vagrantfile", "Gemfile",
|
| 80 |
+
"Rakefile", "Brewfile", "Justfile", "Taskfile",
|
| 81 |
+
".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
|
| 82 |
+
".eslintrc", ".prettierrc", ".babelrc", ".browserslistrc",
|
| 83 |
+
"LICENSE", "LICENCE", "COPYING", "AUTHORS", "CONTRIBUTORS",
|
| 84 |
+
"CHANGELOG", "CHANGES", "HISTORY", "NEWS",
|
| 85 |
+
"README", "INSTALL", "TODO", "HACKING",
|
| 86 |
+
"requirements.txt",
|
| 87 |
+
})
|
| 88 |
+
|
| 89 |
+
IMAGE_EXTENSIONS = frozenset({
|
| 90 |
+
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico",
|
| 91 |
+
".tiff", ".tif", ".avif", ".heic", ".heif",
|
| 92 |
+
})
|
| 93 |
+
|
| 94 |
+
CATEGORY_EMOJI = {
|
| 95 |
+
FileCategory.TEXT: "π",
|
| 96 |
+
FileCategory.PDF: "π",
|
| 97 |
+
FileCategory.DOCX: "π",
|
| 98 |
+
FileCategory.XLSX: "π",
|
| 99 |
+
FileCategory.IMAGE: "πΌοΈ",
|
| 100 |
+
FileCategory.BINARY: "π¦",
|
| 101 |
}
|
| 102 |
|
| 103 |
+
|
| 104 |
+
def classify_file(filename: str) -> tuple[FileCategory, str]:
    """Classify a file by its extension and known filename patterns.

    Args:
        filename: Path of the entry inside the archive (may contain '/').

    Returns:
        ``(category, extension)`` where ``extension`` is lower-cased and is
        empty for extensionless known text files (Makefile, LICENSE, ...).
    """
    # rsplit handles both "dir/name" and a bare "name" uniformly, so the
    # original's explicit '"/" in filename' branch is unnecessary.
    basename = filename.rsplit("/", 1)[-1]
    ext = os.path.splitext(basename)[1].lower()

    if not ext:
        # FIX: the original did an exact-case membership test and then a
        # second case-insensitive test that rebuilt an upper-cased set on
        # every call. A single case-insensitive scan covers both cases
        # without the per-call set construction.
        target = basename.upper()
        if any(name.upper() == target for name in KNOWN_TEXT_FILENAMES):
            return FileCategory.TEXT, ""

    if ext in TEXT_EXTENSIONS:
        return FileCategory.TEXT, ext
    if ext == ".pdf":
        return FileCategory.PDF, ext
    if ext == ".docx":
        return FileCategory.DOCX, ext
    if ext in {".xlsx", ".xls"}:
        return FileCategory.XLSX, ext
    if ext in IMAGE_EXTENSIONS:
        return FileCategory.IMAGE, ext
    return FileCategory.BINARY, ext
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
# Data classes
|
| 129 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
@dataclass
|
| 131 |
+
class ParsedFile:
|
| 132 |
+
filename: str
|
| 133 |
+
category: str
|
| 134 |
+
extension: str
|
| 135 |
+
size_bytes: int
|
| 136 |
+
size_display: str
|
| 137 |
+
content: str = ""
|
| 138 |
+
preview: str = ""
|
| 139 |
+
error: Optional[str] = None
|
| 140 |
+
warnings: list[str] = field(default_factory=list)
|
| 141 |
+
|
| 142 |
+
def to_table_row(self) -> list:
|
| 143 |
+
status = "β οΈ" if self.warnings else ("β" if self.error else "β
")
|
| 144 |
+
preview_text = self.error or self.preview[:200].replace("\n", " ")
|
| 145 |
+
return [
|
| 146 |
+
status,
|
| 147 |
+
self.filename,
|
| 148 |
+
self.extension or "(none)",
|
| 149 |
+
self.category,
|
| 150 |
+
self.size_display,
|
| 151 |
+
preview_text,
|
| 152 |
+
]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@dataclass
class ParseStats:
    """Aggregate counters collected while parsing one ZIP archive."""
    total_files: int = 0                # non-directory entries found
    parsed_ok: int = 0                  # parsed with no warnings and no errors
    parse_warnings: int = 0             # parsed, but at least one warning recorded
    parse_errors: int = 0               # failed to read or parse
    skipped_dirs: int = 0               # directory entries (never parsed)
    total_compressed_bytes: int = 0     # on-disk size of the ZIP file itself
    total_uncompressed_bytes: int = 0   # sum of entries' declared uncompressed sizes
    # Per-category entry counts, keyed by FileCategory.value ("text", "pdf", ...).
    by_category: dict = field(default_factory=lambda: {c.value: 0 for c in FileCategory})
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
# Size formatting
|
| 169 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 170 |
+
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB).

    Negative inputs are clamped and reported as "0 B".
    """
    if size_bytes < 0:
        return "0 B"

    kb = 1024
    mb = 1024 ** 2
    gb = 1024 ** 3

    if size_bytes < kb:
        return f"{size_bytes} B"
    if size_bytes < mb:
        return f"{size_bytes / kb:.1f} KB"
    if size_bytes < gb:
        return f"{size_bytes / mb:.1f} MB"
    return f"{size_bytes / gb:.2f} GB"
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 184 |
+
# Document parsers β each returns (content, warnings) or raises
|
| 185 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
+
def parse_text_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Decode plain text / source-code bytes, falling back through encodings.

    Tries UTF-8 first, then latin-1, then UTF-8 with replacement characters.
    Oversized content is clipped to MAX_FULL_TEXT_CHARS. Returns the decoded
    text and a list of human-readable warnings describing any fallbacks.
    """
    notes: list[str] = []

    try:
        text = data.decode("utf-8")
    except UnicodeDecodeError:
        try:
            text = data.decode("latin-1")
            notes.append("Decoded with latin-1 fallback (not valid UTF-8)")
        except Exception:
            text = data.decode("utf-8", errors="replace")
            notes.append("Contains invalid bytes; replaced with placeholders")

    original_len = len(text)
    if original_len > MAX_FULL_TEXT_CHARS:
        notes.append(f"Content truncated to {MAX_FULL_TEXT_CHARS:,} characters (original: {original_len:,})")
        text = text[:MAX_FULL_TEXT_CHARS] + "\n\n... [TRUNCATED]"

    return text, notes
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def parse_pdf_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse PDF bytes to text using PyMuPDF.

    Args:
        data: Raw PDF bytes.
        filename: Archive path of the file, used for log context.

    Returns:
        ``(content, warnings)``. Failures are reported via warnings instead
        of raising, so one bad PDF cannot abort the whole archive parse.
    """
    warnings: list[str] = []
    try:
        import fitz
    except ImportError:
        return "[PDF library not available]", ["PyMuPDF not installed β install with: pip install PyMuPDF"]

    doc = None
    try:
        doc = fitz.open(stream=data, filetype="pdf")
        if doc.is_encrypted:
            return "", ["PDF is password-protected and cannot be parsed"]

        page_count = len(doc)
        if page_count == 0:
            return "", ["PDF has 0 pages"]

        text_parts = []
        empty_pages = 0
        for page_num in range(page_count):
            try:
                page = doc[page_num]
                page_text = page.get_text().strip()
                if page_text:
                    text_parts.append(f"\n--- Page {page_num + 1}/{page_count} ---\n{page_text}")
                else:
                    empty_pages += 1
            except Exception as e:
                # Per-page isolation: one broken page must not lose the rest.
                warnings.append(f"Page {page_num + 1} failed: {type(e).__name__}: {e}")

        if empty_pages > 0:
            warnings.append(f"{empty_pages}/{page_count} pages had no extractable text (may be scanned/image-based)")

        content = "\n".join(text_parts) if text_parts else "[No extractable text found]"
        if not text_parts and empty_pages == page_count:
            warnings.append("PDF appears to be entirely image-based; OCR would be needed to extract text")

        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename. Also uses lazy %-style args per logging
        # best practice.
        logger.error("PDF parse error for %s: %s", filename, e)
        return "", [f"PDF parse failed: {type(e).__name__}: {e}"]
    finally:
        # BUG FIX: explicit None check — `if doc:` uses Document truthiness,
        # which could skip close() for a falsy-but-open document.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
|
| 255 |
|
| 256 |
|
| 257 |
+
def parse_docx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse DOCX bytes to text.

    Extracts non-empty paragraphs plus a pipe-delimited rendering of each
    table. Args mirror the other parsers: raw bytes and the archive path
    (the latter only for log context). Returns ``(content, warnings)``;
    failures are reported via warnings instead of raising.
    """
    warnings: list[str] = []
    try:
        from docx import Document
    except ImportError:
        return "[DOCX library not available]", ["python-docx not installed"]

    try:
        doc = Document(io.BytesIO(data))
        parts = []

        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        if paragraphs:
            parts.extend(paragraphs)

        for i, table in enumerate(doc.tables):
            try:
                table_text = f"\n--- Table {i + 1} ---\n"
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    table_text += " | ".join(cells) + "\n"
                parts.append(table_text)
            except Exception as e:
                # Per-table isolation: a malformed table doesn't lose the rest.
                warnings.append(f"Table {i + 1} extraction failed: {e}")

        content = "\n".join(parts) if parts else "[DOCX: empty document]"
        if not parts:
            warnings.append("Document contains no paragraphs or tables")

        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename; use lazy %-style logging args.
        logger.error("DOCX parse error for %s: %s", filename, e)
        return "", [f"DOCX parse failed: {type(e).__name__}: {e}"]
|
| 292 |
|
| 293 |
|
| 294 |
+
def parse_xlsx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse XLSX bytes to text summary.

    Renders each sheet as pipe-delimited rows, capped at MAX_XLSX_ROWS per
    sheet. Args mirror the other parsers: raw bytes and the archive path
    (the latter only for log context). Returns ``(content, warnings)``;
    failures are reported via warnings instead of raising.
    """
    warnings: list[str] = []
    try:
        import openpyxl
    except ImportError:
        return "[XLSX library not available]", ["openpyxl not installed"]

    wb = None
    try:
        # read_only streams rows lazily; data_only yields cached cell values
        # rather than formula strings.
        wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
        parts = []

        for sheet_name in wb.sheetnames:
            try:
                ws = wb[sheet_name]
                sheet_text = f"\n--- Sheet: {sheet_name} ---\n"
                row_count = 0
                for row in ws.iter_rows(values_only=True):
                    if row_count >= MAX_XLSX_ROWS:
                        sheet_text += f"\n... (truncated at {MAX_XLSX_ROWS} rows)\n"
                        warnings.append(f"Sheet '{sheet_name}' truncated at {MAX_XLSX_ROWS} rows")
                        break
                    cells = [str(cell) if cell is not None else "" for cell in row]
                    sheet_text += " | ".join(cells) + "\n"
                    row_count += 1
                if row_count == 0:
                    sheet_text += "(empty sheet)\n"
                parts.append(sheet_text)
            except Exception as e:
                # Per-sheet isolation: one corrupt sheet doesn't lose the others.
                warnings.append(f"Sheet '{sheet_name}' failed: {type(e).__name__}: {e}")

        content = "\n".join(parts) if parts else "[XLSX: empty workbook]"
        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename; use lazy %-style logging args.
        logger.error("XLSX parse error for %s: %s", filename, e)
        return "", [f"XLSX parse failed: {type(e).__name__}: {e}"]
    finally:
        # Explicit None check rather than relying on Workbook truthiness.
        if wb is not None:
            try:
                wb.close()
            except Exception:
                pass
|
| 338 |
|
| 339 |
|
| 340 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 341 |
+
# Validation layer
|
| 342 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 343 |
+
def validate_upload(file_path: str | None) -> str:
    """Validate the uploaded file. Returns the resolved file path. Raises gr.Error on failure.

    Checks run in order: presence, existence on disk, non-empty, size limit,
    and ZIP magic-number validity.
    """
    # Guard clauses, cheapest checks first.
    if file_path is None:
        raise gr.Error("β οΈ Please upload a ZIP file first.")

    if not os.path.isfile(file_path):
        raise gr.Error("β Upload failed β file not found on server. Please try again.")

    byte_count = os.path.getsize(file_path)
    if not byte_count:
        raise gr.Error("β The uploaded file is empty (0 bytes).")

    size_mb = byte_count / (1024 ** 2)
    if size_mb > MAX_ZIP_SIZE_MB:
        raise gr.Error(
            f"β File too large: {size_mb:.1f} MB. "
            f"Maximum allowed is {MAX_ZIP_SIZE_MB} MB."
        )

    if not zipfile.is_zipfile(file_path):
        raise gr.Error(
            "β Not a valid ZIP archive. The file may be corrupted, "
            "or it may be a different archive format (tar, rar, 7z)."
        )

    return file_path
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def check_zip_bomb(zf: zipfile.ZipFile, compressed_size: int) -> list[str]:
    """Check for zip bomb indicators. Returns warnings. Raises gr.Error if malicious.

    Two guards: the decompression ratio (uncompressed / compressed) and an
    absolute cap on the total uncompressed payload.
    """
    notes: list[str] = []

    # Declared uncompressed total across all non-directory entries.
    total_uncompressed = 0
    for entry in zf.infolist():
        if not entry.is_dir():
            total_uncompressed += entry.file_size

    if compressed_size > 0:
        ratio = total_uncompressed / compressed_size
        if ratio > MAX_DECOMPRESSION_RATIO:
            raise gr.Error(
                f"π‘οΈ Zip bomb detected! Decompression ratio is {ratio:.0f}x "
                f"(compressed: {format_size(compressed_size)}, "
                f"uncompressed: {format_size(total_uncompressed)}). "
                f"Maximum allowed ratio is {MAX_DECOMPRESSION_RATIO}x."
            )
        # Soft warning once past half the hard limit.
        if ratio > MAX_DECOMPRESSION_RATIO / 2:
            notes.append(
                f"High decompression ratio ({ratio:.0f}x) β approaching the "
                f"{MAX_DECOMPRESSION_RATIO}x safety limit"
            )

    uncompressed_mb = total_uncompressed / (1024 ** 2)
    if uncompressed_mb > MAX_ZIP_SIZE_MB * 5:
        raise gr.Error(
            f"π‘οΈ Uncompressed content too large: {uncompressed_mb:.0f} MB. "
            f"Maximum is {MAX_ZIP_SIZE_MB * 5} MB."
        )

    return notes
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 402 |
+
# Core parsing engine
|
| 403 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 404 |
+
def parse_zip(file_path: str, progress: gr.Progress) -> tuple[list[ParsedFile], ParseStats]:
    """Parse all files in a ZIP archive with per-file error isolation.

    Args:
        file_path: Path to a ZIP file (expected to exist on disk).
        progress: Gradio progress tracker driving the per-file progress bar.

    Returns:
        (parsed_files, stats) — one ParsedFile per processed entry plus
        aggregate counters. Archive-level failures (corrupt ZIP, zip bomb,
        no files) raise gr.Error; per-file failures are recorded on the
        entry's ParsedFile and processing continues.
    """
    file_size = os.path.getsize(file_path)
    stats = ParseStats()

    try:
        zf = zipfile.ZipFile(file_path, "r")
    except zipfile.BadZipFile:
        raise gr.Error("β ZIP file is corrupted and cannot be opened.")
    except Exception as e:
        raise gr.Error(f"β Failed to open ZIP: {type(e).__name__}: {e}")

    try:
        # Raises gr.Error on a detected bomb; may return soft warnings that
        # are surfaced only after a successful pass (see end of function).
        bomb_warnings = check_zip_bomb(zf, file_size)

        entries = [info for info in zf.infolist() if not info.is_dir()]
        stats.skipped_dirs = len(zf.infolist()) - len(entries)
        stats.total_files = len(entries)
        stats.total_compressed_bytes = file_size

        if stats.total_files == 0:
            raise gr.Error("β ZIP archive contains no files (only directories).")

        truncated = False
        if stats.total_files > MAX_FILES_IN_ZIP:
            gr.Warning(
                f"ZIP contains {stats.total_files} files β "
                f"processing first {MAX_FILES_IN_ZIP} only."
            )
            entries = entries[:MAX_FILES_IN_ZIP]
            truncated = True

        parsed_files: list[ParsedFile] = []

        for i, info in enumerate(progress.tqdm(entries, desc="Parsing documents")):
            category, ext = classify_file(info.filename)
            stats.by_category[category.value] += 1
            stats.total_uncompressed_bytes += info.file_size

            pf = ParsedFile(
                filename=info.filename,
                category=category.value,
                extension=ext or "(none)",
                size_bytes=info.file_size,
                size_display=format_size(info.file_size),
            )

            # Oversized entries are listed but never decompressed; note this
            # path counts toward parse_warnings (not parse_errors) even
            # though pf.error is set.
            file_mb = info.file_size / (1024 ** 2)
            if file_mb > MAX_SINGLE_FILE_MB:
                pf.error = f"Skipped: file too large ({file_mb:.1f} MB > {MAX_SINGLE_FILE_MB} MB limit)"
                pf.warnings.append(pf.error)
                stats.parse_warnings += 1
                parsed_files.append(pf)
                continue

            try:
                raw_data = zf.read(info)
            except RuntimeError as e:
                # zipfile signals encrypted entries via RuntimeError;
                # detected here by message inspection.
                pf.error = f"Cannot read: {e}"
                if "password" in str(e).lower():
                    pf.error = "File is password-protected"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue
            except Exception as e:
                pf.error = f"Read failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue

            # Dispatch to the category-specific parser; each returns
            # (content, warnings) and reports expected failures via warnings.
            try:
                if category == FileCategory.TEXT:
                    content, warnings = parse_text_content(raw_data, info.filename)
                elif category == FileCategory.PDF:
                    content, warnings = parse_pdf_content(raw_data, info.filename)
                elif category == FileCategory.DOCX:
                    content, warnings = parse_docx_content(raw_data, info.filename)
                elif category == FileCategory.XLSX:
                    content, warnings = parse_xlsx_content(raw_data, info.filename)
                elif category == FileCategory.IMAGE:
                    content = ""
                    warnings = []
                    pf.preview = f"[Image: {ext}, {pf.size_display}]"
                else:
                    content = ""
                    warnings = []
                    pf.preview = f"[Binary: {ext}, {pf.size_display}]"

                pf.content = content
                # Text-bearing files get a content-derived preview; image and
                # binary entries keep the placeholder assigned above.
                pf.preview = content[:MAX_PREVIEW_CHARS] if content else pf.preview
                pf.warnings = warnings

                if warnings:
                    stats.parse_warnings += 1
                else:
                    stats.parsed_ok += 1

            except MemoryError:
                pf.error = "Out of memory while parsing this file"
                stats.parse_errors += 1
                logger.error(f"MemoryError parsing {info.filename}")
            except Exception as e:
                # Per-file isolation: record the failure and keep going.
                pf.error = f"Parse failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                logger.error(f"Parse error for {info.filename}: {e}")
                traceback.print_exc()

            parsed_files.append(pf)

        # Surface soft zip-bomb warnings only after processing succeeded.
        if bomb_warnings:
            for w in bomb_warnings:
                gr.Warning(w)

        if truncated:
            stats.parse_warnings += 1

        return parsed_files, stats

    finally:
        # Best-effort close; the archive may already be in a bad state.
        try:
            zf.close()
        except Exception:
            pass
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 530 |
+
# Output formatters
|
| 531 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 532 |
+
def build_summary(stats: ParseStats, parsed_files: list[ParsedFile]) -> str:
    """Render the archive parse results as a markdown report.

    Args:
        stats: Aggregate counters collected while walking the archive.
        parsed_files: Per-file parse results; only used here to list
            the files that failed.

    Returns:
        Markdown containing optional alert lines, a metrics table, a
        file-type breakdown, and (when present) up to 10 failed files.
    """
    alert_lines = []
    if stats.parse_errors > 0:
        alert_lines.append(
            f"⚠️ **{stats.parse_errors} file(s) failed to parse** — see ❌ markers in the file listing"
        )
    if stats.parse_warnings > 0:
        alert_lines.append(
            f"ℹ️ **{stats.parse_warnings} file(s) had warnings** — see ⚠️ markers in the file listing"
        )
    alert_block = "\n".join(alert_lines) + "\n\n" if alert_lines else ""

    failed = [pf for pf in parsed_files if pf.error]
    error_block = ""
    if failed:
        # Cap the listing at 10 entries so a pathological archive cannot
        # flood the summary panel.
        rows = [f"- `{pf.filename}`: {pf.error}" for pf in failed[:10]]
        if len(failed) > 10:
            rows.append(f"- ... and {len(failed) - 10} more")
        error_block = "\n### ❌ Failed Files\n" + "\n".join(rows) + "\n\n"

    return f"""## 📦 ZIP Archive Summary

{alert_block}| Metric | Value |
|--------|-------|
| **Total files** | {stats.total_files} |
| **Parsed successfully** | {stats.parsed_ok} |
| **With warnings** | {stats.parse_warnings} |
| **Failed** | {stats.parse_errors} |
| **Compressed size** | {format_size(stats.total_compressed_bytes)} |
| **Uncompressed size** | {format_size(stats.total_uncompressed_bytes)} |
| **Directories skipped** | {stats.skipped_dirs} |

### 📂 File Types
| Category | Count |
|----------|-------|
| Text/Code | {stats.by_category.get('text', 0)} |
| PDF | {stats.by_category.get('pdf', 0)} |
| DOCX | {stats.by_category.get('docx', 0)} |
| XLSX | {stats.by_category.get('xlsx', 0)} |
| Image | {stats.by_category.get('image', 0)} |
| Binary | {stats.by_category.get('binary', 0)} |

{error_block}"""
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def build_full_text(parsed_files: list[ParsedFile]) -> str:
    """Concatenate the extracted text of every parsed file.

    Each file contributes a banner (category emoji + filename, plus any
    warnings); files that produced no content but carry an error get an
    error banner instead. The combined output is capped at
    MAX_FULL_TEXT_CHARS.
    """
    rule = "=" * 70
    sections = []
    for pf in parsed_files:
        if pf.content:
            icon = CATEGORY_EMOJI.get(FileCategory(pf.category), "📄")
            warn_suffix = " ⚠️ " + ", ".join(pf.warnings) if pf.warnings else ""
            sections.append(
                f"\n{rule}\n{icon} {pf.filename}{warn_suffix}\n{rule}\n{pf.content}"
            )
        elif pf.error:
            sections.append(f"\n{rule}\n❌ {pf.filename} — ERROR: {pf.error}\n{rule}")

    if not sections:
        return "(No text content was extracted from any file in the archive.)"

    combined = "\n".join(sections)
    if len(combined) > MAX_FULL_TEXT_CHARS:
        combined = combined[:MAX_FULL_TEXT_CHARS] + "\n\n... [OUTPUT TRUNCATED — too large to display fully]"
    return combined
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def build_json(parsed_files: list[ParsedFile]) -> list[dict]:
    """Serialize parse results into a JSON-friendly list of dicts.

    Previews are capped at 1000 characters. The ``error`` and
    ``warnings`` keys are only present when the file actually had them;
    ``status`` is ``"error"``, ``"warning"``, or ``"ok"`` (error wins).
    """

    def _entry(pf) -> dict:
        # Status reflects the worst outcome recorded for the file.
        if pf.error:
            status = "error"
        elif pf.warnings:
            status = "warning"
        else:
            status = "ok"

        record = {
            "filename": pf.filename,
            "category": pf.category,
            "extension": pf.extension,
            "size_bytes": pf.size_bytes,
            "size_display": pf.size_display,
            "preview": pf.preview[:1000],
            "status": status,
        }
        if pf.error:
            record["error"] = pf.error
        if pf.warnings:
            record["warnings"] = pf.warnings
        return record

    return [_entry(pf) for pf in parsed_files]
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
def build_detail(file_data: list[dict], evt: gr.SelectData) -> str:
    """Render a markdown detail view for the table row the user clicked.

    Args:
        file_data: Per-file dicts (the ``build_json`` payload held in state).
        evt: Gradio selection event carrying the clicked row index.

    Returns:
        Markdown with the file's header/metadata, any error or warnings,
        and a fenced content preview — or a guidance message when no
        valid row can be resolved.
    """
    if not file_data or not isinstance(file_data, list):
        return "ℹ️ Select a file from the **File Listing** tab to see its full preview here."

    # evt.index may be a bare int or a (row, col) pair depending on the
    # component that fired the event.
    try:
        raw = evt.index
        selected = raw[0] if isinstance(raw, (list, tuple)) else raw
    except (TypeError, IndexError):
        return "⚠️ Could not determine selected row. Please click a row in the file listing."

    if not (0 <= selected < len(file_data)):
        return f"⚠️ Row index {selected} is out of range (0–{len(file_data) - 1})."

    item = file_data[selected]

    parts = [
        f"## {CATEGORY_EMOJI.get(item.get('category', ''), '📄')} {item['filename']}\n",
        f"**Category:** {item.get('category', 'unknown')} | **Size:** {item.get('size_display', 'unknown')}\n\n",
    ]

    if item.get("error"):
        parts.append(f"### ❌ Error\n```\n{item['error']}\n```\n")

    if item.get("warnings"):
        warning_lines = "\n".join(f"- {w}" for w in item["warnings"])
        parts.append("### ⚠️ Warnings\n" + warning_lines + "\n\n")

    preview = item.get("preview", "")
    if not preview:
        parts.append("*(No content to preview for this file type.)*")
    elif preview.startswith("["):
        # Bracketed previews are synthetic placeholders such as
        # "[Image: .png, 12.3 KB]" — show them as plain info, not code.
        parts.append(f"### 📄 Info\n{preview}")
    else:
        ext = item.get("extension", "").lstrip(".")
        lang_map = {
            "py": "python", "js": "javascript", "ts": "typescript",
            "json": "json", "yaml": "yaml", "yml": "yaml",
            "html": "html", "htm": "html", "css": "css",
            "sql": "sql", "sh": "bash", "bash": "bash",
            "java": "java", "c": "c", "cpp": "cpp", "go": "go",
            "rs": "rust", "rb": "ruby", "php": "php", "xml": "xml",
            "md": "markdown", "toml": "toml", "csv": "csv",
        }
        fence_lang = lang_map.get(ext, "")
        parts.append(f"### 📄 Content Preview\n```{fence_lang}\n{preview}\n```")

    return "\n".join(parts)
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 676 |
+
# Main entry point
|
| 677 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 678 |
+
def run_parse(file_obj, progress=gr.Progress()):
    """Top-level handler: validate → parse → format outputs.

    Returns a 5-tuple ``(summary_md, table_rows, full_text, json_data,
    json_data)`` — the JSON payload is emitted twice because it feeds
    both the JSON viewer and the row-detail state component.

    Raises:
        gr.Error: For all user-facing failures (validation, OOM, or any
            unexpected exception, wrapped with a friendly message).
    """
    try:
        # Gradio may hand us a filepath string or a tempfile-like object.
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)

        progress(0.0, desc="Validating upload...")
        path = validate_upload(path)

        gr.Info(f"📦 Processing ZIP file ({format_size(os.path.getsize(path))})...")

        results, stats = parse_zip(path, progress)

        progress(0.95, desc="Building output...")
        summary_md = build_summary(stats, results)
        rows = [pf.to_table_row() for pf in results]
        text_blob = build_full_text(results)
        payload = build_json(results)

        progress(1.0, desc="Done!")

        # Surface the overall outcome as a toast, worst case first.
        if stats.parse_errors > 0:
            gr.Warning(f"{stats.parse_errors} file(s) failed to parse. See details below.")
        elif stats.parse_warnings > 0:
            gr.Info(f"✅ Parsed {stats.parsed_ok} files with {stats.parse_warnings} warning(s).")
        else:
            gr.Info(f"✅ Successfully parsed all {stats.parsed_ok} files!")

        return summary_md, rows, text_blob, payload, payload

    except gr.Error:
        # Already a user-facing error — let Gradio display it unchanged.
        raise

    except MemoryError:
        logger.error("MemoryError during ZIP processing")
        raise gr.Error(
            "💥 Out of memory! The ZIP file contents are too large to process. "
            "Try a smaller archive or one with fewer/smaller files."
        )

    except Exception as e:
        logger.error(f"Unexpected error: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise gr.Error(
            f"💥 An unexpected error occurred: {type(e).__name__}: {e}\n\n"
            "If this persists, please report it as a bug."
        )
|
| 724 |
+
|
| 725 |
+
|
| 726 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 727 |
+
# Gradio UI
|
| 728 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 729 |
with gr.Blocks(
|
| 730 |
title="π¦ Document Parser",
|
|
|
|
| 731 |
) as demo:
|
| 732 |
gr.Markdown("""
|
| 733 |
# π¦ Document Parser
|
|
|
|
| 734 |
|
| 735 |
+
Upload a **ZIP file** and this tool extracts & parses text from every supported document inside it.
|
| 736 |
+
|
| 737 |
+
**Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.csv`, `.html`, `.xml`,
|
| 738 |
+
`.pdf`, `.docx`, `.xlsx`, and **40+ more** text/code formats β including `Makefile`, `Dockerfile`, `LICENSE`, etc.
|
| 739 |
+
|
| 740 |
+
**Limits:** Max ZIP size: {max_zip}MB Β· Max files: {max_files} Β· Max single file: {max_file}MB Β· Zip bomb protection enabled
|
| 741 |
+
""".format(max_zip=MAX_ZIP_SIZE_MB, max_files=MAX_FILES_IN_ZIP, max_file=MAX_SINGLE_FILE_MB))
|
| 742 |
|
| 743 |
with gr.Row():
|
| 744 |
with gr.Column(scale=1):
|
|
|
|
| 747 |
file_types=[".zip"],
|
| 748 |
type="filepath",
|
| 749 |
)
|
| 750 |
+
parse_btn = gr.Button(
|
| 751 |
+
"π Parse Documents",
|
| 752 |
+
variant="primary",
|
| 753 |
+
size="lg",
|
| 754 |
+
)
|
| 755 |
|
| 756 |
+
summary_output = gr.Markdown(label="Summary", value="*Upload a ZIP file to get started.*")
|
| 757 |
|
| 758 |
with gr.Tabs():
|
| 759 |
with gr.Tab("π File Listing"):
|
| 760 |
file_table = gr.Dataframe(
|
| 761 |
+
headers=["Status", "Filename", "Extension", "Type", "Size", "Preview"],
|
| 762 |
label="Files in Archive",
|
| 763 |
interactive=False,
|
| 764 |
wrap=True,
|
| 765 |
)
|
| 766 |
with gr.Tab("π Extracted Text"):
|
| 767 |
text_output = gr.Textbox(
|
| 768 |
+
label="Full Extracted Text (all parseable files concatenated)",
|
| 769 |
lines=30,
|
| 770 |
max_lines=100,
|
| 771 |
+
buttons=["copy"],
|
| 772 |
)
|
| 773 |
with gr.Tab("π File Detail"):
|
| 774 |
+
gr.Markdown("*Click a row in the **File Listing** tab, then switch here to see the full preview.*")
|
| 775 |
+
detail_output = gr.Markdown(
|
| 776 |
+
"βΉοΈ Select a file from the **File Listing** tab to see its full preview here."
|
| 777 |
+
)
|
| 778 |
+
with gr.Tab("π JSON Export"):
|
| 779 |
json_output = gr.JSON(label="Structured Parse Results")
|
| 780 |
|
|
|
|
| 781 |
file_data_state = gr.State([])
|
| 782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
    # Both the explicit button click and a fresh file upload trigger the
    # same parse pipeline. They share one concurrency group
    # ("parse_engine") so at most CONCURRENCY_LIMIT parses run at a time,
    # and trigger_mode="once" suppresses re-triggers while a run is
    # already in flight.
    parse_btn.click(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )

    zip_input.upload(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )

    # Row clicks in the listing populate the File Detail tab; build_detail
    # additionally receives the clicked row via its gr.SelectData parameter,
    # which Gradio injects automatically for .select() handlers.
    file_table.select(
        fn=build_detail,
        inputs=file_data_state,
        outputs=detail_output,
    )
|
| 806 |
|
| 807 |
+
# Queue requests so long-running parses don't block the UI; at most 20
# jobs may wait, and CONCURRENCY_LIMIT run simultaneously.
demo.queue(default_concurrency_limit=CONCURRENCY_LIMIT, max_size=20)

if __name__ == "__main__":
    # BUG FIX: `theme=` and `css=` are `gr.Blocks()` constructor
    # arguments, not `Blocks.launch()` arguments. Passing them to
    # launch() raises `TypeError: launch() got an unexpected keyword
    # argument 'theme'` at startup, so the app never came up. Styling
    # must instead be configured where the Blocks context is created.
    demo.launch(show_error=True)
|