ScottzillaSystems committed on
Commit
d60d975
·
verified ·
1 Parent(s): 61ca336

Refactor: production-grade error handling, progress bars, zip bomb protection, per-file isolation, Gradio 6 compat

Files changed (1)
  1. app.py +720 -192
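
The zip bomb protection the commit message refers to boils down to a decompression-ratio check: before extracting anything, compare the sum of the declared uncompressed sizes in the archive's table of contents against the size of the ZIP on disk, and refuse archives that inflate past a fixed ratio. A minimal standalone sketch of that idea, using only the standard library (illustrative names, not the app's actual `check_zip_bomb`, which appears in the diff below):

```python
# Illustrative sketch only -- the committed implementation is check_zip_bomb() in the diff.
import os
import zipfile

MAX_DECOMPRESSION_RATIO = 100  # reject archives that inflate more than 100x

def looks_like_zip_bomb(path: str) -> bool:
    """True if the archive's declared uncompressed size exceeds the ratio limit."""
    compressed = max(1, os.path.getsize(path))  # avoid division by zero
    with zipfile.ZipFile(path) as zf:
        uncompressed = sum(i.file_size for i in zf.infolist() if not i.is_dir())
    return uncompressed / compressed > MAX_DECOMPRESSION_RATIO
```

The committed version layers an absolute uncompressed-size cap and a per-file size limit on top of this ratio check, and raises gr.Error instead of returning a boolean.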
app.py CHANGED
@@ -1,233 +1,744 @@
1
- import gradio as gr
2
- import zipfile
3
- import os
4
  import io
5
- import json
6
- import tempfile
7
- import shutil
8
-
9
- # Supported text-based extensions
10
- TEXT_EXTS = {
11
- ".txt", ".md", ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css",
12
- ".json", ".yaml", ".yml", ".csv", ".xml", ".toml", ".cfg", ".ini",
13
- ".sh", ".bash", ".bat", ".ps1", ".r", ".java", ".c", ".cpp", ".h",
14
- ".hpp", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
15
- ".sql", ".dockerfile", ".makefile", ".gitignore", ".env", ".log",
16
  }
17
 
18
- # Extensions we can parse with special libraries
19
- PDF_EXTS = {".pdf"}
20
- DOCX_EXTS = {".docx"}
21
- XLSX_EXTS = {".xlsx"}
22
- IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico"}
23
-
24
-
25
- def get_file_type(filename):
26
- """Categorize file by extension."""
27
- ext = os.path.splitext(filename)[1].lower()
28
- if not ext and filename.split("/")[-1] in {"Makefile", "Dockerfile", "Procfile", ".gitignore", ".dockerignore"}:
29
- return "text", ext
30
- if ext in TEXT_EXTS:
31
- return "text", ext
32
- if ext in PDF_EXTS:
33
- return "pdf", ext
34
- if ext in DOCX_EXTS:
35
- return "docx", ext
36
- if ext in XLSX_EXTS:
37
- return "xlsx", ext
38
- if ext in IMAGE_EXTS:
39
- return "image", ext
40
- return "binary", ext
41
-
42
-
43
- def parse_pdf_content(data):
44
  """Parse PDF bytes to text using PyMuPDF."""
 
45
  try:
46
  import fitz
47
- doc = fitz.open(stream=data, filetype="pdf")
48
- text = ""
49
- for page_num, page in enumerate(doc):
50
- text += f"\n--- Page {page_num + 1} ---\n"
51
- text += page.get_text()
52
- doc.close()
53
- return text.strip() if text.strip() else "[PDF: no extractable text]"
54
  except ImportError:
55
- return "[PDF parsing unavailable - PyMuPDF not installed]"
56
  except Exception as e:
57
- return f"[PDF parse error: {e}]"
58
 
59
 
60
- def parse_docx_content(data):
61
  """Parse DOCX bytes to text."""
 
62
  try:
63
  from docx import Document
 
 
 
 
64
  doc = Document(io.BytesIO(data))
 
 
65
  paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
66
- return "\n".join(paragraphs) if paragraphs else "[DOCX: empty document]"
67
- except ImportError:
68
- return "[DOCX parsing unavailable - python-docx not installed]"
69
  except Exception as e:
70
- return f"[DOCX parse error: {e}]"
 
71
 
72
 
73
- def parse_xlsx_content(data):
74
  """Parse XLSX bytes to text summary."""
 
75
  try:
76
  import openpyxl
77
- wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
78
- text = ""
79
- for sheet_name in wb.sheetnames:
80
- ws = wb[sheet_name]
81
- text += f"\n--- Sheet: {sheet_name} ---\n"
82
- row_count = 0
83
- for row in ws.iter_rows(values_only=True):
84
- if row_count >= 50: # Limit rows shown
85
- text += f"\n... (more rows exist)\n"
86
- break
87
- text += " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
88
- row_count += 1
89
- wb.close()
90
- return text.strip() if text.strip() else "[XLSX: empty workbook]"
91
  except ImportError:
92
- return "[XLSX parsing unavailable - openpyxl not installed]"
93
  except Exception as e:
94
- return f"[XLSX parse error: {e}]"
95
 
96
 
97
- def format_size(size_bytes):
98
- """Format bytes to human-readable string."""
99
- if size_bytes < 1024:
100
- return f"{size_bytes} B"
101
- elif size_bytes < 1024 * 1024:
102
- return f"{size_bytes / 1024:.1f} KB"
103
- else:
104
- return f"{size_bytes / (1024 * 1024):.1f} MB"
105
 
 
 
106
 
107
- def parse_zip(file_obj):
108
- """Main parsing function for uploaded zip files."""
109
- if file_obj is None:
110
- return "⚠️ Please upload a ZIP file.", [], "", []
111
 
112
- file_path = file_obj if isinstance(file_obj, str) else file_obj.name
113
 
114
  if not zipfile.is_zipfile(file_path):
115
- return "❌ The uploaded file is not a valid ZIP archive.", [], "", []
116
 
117
- results = []
118
- table_rows = []
119
- full_text_parts = []
120
- stats = {"total_files": 0, "text_files": 0, "pdf_files": 0, "docx_files": 0,
121
- "xlsx_files": 0, "image_files": 0, "binary_files": 0, "total_size": 0}
 
122
 
123
- with zipfile.ZipFile(file_path, "r") as zf:
124
- for info in zf.infolist():
125
- if info.is_dir():
126
- continue
127
 
128
- stats["total_files"] += 1
129
- stats["total_size"] += info.file_size
130
- file_type, ext = get_file_type(info.filename)
131
- content_preview = ""
 
 
 
132
 
133
  try:
134
  raw_data = zf.read(info)
135
  except Exception as e:
136
- content_preview = f"[Read error: {e}]"
137
- raw_data = None
138
-
139
- if raw_data is not None:
140
- if file_type == "text":
141
- stats["text_files"] += 1
142
- try:
143
- content = raw_data.decode("utf-8", errors="replace")
144
- content_preview = content[:2000]
145
- full_text_parts.append(f"\n{'='*60}\nπŸ“„ {info.filename}\n{'='*60}\n{content}")
146
- except Exception as e:
147
- content_preview = f"[Decode error: {e}]"
148
- elif file_type == "pdf":
149
- stats["pdf_files"] += 1
150
- content = parse_pdf_content(raw_data)
151
- content_preview = content[:2000]
152
- full_text_parts.append(f"\n{'='*60}\nπŸ“• {info.filename}\n{'='*60}\n{content}")
153
- elif file_type == "docx":
154
- stats["docx_files"] += 1
155
- content = parse_docx_content(raw_data)
156
- content_preview = content[:2000]
157
- full_text_parts.append(f"\n{'='*60}\nπŸ“˜ {info.filename}\n{'='*60}\n{content}")
158
- elif file_type == "xlsx":
159
- stats["xlsx_files"] += 1
160
- content = parse_xlsx_content(raw_data)
161
- content_preview = content[:2000]
162
- full_text_parts.append(f"\n{'='*60}\nπŸ“Š {info.filename}\n{'='*60}\n{content}")
163
- elif file_type == "image":
164
- stats["image_files"] += 1
165
- content_preview = f"[Image: {ext}]"
166
  else:
167
- stats["binary_files"] += 1
168
- content_preview = f"[Binary file: {ext}]"
169
-
170
- results.append({
171
- "filename": info.filename,
172
- "type": file_type,
173
- "extension": ext or "(none)",
174
- "size": info.file_size,
175
- "size_formatted": format_size(info.file_size),
176
- "preview": content_preview[:500],
177
- })
178
-
179
- table_rows.append([
180
- info.filename,
181
- ext or "(none)",
182
- file_type,
183
- format_size(info.file_size),
184
- content_preview[:200].replace("\n", " "),
185
- ])
186
-
187
- # Build summary
188
- summary = f"""## πŸ“¦ ZIP Archive Summary
189
-
190
- | Metric | Value |
191
  |--------|-------|
192
- | **Total files** | {stats['total_files']} |
193
- | **Total size** | {format_size(stats['total_size'])} |
194
- | **Text/Code files** | {stats['text_files']} |
195
- | **PDF files** | {stats['pdf_files']} |
196
- | **DOCX files** | {stats['docx_files']} |
197
- | **XLSX files** | {stats['xlsx_files']} |
198
- | **Image files** | {stats['image_files']} |
199
- | **Binary files** | {stats['binary_files']} |
200
- """
201
 
202
- full_text = "\n".join(full_text_parts) if full_text_parts else "(No text content extracted)"
203
 
204
- return summary, table_rows, full_text, results
205
206
 
207
- def select_file_content(file_data_json, evt: gr.SelectData):
208
- """When user clicks a row in the table, show that file's full preview."""
209
- if not file_data_json or not isinstance(file_data_json, list):
210
- return "Select a file from the table above."
211
 
212
- row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
213
- if 0 <= row_idx < len(file_data_json):
214
- item = file_data_json[row_idx]
215
- return f"## πŸ“„ {item['filename']}\n**Type:** {item['type']} | **Size:** {item['size_formatted']}\n\n```\n{item.get('preview', '(no preview)')}\n```"
216
- return "File not found."
 
217
 
 
218
 
219
- # ─── Gradio UI ───────────────────────────────────────────
 
220
221
  with gr.Blocks(
222
  title="πŸ“¦ Document Parser",
223
- theme=gr.themes.Soft(),
224
  ) as demo:
225
  gr.Markdown("""
226
  # 📦 Document Parser
227
- Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.
228
 
229
- **Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.json`, `.yaml`, `.csv`, `.html`, `.pdf`, `.docx`, `.xlsx`, and 30+ more text/code formats.
230
- """)
231
 
232
  with gr.Row():
233
  with gr.Column(scale=1):
@@ -236,54 +747,71 @@ Upload a **ZIP file** containing documents and this tool will parse and extract
236
  file_types=[".zip"],
237
  type="filepath",
238
  )
239
- parse_btn = gr.Button("πŸ” Parse Documents", variant="primary", size="lg")
 
 
 
 
240
 
241
- summary_output = gr.Markdown(label="Summary")
242
 
243
  with gr.Tabs():
244
  with gr.Tab("πŸ“‹ File Listing"):
245
  file_table = gr.Dataframe(
246
- headers=["Filename", "Extension", "Type", "Size", "Preview"],
247
  label="Files in Archive",
248
  interactive=False,
249
  wrap=True,
250
  )
251
  with gr.Tab("πŸ“ Extracted Text"):
252
  text_output = gr.Textbox(
253
- label="Full Extracted Text",
254
  lines=30,
255
  max_lines=100,
256
- show_copy_button=True,
257
  )
258
  with gr.Tab("πŸ”Ž File Detail"):
259
- gr.Markdown("*Click a row in the File Listing tab to see its full preview here.*")
260
- detail_output = gr.Markdown("Select a file from the table above.")
261
- with gr.Tab("πŸ“Š JSON Data"):
 
 
262
  json_output = gr.JSON(label="Structured Parse Results")
263
 
264
- # Hidden state for file data
265
  file_data_state = gr.State([])
266
 
267
- def run_parse(file_obj):
268
- summary, table, text, data = parse_zip(file_obj)
269
- return summary, table, text, data, data
270
-
271
  parse_btn.click(
272
  fn=run_parse,
273
  inputs=zip_input,
274
  outputs=[summary_output, file_table, text_output, json_output, file_data_state],
 
 
 
275
  )
 
276
  zip_input.upload(
277
  fn=run_parse,
278
  inputs=zip_input,
279
  outputs=[summary_output, file_table, text_output, json_output, file_data_state],
 
 
 
280
  )
281
 
282
  file_table.select(
283
- fn=select_file_content,
284
  inputs=file_data_state,
285
  outputs=detail_output,
286
  )
287
 
 
 
288
  if __name__ == "__main__":
289
- demo.launch()
1
+ """
2
+ 📦 Document Parser — Production-Grade ZIP Document Extraction Tool
3
+
4
+ Features:
5
+ - Upload ZIP files and parse all supported document formats
6
+ - Supports 40+ text/code formats, PDF, DOCX, XLSX
7
+ - Zip bomb protection (decompression ratio + size limits)
8
+ - Per-file error isolation — one corrupt file won't crash the whole parse
9
+ - Progress bars for real-time feedback
10
+ - Concurrency-limited to prevent resource exhaustion
11
+ - Full structured JSON export + file detail drill-down
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
  import io
17
+ import logging
18
+ import os
19
+ import traceback
20
+ import zipfile
21
+ from dataclasses import dataclass, field
22
+ from enum import Enum
23
+ from typing import Optional
24
+
25
+ import gradio as gr
26
+
27
+ # ──────────────────────────────────────────────────────────────────────────────
28
+ # Configuration constants
29
+ # ──────────────────────────────────────────────────────────────────────────────
30
+ MAX_ZIP_SIZE_MB = 200
31
+ MAX_FILES_IN_ZIP = 500
32
+ MAX_SINGLE_FILE_MB = 50
33
+ MAX_DECOMPRESSION_RATIO = 100 # zip bomb guard: reject if total > ratio × compressed
34
+ MAX_PREVIEW_CHARS = 5_000
35
+ MAX_FULL_TEXT_CHARS = 500_000
36
+ MAX_XLSX_ROWS = 100
37
+ CONCURRENCY_LIMIT = 3
38
+
39
+ # ──────────────────────────────────────────────────────────────────────────────
40
+ # Logging
41
+ # ──────────────────────────────────────────────────────────────────────────────
42
+ logger = logging.getLogger("document_parser")
43
+ logging.basicConfig(
44
+ level=logging.INFO,
45
+ format="%(asctime)s | %(levelname)s | %(message)s",
46
+ )
47
+
48
+
49
+ # ──────────────────────────────────────────────────────────────────────────────
50
+ # File classification
51
+ # ──────────────────────────────────────────────────────────────────────────────
52
+ class FileCategory(str, Enum):
53
+ TEXT = "text"
54
+ PDF = "pdf"
55
+ DOCX = "docx"
56
+ XLSX = "xlsx"
57
+ IMAGE = "image"
58
+ BINARY = "binary"
59
+
60
+
61
+ TEXT_EXTENSIONS = frozenset({
62
+ ".txt", ".md", ".rst", ".py", ".js", ".ts", ".jsx", ".tsx", ".html",
63
+ ".htm", ".css", ".scss", ".less", ".json", ".jsonl", ".yaml", ".yml",
64
+ ".csv", ".tsv", ".xml", ".toml", ".cfg", ".ini", ".conf", ".properties",
65
+ ".sh", ".bash", ".zsh", ".fish", ".bat", ".ps1", ".cmd",
66
+ ".r", ".rmd", ".java", ".c", ".cpp", ".h", ".hpp", ".cc", ".cxx",
67
+ ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".kts", ".scala", ".clj",
68
+ ".sql", ".graphql", ".gql", ".proto", ".thrift",
69
+ ".dockerfile", ".makefile", ".cmake",
70
+ ".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
71
+ ".env", ".env.example", ".log", ".tex", ".bib", ".sty",
72
+ ".lua", ".vim", ".el", ".lisp", ".hs", ".ml", ".mli", ".ex", ".exs",
73
+ ".erl", ".hrl", ".dart", ".v", ".sv", ".vhd", ".vhdl",
74
+ ".tf", ".tfvars", ".hcl", ".nix", ".dhall",
75
+ ".ipynb",
76
+ })
77
+
78
+ KNOWN_TEXT_FILENAMES = frozenset({
79
+ "Makefile", "Dockerfile", "Procfile", "Vagrantfile", "Gemfile",
80
+ "Rakefile", "Brewfile", "Justfile", "Taskfile",
81
+ ".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
82
+ ".eslintrc", ".prettierrc", ".babelrc", ".browserslistrc",
83
+ "LICENSE", "LICENCE", "COPYING", "AUTHORS", "CONTRIBUTORS",
84
+ "CHANGELOG", "CHANGES", "HISTORY", "NEWS",
85
+ "README", "INSTALL", "TODO", "HACKING",
86
+ "requirements.txt",
87
+ })
88
+
89
+ IMAGE_EXTENSIONS = frozenset({
90
+ ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico",
91
+ ".tiff", ".tif", ".avif", ".heic", ".heif",
92
+ })
93
+
94
+ CATEGORY_EMOJI = {
95
+ FileCategory.TEXT: "📄",
96
+ FileCategory.PDF: "📕",
97
+ FileCategory.DOCX: "📘",
98
+ FileCategory.XLSX: "📊",
99
+ FileCategory.IMAGE: "🖼️",
100
+ FileCategory.BINARY: "📦",
101
  }
102
 
103
+
104
+ def classify_file(filename: str) -> tuple[FileCategory, str]:
105
+ """Classify a file by its extension and known filename patterns."""
106
+ basename = filename.rsplit("/", 1)[-1] if "/" in filename else filename
107
+ ext = os.path.splitext(basename)[1].lower()
108
+
109
+ if not ext and basename in KNOWN_TEXT_FILENAMES:
110
+ return FileCategory.TEXT, ""
111
+ if not ext and basename.upper() in {n.upper() for n in KNOWN_TEXT_FILENAMES}:
112
+ return FileCategory.TEXT, ""
113
+
114
+ if ext in TEXT_EXTENSIONS:
115
+ return FileCategory.TEXT, ext
116
+ if ext == ".pdf":
117
+ return FileCategory.PDF, ext
118
+ if ext == ".docx":
119
+ return FileCategory.DOCX, ext
120
+ if ext in {".xlsx", ".xls"}:
121
+ return FileCategory.XLSX, ext
122
+ if ext in IMAGE_EXTENSIONS:
123
+ return FileCategory.IMAGE, ext
124
+ return FileCategory.BINARY, ext
125
+
126
+
127
+ # ──────────────────────────────────────────────────────────────────────────────
128
+ # Data classes
129
+ # ──────────────────────────────────────────────────────────────────────────────
130
+ @dataclass
131
+ class ParsedFile:
132
+ filename: str
133
+ category: str
134
+ extension: str
135
+ size_bytes: int
136
+ size_display: str
137
+ content: str = ""
138
+ preview: str = ""
139
+ error: Optional[str] = None
140
+ warnings: list[str] = field(default_factory=list)
141
+
142
+ def to_table_row(self) -> list:
143
+ status = "⚠️" if self.warnings else ("❌" if self.error else "βœ…")
144
+ preview_text = self.error or self.preview[:200].replace("\n", " ")
145
+ return [
146
+ status,
147
+ self.filename,
148
+ self.extension or "(none)",
149
+ self.category,
150
+ self.size_display,
151
+ preview_text,
152
+ ]
153
+
154
+
155
+ @dataclass
156
+ class ParseStats:
157
+ total_files: int = 0
158
+ parsed_ok: int = 0
159
+ parse_warnings: int = 0
160
+ parse_errors: int = 0
161
+ skipped_dirs: int = 0
162
+ total_compressed_bytes: int = 0
163
+ total_uncompressed_bytes: int = 0
164
+ by_category: dict = field(default_factory=lambda: {c.value: 0 for c in FileCategory})
165
+
166
+
167
+ # ──────────────────────────────────────────────────────────────────────────────
168
+ # Size formatting
169
+ # ──────────────────────────────────────────────────────────────────────────────
170
+ def format_size(size_bytes: int) -> str:
171
+ if size_bytes < 0:
172
+ return "0 B"
173
+ if size_bytes < 1024:
174
+ return f"{size_bytes} B"
175
+ elif size_bytes < 1024 ** 2:
176
+ return f"{size_bytes / 1024:.1f} KB"
177
+ elif size_bytes < 1024 ** 3:
178
+ return f"{size_bytes / (1024 ** 2):.1f} MB"
179
+ else:
180
+ return f"{size_bytes / (1024 ** 3):.2f} GB"
181
+
182
+
183
+ # ──────────────────────────────────────────────────────────────────────────────
184
+ # Document parsers — each returns (content, warnings) or raises
185
+ # ──────────────────────────────────────────────────────────────────────────────
186
+ def parse_text_content(data: bytes, filename: str) -> tuple[str, list[str]]:
187
+ """Parse plain text / code files."""
188
+ warnings = []
189
+ try:
190
+ content = data.decode("utf-8")
191
+ except UnicodeDecodeError:
192
+ try:
193
+ content = data.decode("latin-1")
194
+ warnings.append("Decoded with latin-1 fallback (not valid UTF-8)")
195
+ except Exception:
196
+ content = data.decode("utf-8", errors="replace")
197
+ warnings.append("Contains invalid bytes; replaced with placeholders")
198
+
199
+ if len(content) > MAX_FULL_TEXT_CHARS:
200
+ warnings.append(f"Content truncated to {MAX_FULL_TEXT_CHARS:,} characters (original: {len(content):,})")
201
+ content = content[:MAX_FULL_TEXT_CHARS] + "\n\n... [TRUNCATED]"
202
+
203
+ return content, warnings
204
+
205
+
206
+ def parse_pdf_content(data: bytes, filename: str) -> tuple[str, list[str]]:
207
  """Parse PDF bytes to text using PyMuPDF."""
208
+ warnings = []
209
  try:
210
  import fitz
211
  except ImportError:
212
+ return "[PDF library not available]", ["PyMuPDF not installed β€” install with: pip install PyMuPDF"]
213
+
214
+ doc = None
215
+ try:
216
+ doc = fitz.open(stream=data, filetype="pdf")
217
+ if doc.is_encrypted:
218
+ return "", ["PDF is password-protected and cannot be parsed"]
219
+
220
+ page_count = len(doc)
221
+ if page_count == 0:
222
+ return "", ["PDF has 0 pages"]
223
+
224
+ text_parts = []
225
+ empty_pages = 0
226
+ for page_num in range(page_count):
227
+ try:
228
+ page = doc[page_num]
229
+ page_text = page.get_text().strip()
230
+ if page_text:
231
+ text_parts.append(f"\n--- Page {page_num + 1}/{page_count} ---\n{page_text}")
232
+ else:
233
+ empty_pages += 1
234
+ except Exception as e:
235
+ warnings.append(f"Page {page_num + 1} failed: {type(e).__name__}: {e}")
236
+
237
+ if empty_pages > 0:
238
+ warnings.append(f"{empty_pages}/{page_count} pages had no extractable text (may be scanned/image-based)")
239
+
240
+ content = "\n".join(text_parts) if text_parts else "[No extractable text found]"
241
+ if not text_parts and empty_pages == page_count:
242
+ warnings.append("PDF appears to be entirely image-based; OCR would be needed to extract text")
243
+
244
+ return content, warnings
245
+
246
  except Exception as e:
247
+ logger.error(f"PDF parse error for {filename}: {e}")
248
+ return "", [f"PDF parse failed: {type(e).__name__}: {e}"]
249
+ finally:
250
+ if doc:
251
+ try:
252
+ doc.close()
253
+ except Exception:
254
+ pass
255
 
256
 
257
+ def parse_docx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
258
  """Parse DOCX bytes to text."""
259
+ warnings = []
260
  try:
261
  from docx import Document
262
+ except ImportError:
263
+ return "[DOCX library not available]", ["python-docx not installed"]
264
+
265
+ try:
266
  doc = Document(io.BytesIO(data))
267
+ parts = []
268
+
269
  paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
270
+ if paragraphs:
271
+ parts.extend(paragraphs)
272
+
273
+ for i, table in enumerate(doc.tables):
274
+ try:
275
+ table_text = f"\n--- Table {i + 1} ---\n"
276
+ for row in table.rows:
277
+ cells = [cell.text.strip() for cell in row.cells]
278
+ table_text += " | ".join(cells) + "\n"
279
+ parts.append(table_text)
280
+ except Exception as e:
281
+ warnings.append(f"Table {i + 1} extraction failed: {e}")
282
+
283
+ content = "\n".join(parts) if parts else "[DOCX: empty document]"
284
+ if not parts:
285
+ warnings.append("Document contains no paragraphs or tables")
286
+
287
+ return content, warnings
288
+
289
  except Exception as e:
290
+ logger.error(f"DOCX parse error for {filename}: {e}")
291
+ return "", [f"DOCX parse failed: {type(e).__name__}: {e}"]
292
 
293
 
294
+ def parse_xlsx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
295
  """Parse XLSX bytes to text summary."""
296
+ warnings = []
297
  try:
298
  import openpyxl
299
  except ImportError:
300
+ return "[XLSX library not available]", ["openpyxl not installed"]
301
+
302
+ wb = None
303
+ try:
304
+ wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
305
+ parts = []
306
+
307
+ for sheet_name in wb.sheetnames:
308
+ try:
309
+ ws = wb[sheet_name]
310
+ sheet_text = f"\n--- Sheet: {sheet_name} ---\n"
311
+ row_count = 0
312
+ for row in ws.iter_rows(values_only=True):
313
+ if row_count >= MAX_XLSX_ROWS:
314
+ sheet_text += f"\n... (truncated at {MAX_XLSX_ROWS} rows)\n"
315
+ warnings.append(f"Sheet '{sheet_name}' truncated at {MAX_XLSX_ROWS} rows")
316
+ break
317
+ cells = [str(cell) if cell is not None else "" for cell in row]
318
+ sheet_text += " | ".join(cells) + "\n"
319
+ row_count += 1
320
+ if row_count == 0:
321
+ sheet_text += "(empty sheet)\n"
322
+ parts.append(sheet_text)
323
+ except Exception as e:
324
+ warnings.append(f"Sheet '{sheet_name}' failed: {type(e).__name__}: {e}")
325
+
326
+ content = "\n".join(parts) if parts else "[XLSX: empty workbook]"
327
+ return content, warnings
328
+
329
  except Exception as e:
330
+ logger.error(f"XLSX parse error for {filename}: {e}")
331
+ return "", [f"XLSX parse failed: {type(e).__name__}: {e}"]
332
+ finally:
333
+ if wb:
334
+ try:
335
+ wb.close()
336
+ except Exception:
337
+ pass
338
 
339
 
340
+ # ──────────────────────────────────────────────────────────────────────────────
341
+ # Validation layer
342
+ # ──────────────────────────────────────────────────────────────────────────────
343
+ def validate_upload(file_path: str | None) -> str:
344
+ """Validate the uploaded file. Returns the resolved file path. Raises gr.Error on failure."""
345
+ if file_path is None:
346
+ raise gr.Error("⚠️ Please upload a ZIP file first.")
 
347
 
348
+ if not os.path.isfile(file_path):
349
+ raise gr.Error("❌ Upload failed β€” file not found on server. Please try again.")
350
 
351
+ file_size = os.path.getsize(file_path)
352
+ if file_size == 0:
353
+ raise gr.Error("❌ The uploaded file is empty (0 bytes).")
 
354
 
355
+ size_mb = file_size / (1024 ** 2)
356
+ if size_mb > MAX_ZIP_SIZE_MB:
357
+ raise gr.Error(
358
+ f"❌ File too large: {size_mb:.1f} MB. "
359
+ f"Maximum allowed is {MAX_ZIP_SIZE_MB} MB."
360
+ )
361
 
362
  if not zipfile.is_zipfile(file_path):
363
+ raise gr.Error(
364
+ "❌ Not a valid ZIP archive. The file may be corrupted, "
365
+ "or it may be a different archive format (tar, rar, 7z)."
366
+ )
367
+
368
+ return file_path
369
+
370
+
371
+ def check_zip_bomb(zf: zipfile.ZipFile, compressed_size: int) -> list[str]:
372
+ """Check for zip bomb indicators. Returns warnings. Raises gr.Error if malicious."""
373
+ warnings = []
374
+ total_uncompressed = sum(info.file_size for info in zf.infolist() if not info.is_dir())
375
+
376
+ if compressed_size > 0:
377
+ ratio = total_uncompressed / compressed_size
378
+ if ratio > MAX_DECOMPRESSION_RATIO:
379
+ raise gr.Error(
380
+ f"πŸ›‘οΈ Zip bomb detected! Decompression ratio is {ratio:.0f}x "
381
+ f"(compressed: {format_size(compressed_size)}, "
382
+ f"uncompressed: {format_size(total_uncompressed)}). "
383
+ f"Maximum allowed ratio is {MAX_DECOMPRESSION_RATIO}x."
384
+ )
385
+ if ratio > MAX_DECOMPRESSION_RATIO / 2:
386
+ warnings.append(
387
+ f"High decompression ratio ({ratio:.0f}x) β€” approaching the "
388
+ f"{MAX_DECOMPRESSION_RATIO}x safety limit"
389
+ )
390
 
391
+ total_uncompressed_mb = total_uncompressed / (1024 ** 2)
392
+ if total_uncompressed_mb > MAX_ZIP_SIZE_MB * 5:
393
+ raise gr.Error(
394
+ f"πŸ›‘οΈ Uncompressed content too large: {total_uncompressed_mb:.0f} MB. "
395
+ f"Maximum is {MAX_ZIP_SIZE_MB * 5} MB."
396
+ )
397
 
398
+ return warnings
399
+
400
+
401
+ # ──────────────────────────────────────────────────────────────────────────────
402
+ # Core parsing engine
403
+ # ──────────────────────────────────────────────────────────────────────────────
404
+ def parse_zip(file_path: str, progress: gr.Progress) -> tuple[list[ParsedFile], ParseStats]:
405
+ """Parse all files in a ZIP archive with per-file error isolation."""
406
+ file_size = os.path.getsize(file_path)
407
+ stats = ParseStats()
408
+
409
+ try:
410
+ zf = zipfile.ZipFile(file_path, "r")
411
+ except zipfile.BadZipFile:
412
+ raise gr.Error("❌ ZIP file is corrupted and cannot be opened.")
413
+ except Exception as e:
414
+ raise gr.Error(f"❌ Failed to open ZIP: {type(e).__name__}: {e}")
415
+
416
+ try:
417
+ bomb_warnings = check_zip_bomb(zf, file_size)
418
+
419
+ entries = [info for info in zf.infolist() if not info.is_dir()]
420
+ stats.skipped_dirs = len(zf.infolist()) - len(entries)
421
+ stats.total_files = len(entries)
422
+ stats.total_compressed_bytes = file_size
423
+
424
+ if stats.total_files == 0:
425
+ raise gr.Error("❌ ZIP archive contains no files (only directories).")
426
+
427
+ truncated = False
428
+ if stats.total_files > MAX_FILES_IN_ZIP:
429
+ gr.Warning(
430
+ f"ZIP contains {stats.total_files} files β€” "
431
+ f"processing first {MAX_FILES_IN_ZIP} only."
432
+ )
433
+ entries = entries[:MAX_FILES_IN_ZIP]
434
+ truncated = True
435
+
436
+ parsed_files: list[ParsedFile] = []
437
+
438
+ for i, info in enumerate(progress.tqdm(entries, desc="Parsing documents")):
439
+ category, ext = classify_file(info.filename)
440
+ stats.by_category[category.value] += 1
441
+ stats.total_uncompressed_bytes += info.file_size
442
+
443
+ pf = ParsedFile(
444
+ filename=info.filename,
445
+ category=category.value,
446
+ extension=ext or "(none)",
447
+ size_bytes=info.file_size,
448
+ size_display=format_size(info.file_size),
449
+ )
450
 
451
+ file_mb = info.file_size / (1024 ** 2)
452
+ if file_mb > MAX_SINGLE_FILE_MB:
453
+ pf.error = f"Skipped: file too large ({file_mb:.1f} MB > {MAX_SINGLE_FILE_MB} MB limit)"
454
+ pf.warnings.append(pf.error)
455
+ stats.parse_warnings += 1
456
+ parsed_files.append(pf)
457
+ continue
458
 
459
  try:
460
  raw_data = zf.read(info)
461
+ except RuntimeError as e:
462
+ pf.error = f"Cannot read: {e}"
463
+ if "password" in str(e).lower():
464
+ pf.error = "File is password-protected"
465
+ stats.parse_errors += 1
466
+ parsed_files.append(pf)
467
+ continue
468
  except Exception as e:
469
+ pf.error = f"Read failed: {type(e).__name__}: {e}"
470
+ stats.parse_errors += 1
471
+ parsed_files.append(pf)
472
+ continue
473
+
474
+ try:
475
+ if category == FileCategory.TEXT:
476
+ content, warnings = parse_text_content(raw_data, info.filename)
477
+ elif category == FileCategory.PDF:
478
+ content, warnings = parse_pdf_content(raw_data, info.filename)
479
+ elif category == FileCategory.DOCX:
480
+ content, warnings = parse_docx_content(raw_data, info.filename)
481
+ elif category == FileCategory.XLSX:
482
+ content, warnings = parse_xlsx_content(raw_data, info.filename)
483
+ elif category == FileCategory.IMAGE:
484
+ content = ""
485
+ warnings = []
486
+ pf.preview = f"[Image: {ext}, {pf.size_display}]"
487
+ else:
488
+ content = ""
489
+ warnings = []
490
+ pf.preview = f"[Binary: {ext}, {pf.size_display}]"
491
+
492
+ pf.content = content
493
+ pf.preview = content[:MAX_PREVIEW_CHARS] if content else pf.preview
494
+ pf.warnings = warnings
495
+
496
+ if warnings:
497
+ stats.parse_warnings += 1
 
498
  else:
499
+ stats.parsed_ok += 1
500
+
501
+ except MemoryError:
502
+ pf.error = "Out of memory while parsing this file"
503
+ stats.parse_errors += 1
504
+ logger.error(f"MemoryError parsing {info.filename}")
505
+ except Exception as e:
506
+ pf.error = f"Parse failed: {type(e).__name__}: {e}"
507
+ stats.parse_errors += 1
508
+ logger.error(f"Parse error for {info.filename}: {e}")
509
+ traceback.print_exc()
510
+
511
+ parsed_files.append(pf)
512
+
513
+ if bomb_warnings:
514
+ for w in bomb_warnings:
515
+ gr.Warning(w)
516
+
517
+ if truncated:
518
+ stats.parse_warnings += 1
519
+
520
+ return parsed_files, stats
521
+
522
+ finally:
523
+ try:
524
+ zf.close()
525
+ except Exception:
526
+ pass
527
+
528
+
529
+ # ──────────────────────────────────────────────────────────────────────────────
530
+ # Output formatters
531
+ # ──────────────────────────────────────────────────────────────────────────────
532
+ def build_summary(stats: ParseStats, parsed_files: list[ParsedFile]) -> str:
533
+ """Build a rich markdown summary."""
534
+ alerts = []
535
+ if stats.parse_errors > 0:
536
+ alerts.append(f"⚠️ **{stats.parse_errors} file(s) failed to parse** β€” see ❌ markers in the file listing")
537
+ if stats.parse_warnings > 0:
538
+ alerts.append(f"ℹ️ **{stats.parse_warnings} file(s) had warnings** β€” see ⚠️ markers in the file listing")
539
+
540
+ alert_block = "\n".join(alerts) + "\n\n" if alerts else ""
541
+
542
+ error_files = [pf for pf in parsed_files if pf.error]
543
+ error_block = ""
544
+ if error_files:
545
+ error_lines = []
546
+ for pf in error_files[:10]:
547
+ error_lines.append(f"- `{pf.filename}`: {pf.error}")
548
+ if len(error_files) > 10:
549
+ error_lines.append(f"- ... and {len(error_files) - 10} more")
550
+ error_block = "\n### ❌ Failed Files\n" + "\n".join(error_lines) + "\n\n"
551
+
552
+ return f"""## πŸ“¦ ZIP Archive Summary
553
+
554
+ {alert_block}| Metric | Value |
555
  |--------|-------|
556
+ | **Total files** | {stats.total_files} |
557
+ | **Parsed successfully** | {stats.parsed_ok} |
558
+ | **With warnings** | {stats.parse_warnings} |
559
+ | **Failed** | {stats.parse_errors} |
560
+ | **Compressed size** | {format_size(stats.total_compressed_bytes)} |
561
+ | **Uncompressed size** | {format_size(stats.total_uncompressed_bytes)} |
562
+ | **Directories skipped** | {stats.skipped_dirs} |
563
+
564
+ ### 📊 File Types
565
+ | Category | Count |
566
+ |----------|-------|
567
+ | Text/Code | {stats.by_category.get('text', 0)} |
568
+ | PDF | {stats.by_category.get('pdf', 0)} |
569
+ | DOCX | {stats.by_category.get('docx', 0)} |
570
+ | XLSX | {stats.by_category.get('xlsx', 0)} |
571
+ | Image | {stats.by_category.get('image', 0)} |
572
+ | Binary | {stats.by_category.get('binary', 0)} |
573
+
574
+ {error_block}"""
575
+
576
+
577
+ def build_full_text(parsed_files: list[ParsedFile]) -> str:
578
+ """Build concatenated text output from all parsed files."""
579
+ parts = []
580
+ for pf in parsed_files:
581
+ if pf.content:
582
+ emoji = CATEGORY_EMOJI.get(FileCategory(pf.category), "📄")
583
+ parts.append(
584
+ f"\n{'=' * 70}\n"
585
+ f"{emoji} {pf.filename}"
586
+ f"{' ⚠️ ' + ', '.join(pf.warnings) if pf.warnings else ''}\n"
587
+ f"{'=' * 70}\n"
588
+ f"{pf.content}"
589
+ )
590
+ elif pf.error:
591
+ parts.append(
592
+ f"\n{'=' * 70}\n"
593
+ f"❌ {pf.filename} β€” ERROR: {pf.error}\n"
594
+ f"{'=' * 70}"
595
+ )
596
+
597
+ if not parts:
598
+ return "(No text content was extracted from any file in the archive.)"
599
+
600
+ full = "\n".join(parts)
601
+ if len(full) > MAX_FULL_TEXT_CHARS:
602
+ full = full[:MAX_FULL_TEXT_CHARS] + "\n\n... [OUTPUT TRUNCATED — too large to display fully]"
603
+ return full
604
+
605
+
606
+ def build_json(parsed_files: list[ParsedFile]) -> list[dict]:
607
+ """Build structured JSON output."""
608
+ output = []
609
+ for pf in parsed_files:
610
+ entry = {
611
+ "filename": pf.filename,
612
+ "category": pf.category,
613
+ "extension": pf.extension,
614
+ "size_bytes": pf.size_bytes,
615
+ "size_display": pf.size_display,
616
+ "preview": pf.preview[:1000],
617
+ "status": "error" if pf.error else ("warning" if pf.warnings else "ok"),
618
+ }
619
+ if pf.error:
620
+ entry["error"] = pf.error
621
+ if pf.warnings:
622
+ entry["warnings"] = pf.warnings
623
+ output.append(entry)
624
+ return output
625
+
626
+
627
+ def build_detail(file_data: list[dict], evt: gr.SelectData) -> str:
628
+ """Build detail view when user clicks a table row."""
629
+ if not file_data or not isinstance(file_data, list):
630
+ return "ℹ️ Select a file from the **File Listing** tab to see its full preview here."
631
+
632
+ try:
633
+ row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
634
+ except (TypeError, IndexError):
635
+ return "⚠️ Could not determine selected row. Please click a row in the file listing."
636
+
637
+ if not (0 <= row_idx < len(file_data)):
638
+ return f"⚠️ Row index {row_idx} is out of range (0–{len(file_data) - 1})."
639
+
640
+ item = file_data[row_idx]
641
+
642
+ header = f"## {CATEGORY_EMOJI.get(item.get('category', ''), 'πŸ“„')} {item['filename']}\n"
643
+ meta = f"**Category:** {item.get('category', 'unknown')} | **Size:** {item.get('size_display', 'unknown')}\n\n"
644
+
645
+ sections = [header, meta]
646
+
647
+ if item.get("error"):
648
+ sections.append(f"### ❌ Error\n```\n{item['error']}\n```\n")
649
+
650
+ if item.get("warnings"):
651
+ sections.append("### ⚠️ Warnings\n" + "\n".join(f"- {w}" for w in item["warnings"]) + "\n\n")
652
+
653
+ preview = item.get("preview", "")
654
+ if preview and not preview.startswith("["):
655
+ ext = item.get("extension", "").lstrip(".")
656
+ lang_map = {
657
+ "py": "python", "js": "javascript", "ts": "typescript",
658
+ "json": "json", "yaml": "yaml", "yml": "yaml",
659
+ "html": "html", "htm": "html", "css": "css",
660
+ "sql": "sql", "sh": "bash", "bash": "bash",
661
+ "java": "java", "c": "c", "cpp": "cpp", "go": "go",
662
+ "rs": "rust", "rb": "ruby", "php": "php", "xml": "xml",
663
+ "md": "markdown", "toml": "toml", "csv": "csv",
664
+ }
665
+ lang = lang_map.get(ext, "")
666
+ sections.append(f"### πŸ“ Content Preview\n```{lang}\n{preview}\n```")
667
+ elif preview:
668
+ sections.append(f"### πŸ“ Info\n{preview}")
669
+ else:
670
+ sections.append("*(No content to preview for this file type.)*")
671
+
672
+ return "\n".join(sections)
673
+
674
+
675
+ # ──────────────────────────────────────────────────────────────────────────────
676
+ # Main entry point
677
+ # ──────────────────────────────────────────────────────────────────────────────
678
+ def run_parse(file_obj, progress=gr.Progress()):
679
+ """Top-level handler: validate β†’ parse β†’ format outputs."""
680
+ try:
681
+ file_path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)
682
+
683
+ progress(0.0, desc="Validating upload...")
684
+ file_path = validate_upload(file_path)
685
 
686
+ gr.Info(f"πŸ“¦ Processing ZIP file ({format_size(os.path.getsize(file_path))})...")
687
 
688
+ parsed_files, stats = parse_zip(file_path, progress)
689
 
690
+ progress(0.95, desc="Building output...")
691
+ summary = build_summary(stats, parsed_files)
692
+ table_rows = [pf.to_table_row() for pf in parsed_files]
693
+ full_text = build_full_text(parsed_files)
694
+ json_data = build_json(parsed_files)
695
 
696
+ progress(1.0, desc="Done!")
 
 
 
697
 
698
+ if stats.parse_errors > 0:
699
+ gr.Warning(f"{stats.parse_errors} file(s) failed to parse. See details below.")
700
+ elif stats.parse_warnings > 0:
701
+ gr.Info(f"βœ… Parsed {stats.parsed_ok} files with {stats.parse_warnings} warning(s).")
702
+ else:
703
+ gr.Info(f"βœ… Successfully parsed all {stats.parsed_ok} files!")
704
 
705
+ return summary, table_rows, full_text, json_data, json_data
706
 
707
+ except gr.Error:
708
+ raise
709
 
710
+ except MemoryError:
711
+ logger.error("MemoryError during ZIP processing")
712
+ raise gr.Error(
713
+ "πŸ’₯ Out of memory! The ZIP file contents are too large to process. "
714
+ "Try a smaller archive or one with fewer/smaller files."
715
+ )
716
+
717
+ except Exception as e:
718
+ logger.error(f"Unexpected error: {type(e).__name__}: {e}")
719
+ traceback.print_exc()
720
+ raise gr.Error(
721
+ f"πŸ’₯ An unexpected error occurred: {type(e).__name__}: {e}\n\n"
722
+ "If this persists, please report it as a bug."
723
+ )
724
+
725
+
726
+ # ──────────────────────────────────────────────────────────────────────────────
727
+ # Gradio UI
728
+ # ──────────────────────────────────────────────────────────────────────────────
729
  with gr.Blocks(
730
  title="πŸ“¦ Document Parser",
 
731
  ) as demo:
732
  gr.Markdown("""
733
  # 📦 Document Parser
 
734
 
735
+ Upload a **ZIP file** and this tool extracts & parses text from every supported document inside it.
736
+
737
+ **Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.csv`, `.html`, `.xml`,
738
+ `.pdf`, `.docx`, `.xlsx`, and **40+ more** text/code formats — including `Makefile`, `Dockerfile`, `LICENSE`, etc.
739
+
740
+ **Limits:** Max ZIP size: {max_zip}MB · Max files: {max_files} · Max single file: {max_file}MB · Zip bomb protection enabled
741
+ """.format(max_zip=MAX_ZIP_SIZE_MB, max_files=MAX_FILES_IN_ZIP, max_file=MAX_SINGLE_FILE_MB))
742
 
743
  with gr.Row():
744
  with gr.Column(scale=1):
 
747
  file_types=[".zip"],
748
  type="filepath",
749
  )
750
+ parse_btn = gr.Button(
751
+ "πŸ” Parse Documents",
752
+ variant="primary",
753
+ size="lg",
754
+ )
755
 
756
+ summary_output = gr.Markdown(label="Summary", value="*Upload a ZIP file to get started.*")
757
 
758
  with gr.Tabs():
759
  with gr.Tab("πŸ“‹ File Listing"):
760
  file_table = gr.Dataframe(
761
+ headers=["Status", "Filename", "Extension", "Type", "Size", "Preview"],
762
  label="Files in Archive",
763
  interactive=False,
764
  wrap=True,
765
  )
766
  with gr.Tab("πŸ“ Extracted Text"):
767
  text_output = gr.Textbox(
768
+ label="Full Extracted Text (all parseable files concatenated)",
769
  lines=30,
770
  max_lines=100,
771
+ buttons=["copy"],
772
  )
773
  with gr.Tab("πŸ”Ž File Detail"):
774
+ gr.Markdown("*Click a row in the **File Listing** tab, then switch here to see the full preview.*")
775
+ detail_output = gr.Markdown(
776
+ "ℹ️ Select a file from the **File Listing** tab to see its full preview here."
777
+ )
778
+ with gr.Tab("πŸ“Š JSON Export"):
779
  json_output = gr.JSON(label="Structured Parse Results")
780
 
 
781
  file_data_state = gr.State([])
782
 
 
 
 
 
783
  parse_btn.click(
784
  fn=run_parse,
785
  inputs=zip_input,
786
  outputs=[summary_output, file_table, text_output, json_output, file_data_state],
787
+ concurrency_limit=CONCURRENCY_LIMIT,
788
+ concurrency_id="parse_engine",
789
+ trigger_mode="once",
790
  )
791
+
792
  zip_input.upload(
793
  fn=run_parse,
794
  inputs=zip_input,
795
  outputs=[summary_output, file_table, text_output, json_output, file_data_state],
796
+ concurrency_limit=CONCURRENCY_LIMIT,
797
+ concurrency_id="parse_engine",
798
+ trigger_mode="once",
799
  )
800
 
801
  file_table.select(
802
+ fn=build_detail,
803
  inputs=file_data_state,
804
  outputs=detail_output,
805
  )
806
 
807
+ demo.queue(default_concurrency_limit=CONCURRENCY_LIMIT, max_size=20)
808
+
809
  if __name__ == "__main__":
810
+ demo.launch(
811
+ show_error=True,
812
+ theme=gr.themes.Soft(),
813
+ css="""
814
+ .file-table { font-size: 0.9em; }
815
+ footer { display: none !important; }
816
+ """,
817
+ )
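
The PDF, DOCX and XLSX parsers in this refactor degrade gracefully when their libraries are missing, so the only hard dependency is Gradio; PyMuPDF (imported as fitz), python-docx and openpyxl are optional extras named in the ImportError fallback messages. A purely illustrative local check before running app.py (not part of the commit, standard library only):

```python
# Illustrative helper: report which optional parser libraries from the
# diff's ImportError fallbacks are currently installed.
import importlib.util

for module, package in [("fitz", "PyMuPDF"), ("docx", "python-docx"), ("openpyxl", "openpyxl")]:
    found = importlib.util.find_spec(module) is not None
    print(f"{package:<12} {'OK' if found else 'missing -> pip install ' + package}")
```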