Spaces:
Sleeping
Sleeping
Refactor: production-grade error handling, progress bars, zip bomb protection, per-file isolation, Gradio 6 compat
Browse files
app.py
CHANGED
|
@@ -1,233 +1,744 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
import io
|
| 5 |
-
import
|
| 6 |
-
import
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
}
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
"""Parse PDF bytes to text using PyMuPDF."""
|
|
|
|
| 45 |
try:
|
| 46 |
import fitz
|
| 47 |
-
doc = fitz.open(stream=data, filetype="pdf")
|
| 48 |
-
text = ""
|
| 49 |
-
for page_num, page in enumerate(doc):
|
| 50 |
-
text += f"\n--- Page {page_num + 1} ---\n"
|
| 51 |
-
text += page.get_text()
|
| 52 |
-
doc.close()
|
| 53 |
-
return text.strip() if text.strip() else "[PDF: no extractable text]"
|
| 54 |
except ImportError:
|
| 55 |
-
return "[PDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
except Exception as e:
|
| 57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
|
| 60 |
-
def parse_docx_content(data):
|
| 61 |
"""Parse DOCX bytes to text."""
|
|
|
|
| 62 |
try:
|
| 63 |
from docx import Document
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
doc = Document(io.BytesIO(data))
|
|
|
|
|
|
|
| 65 |
paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
-
|
|
|
|
| 71 |
|
| 72 |
|
| 73 |
-
def parse_xlsx_content(data):
|
| 74 |
"""Parse XLSX bytes to text summary."""
|
|
|
|
| 75 |
try:
|
| 76 |
import openpyxl
|
| 77 |
-
wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
|
| 78 |
-
text = ""
|
| 79 |
-
for sheet_name in wb.sheetnames:
|
| 80 |
-
ws = wb[sheet_name]
|
| 81 |
-
text += f"\n--- Sheet: {sheet_name} ---\n"
|
| 82 |
-
row_count = 0
|
| 83 |
-
for row in ws.iter_rows(values_only=True):
|
| 84 |
-
if row_count >= 50: # Limit rows shown
|
| 85 |
-
text += f"\n... (more rows exist)\n"
|
| 86 |
-
break
|
| 87 |
-
text += " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
|
| 88 |
-
row_count += 1
|
| 89 |
-
wb.close()
|
| 90 |
-
return text.strip() if text.strip() else "[XLSX: empty workbook]"
|
| 91 |
except ImportError:
|
| 92 |
-
return "[XLSX
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
except Exception as e:
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
return f"{size_bytes / (1024 * 1024):.1f} MB"
|
| 105 |
|
|
|
|
|
|
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
return "β οΈ Please upload a ZIP file.", [], "", []
|
| 111 |
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
if not zipfile.is_zipfile(file_path):
|
| 115 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
|
|
|
| 122 |
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
try:
|
| 134 |
raw_data = zf.read(info)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
elif
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
content =
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
stats
|
| 165 |
-
content_preview = f"[Image: {ext}]"
|
| 166 |
else:
|
| 167 |
-
stats
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
"
|
| 173 |
-
|
| 174 |
-
"
|
| 175 |
-
|
| 176 |
-
"
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|--------|-------|
|
| 192 |
-
| **Total files** | {stats
|
| 193 |
-
| **
|
| 194 |
-
| **
|
| 195 |
-
| **
|
| 196 |
-
| **
|
| 197 |
-
| **
|
| 198 |
-
| **
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
-
|
| 203 |
|
| 204 |
-
|
| 205 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
-
|
| 208 |
-
"""When user clicks a row in the table, show that file's full preview."""
|
| 209 |
-
if not file_data_json or not isinstance(file_data_json, list):
|
| 210 |
-
return "Select a file from the table above."
|
| 211 |
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
|
|
|
| 217 |
|
|
|
|
| 218 |
|
| 219 |
-
|
|
|
|
| 220 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
with gr.Blocks(
|
| 222 |
title="π¦ Document Parser",
|
| 223 |
-
theme=gr.themes.Soft(),
|
| 224 |
) as demo:
|
| 225 |
gr.Markdown("""
|
| 226 |
# π¦ Document Parser
|
| 227 |
-
Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.
|
| 228 |
|
| 229 |
-
**
|
| 230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
|
| 232 |
with gr.Row():
|
| 233 |
with gr.Column(scale=1):
|
|
@@ -236,54 +747,71 @@ Upload a **ZIP file** containing documents and this tool will parse and extract
|
|
| 236 |
file_types=[".zip"],
|
| 237 |
type="filepath",
|
| 238 |
)
|
| 239 |
-
parse_btn = gr.Button(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
-
summary_output = gr.Markdown(label="Summary")
|
| 242 |
|
| 243 |
with gr.Tabs():
|
| 244 |
with gr.Tab("π File Listing"):
|
| 245 |
file_table = gr.Dataframe(
|
| 246 |
-
headers=["Filename", "Extension", "Type", "Size", "Preview"],
|
| 247 |
label="Files in Archive",
|
| 248 |
interactive=False,
|
| 249 |
wrap=True,
|
| 250 |
)
|
| 251 |
with gr.Tab("π Extracted Text"):
|
| 252 |
text_output = gr.Textbox(
|
| 253 |
-
label="Full Extracted Text",
|
| 254 |
lines=30,
|
| 255 |
max_lines=100,
|
| 256 |
-
|
| 257 |
)
|
| 258 |
with gr.Tab("π File Detail"):
|
| 259 |
-
gr.Markdown("*Click a row in the File Listing tab to see
|
| 260 |
-
detail_output = gr.Markdown(
|
| 261 |
-
|
|
|
|
|
|
|
| 262 |
json_output = gr.JSON(label="Structured Parse Results")
|
| 263 |
|
| 264 |
-
# Hidden state for file data
|
| 265 |
file_data_state = gr.State([])
|
| 266 |
|
| 267 |
-
def run_parse(file_obj):
|
| 268 |
-
summary, table, text, data = parse_zip(file_obj)
|
| 269 |
-
return summary, table, text, data, data
|
| 270 |
-
|
| 271 |
parse_btn.click(
|
| 272 |
fn=run_parse,
|
| 273 |
inputs=zip_input,
|
| 274 |
outputs=[summary_output, file_table, text_output, json_output, file_data_state],
|
|
|
|
|
|
|
|
|
|
| 275 |
)
|
|
|
|
| 276 |
zip_input.upload(
|
| 277 |
fn=run_parse,
|
| 278 |
inputs=zip_input,
|
| 279 |
outputs=[summary_output, file_table, text_output, json_output, file_data_state],
|
|
|
|
|
|
|
|
|
|
| 280 |
)
|
| 281 |
|
| 282 |
file_table.select(
|
| 283 |
-
fn=
|
| 284 |
inputs=file_data_state,
|
| 285 |
outputs=detail_output,
|
| 286 |
)
|
| 287 |
|
|
|
|
|
|
|
| 288 |
if __name__ == "__main__":
|
| 289 |
-
demo.launch(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
π¦ Document Parser β Production-Grade ZIP Document Extraction Tool
|
| 3 |
+
|
| 4 |
+
Features:
|
| 5 |
+
- Upload ZIP files and parse all supported document formats
|
| 6 |
+
- Supports 40+ text/code formats, PDF, DOCX, XLSX
|
| 7 |
+
- Zip bomb protection (decompression ratio + size limits)
|
| 8 |
+
- Per-file error isolation β one corrupt file won't crash the whole parse
|
| 9 |
+
- Progress bars for real-time feedback
|
| 10 |
+
- Concurrency-limited to prevent resource exhaustion
|
| 11 |
+
- Full structured JSON export + file detail drill-down
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
import io
|
| 17 |
+
import logging
|
| 18 |
+
import os
|
| 19 |
+
import traceback
|
| 20 |
+
import zipfile
|
| 21 |
+
from dataclasses import dataclass, field
|
| 22 |
+
from enum import Enum
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
import gradio as gr
|
| 26 |
+
|
| 27 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
+
# Configuration constants
|
| 29 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 30 |
+
MAX_ZIP_SIZE_MB = 200
|
| 31 |
+
MAX_FILES_IN_ZIP = 500
|
| 32 |
+
MAX_SINGLE_FILE_MB = 50
|
| 33 |
+
MAX_DECOMPRESSION_RATIO = 100 # zip bomb guard: reject if total > ratio Γ compressed
|
| 34 |
+
MAX_PREVIEW_CHARS = 5_000
|
| 35 |
+
MAX_FULL_TEXT_CHARS = 500_000
|
| 36 |
+
MAX_XLSX_ROWS = 100
|
| 37 |
+
CONCURRENCY_LIMIT = 3
|
| 38 |
+
|
| 39 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 40 |
+
# Logging
|
| 41 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 42 |
+
logger = logging.getLogger("document_parser")
|
| 43 |
+
logging.basicConfig(
|
| 44 |
+
level=logging.INFO,
|
| 45 |
+
format="%(asctime)s | %(levelname)s | %(message)s",
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 50 |
+
# File classification
|
| 51 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 52 |
+
class FileCategory(str, Enum):
    """Coarse document categories used to route archive entries to a parser.

    Subclassing ``str`` lets members compare equal to their string values,
    which is relied on when category counts are keyed by ``c.value``.
    """
    TEXT = "text"      # plain text / source code, decoded directly
    PDF = "pdf"        # routed to the PyMuPDF-based parser
    DOCX = "docx"      # routed to the python-docx-based parser
    XLSX = "xlsx"      # routed to the openpyxl-based parser
    IMAGE = "image"    # not parsed; listed with a placeholder preview
    BINARY = "binary"  # unknown/unsupported; listed with a placeholder preview
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
TEXT_EXTENSIONS = frozenset({
|
| 62 |
+
".txt", ".md", ".rst", ".py", ".js", ".ts", ".jsx", ".tsx", ".html",
|
| 63 |
+
".htm", ".css", ".scss", ".less", ".json", ".jsonl", ".yaml", ".yml",
|
| 64 |
+
".csv", ".tsv", ".xml", ".toml", ".cfg", ".ini", ".conf", ".properties",
|
| 65 |
+
".sh", ".bash", ".zsh", ".fish", ".bat", ".ps1", ".cmd",
|
| 66 |
+
".r", ".rmd", ".java", ".c", ".cpp", ".h", ".hpp", ".cc", ".cxx",
|
| 67 |
+
".go", ".rs", ".rb", ".php", ".swift", ".kt", ".kts", ".scala", ".clj",
|
| 68 |
+
".sql", ".graphql", ".gql", ".proto", ".thrift",
|
| 69 |
+
".dockerfile", ".makefile", ".cmake",
|
| 70 |
+
".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
|
| 71 |
+
".env", ".env.example", ".log", ".tex", ".bib", ".sty",
|
| 72 |
+
".lua", ".vim", ".el", ".lisp", ".hs", ".ml", ".mli", ".ex", ".exs",
|
| 73 |
+
".erl", ".hrl", ".dart", ".v", ".sv", ".vhd", ".vhdl",
|
| 74 |
+
".tf", ".tfvars", ".hcl", ".nix", ".dhall",
|
| 75 |
+
".ipynb",
|
| 76 |
+
})
|
| 77 |
+
|
| 78 |
+
KNOWN_TEXT_FILENAMES = frozenset({
|
| 79 |
+
"Makefile", "Dockerfile", "Procfile", "Vagrantfile", "Gemfile",
|
| 80 |
+
"Rakefile", "Brewfile", "Justfile", "Taskfile",
|
| 81 |
+
".gitignore", ".gitattributes", ".dockerignore", ".editorconfig",
|
| 82 |
+
".eslintrc", ".prettierrc", ".babelrc", ".browserslistrc",
|
| 83 |
+
"LICENSE", "LICENCE", "COPYING", "AUTHORS", "CONTRIBUTORS",
|
| 84 |
+
"CHANGELOG", "CHANGES", "HISTORY", "NEWS",
|
| 85 |
+
"README", "INSTALL", "TODO", "HACKING",
|
| 86 |
+
"requirements.txt",
|
| 87 |
+
})
|
| 88 |
+
|
| 89 |
+
IMAGE_EXTENSIONS = frozenset({
|
| 90 |
+
".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico",
|
| 91 |
+
".tiff", ".tif", ".avif", ".heic", ".heif",
|
| 92 |
+
})
|
| 93 |
+
|
| 94 |
+
CATEGORY_EMOJI = {
|
| 95 |
+
FileCategory.TEXT: "π",
|
| 96 |
+
FileCategory.PDF: "π",
|
| 97 |
+
FileCategory.DOCX: "π",
|
| 98 |
+
FileCategory.XLSX: "π",
|
| 99 |
+
FileCategory.IMAGE: "πΌοΈ",
|
| 100 |
+
FileCategory.BINARY: "π¦",
|
| 101 |
}
|
| 102 |
|
| 103 |
+
|
| 104 |
+
def classify_file(filename: str) -> tuple[FileCategory, str]:
    """Classify a file by its extension and known filename patterns.

    Args:
        filename: Path of the entry inside the archive (may contain '/').

    Returns:
        ``(category, extension)`` where ``extension`` is lower-cased and is
        empty for extensionless known text files (Makefile, LICENSE, ...).
    """
    # rsplit handles both "dir/name" and a bare "name" uniformly, so the
    # original's explicit '"/" in filename' branch is unnecessary.
    basename = filename.rsplit("/", 1)[-1]
    ext = os.path.splitext(basename)[1].lower()

    if not ext:
        # FIX: the original did an exact-case membership test and then a
        # second case-insensitive test that rebuilt an upper-cased set on
        # every call. A single case-insensitive scan covers both cases
        # without the per-call set construction.
        target = basename.upper()
        if any(name.upper() == target for name in KNOWN_TEXT_FILENAMES):
            return FileCategory.TEXT, ""

    if ext in TEXT_EXTENSIONS:
        return FileCategory.TEXT, ext
    if ext == ".pdf":
        return FileCategory.PDF, ext
    if ext == ".docx":
        return FileCategory.DOCX, ext
    if ext in {".xlsx", ".xls"}:
        return FileCategory.XLSX, ext
    if ext in IMAGE_EXTENSIONS:
        return FileCategory.IMAGE, ext
    return FileCategory.BINARY, ext
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 128 |
+
# Data classes
|
| 129 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 130 |
+
@dataclass
|
| 131 |
+
class ParsedFile:
|
| 132 |
+
filename: str
|
| 133 |
+
category: str
|
| 134 |
+
extension: str
|
| 135 |
+
size_bytes: int
|
| 136 |
+
size_display: str
|
| 137 |
+
content: str = ""
|
| 138 |
+
preview: str = ""
|
| 139 |
+
error: Optional[str] = None
|
| 140 |
+
warnings: list[str] = field(default_factory=list)
|
| 141 |
+
|
| 142 |
+
def to_table_row(self) -> list:
|
| 143 |
+
status = "β οΈ" if self.warnings else ("β" if self.error else "β
")
|
| 144 |
+
preview_text = self.error or self.preview[:200].replace("\n", " ")
|
| 145 |
+
return [
|
| 146 |
+
status,
|
| 147 |
+
self.filename,
|
| 148 |
+
self.extension or "(none)",
|
| 149 |
+
self.category,
|
| 150 |
+
self.size_display,
|
| 151 |
+
preview_text,
|
| 152 |
+
]
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@dataclass
class ParseStats:
    """Aggregate counters collected while parsing one ZIP archive."""
    total_files: int = 0                # non-directory entries found
    parsed_ok: int = 0                  # parsed with no warnings and no errors
    parse_warnings: int = 0             # parsed, but at least one warning recorded
    parse_errors: int = 0               # failed to read or parse
    skipped_dirs: int = 0               # directory entries (never parsed)
    total_compressed_bytes: int = 0     # on-disk size of the ZIP file itself
    total_uncompressed_bytes: int = 0   # sum of entries' declared uncompressed sizes
    # Per-category entry counts, keyed by FileCategory.value ("text", "pdf", ...).
    by_category: dict = field(default_factory=lambda: {c.value: 0 for c in FileCategory})
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 168 |
+
# Size formatting
|
| 169 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 170 |
+
def format_size(size_bytes: int) -> str:
    """Render a byte count as a human-readable string (B, KB, MB, GB).

    Negative inputs are clamped and reported as "0 B".
    """
    if size_bytes < 0:
        return "0 B"

    kb = 1024
    mb = 1024 ** 2
    gb = 1024 ** 3

    if size_bytes < kb:
        return f"{size_bytes} B"
    if size_bytes < mb:
        return f"{size_bytes / kb:.1f} KB"
    if size_bytes < gb:
        return f"{size_bytes / mb:.1f} MB"
    return f"{size_bytes / gb:.2f} GB"
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 184 |
+
# Document parsers β each returns (content, warnings) or raises
|
| 185 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 186 |
+
def parse_text_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Decode plain text / source-code bytes, falling back through encodings.

    Tries UTF-8 first, then latin-1, then UTF-8 with replacement characters.
    Oversized content is clipped to MAX_FULL_TEXT_CHARS. Returns the decoded
    text and a list of human-readable warnings describing any fallbacks.
    """
    notes: list[str] = []

    try:
        text = data.decode("utf-8")
    except UnicodeDecodeError:
        try:
            text = data.decode("latin-1")
            notes.append("Decoded with latin-1 fallback (not valid UTF-8)")
        except Exception:
            text = data.decode("utf-8", errors="replace")
            notes.append("Contains invalid bytes; replaced with placeholders")

    original_len = len(text)
    if original_len > MAX_FULL_TEXT_CHARS:
        notes.append(f"Content truncated to {MAX_FULL_TEXT_CHARS:,} characters (original: {original_len:,})")
        text = text[:MAX_FULL_TEXT_CHARS] + "\n\n... [TRUNCATED]"

    return text, notes
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
def parse_pdf_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse PDF bytes to text using PyMuPDF.

    Args:
        data: Raw PDF bytes.
        filename: Archive path of the file, used for log context.

    Returns:
        ``(content, warnings)``. Failures are reported via warnings instead
        of raising, so one bad PDF cannot abort the whole archive parse.
    """
    warnings: list[str] = []
    try:
        import fitz
    except ImportError:
        return "[PDF library not available]", ["PyMuPDF not installed β install with: pip install PyMuPDF"]

    doc = None
    try:
        doc = fitz.open(stream=data, filetype="pdf")
        if doc.is_encrypted:
            return "", ["PDF is password-protected and cannot be parsed"]

        page_count = len(doc)
        if page_count == 0:
            return "", ["PDF has 0 pages"]

        text_parts = []
        empty_pages = 0
        for page_num in range(page_count):
            try:
                page = doc[page_num]
                page_text = page.get_text().strip()
                if page_text:
                    text_parts.append(f"\n--- Page {page_num + 1}/{page_count} ---\n{page_text}")
                else:
                    empty_pages += 1
            except Exception as e:
                # Per-page isolation: one broken page must not lose the rest.
                warnings.append(f"Page {page_num + 1} failed: {type(e).__name__}: {e}")

        if empty_pages > 0:
            warnings.append(f"{empty_pages}/{page_count} pages had no extractable text (may be scanned/image-based)")

        content = "\n".join(text_parts) if text_parts else "[No extractable text found]"
        if not text_parts and empty_pages == page_count:
            warnings.append("PDF appears to be entirely image-based; OCR would be needed to extract text")

        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename. Also uses lazy %-style args per logging
        # best practice.
        logger.error("PDF parse error for %s: %s", filename, e)
        return "", [f"PDF parse failed: {type(e).__name__}: {e}"]
    finally:
        # BUG FIX: explicit None check — `if doc:` uses Document truthiness,
        # which could skip close() for a falsy-but-open document.
        if doc is not None:
            try:
                doc.close()
            except Exception:
                pass
|
| 255 |
|
| 256 |
|
| 257 |
+
def parse_docx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse DOCX bytes to text.

    Extracts non-empty paragraphs plus a pipe-delimited rendering of each
    table. Args mirror the other parsers: raw bytes and the archive path
    (the latter only for log context). Returns ``(content, warnings)``;
    failures are reported via warnings instead of raising.
    """
    warnings: list[str] = []
    try:
        from docx import Document
    except ImportError:
        return "[DOCX library not available]", ["python-docx not installed"]

    try:
        doc = Document(io.BytesIO(data))
        parts = []

        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
        if paragraphs:
            parts.extend(paragraphs)

        for i, table in enumerate(doc.tables):
            try:
                table_text = f"\n--- Table {i + 1} ---\n"
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    table_text += " | ".join(cells) + "\n"
                parts.append(table_text)
            except Exception as e:
                # Per-table isolation: a malformed table doesn't lose the rest.
                warnings.append(f"Table {i + 1} extraction failed: {e}")

        content = "\n".join(parts) if parts else "[DOCX: empty document]"
        if not parts:
            warnings.append("Document contains no paragraphs or tables")

        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename; use lazy %-style logging args.
        logger.error("DOCX parse error for %s: %s", filename, e)
        return "", [f"DOCX parse failed: {type(e).__name__}: {e}"]
|
| 292 |
|
| 293 |
|
| 294 |
+
def parse_xlsx_content(data: bytes, filename: str) -> tuple[str, list[str]]:
    """Parse XLSX bytes to text summary.

    Renders each sheet as pipe-delimited rows, capped at MAX_XLSX_ROWS per
    sheet. Args mirror the other parsers: raw bytes and the archive path
    (the latter only for log context). Returns ``(content, warnings)``;
    failures are reported via warnings instead of raising.
    """
    warnings: list[str] = []
    try:
        import openpyxl
    except ImportError:
        return "[XLSX library not available]", ["openpyxl not installed"]

    wb = None
    try:
        # read_only streams rows lazily; data_only yields cached cell values
        # rather than formula strings.
        wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True, data_only=True)
        parts = []

        for sheet_name in wb.sheetnames:
            try:
                ws = wb[sheet_name]
                sheet_text = f"\n--- Sheet: {sheet_name} ---\n"
                row_count = 0
                for row in ws.iter_rows(values_only=True):
                    if row_count >= MAX_XLSX_ROWS:
                        sheet_text += f"\n... (truncated at {MAX_XLSX_ROWS} rows)\n"
                        warnings.append(f"Sheet '{sheet_name}' truncated at {MAX_XLSX_ROWS} rows")
                        break
                    cells = [str(cell) if cell is not None else "" for cell in row]
                    sheet_text += " | ".join(cells) + "\n"
                    row_count += 1
                if row_count == 0:
                    sheet_text += "(empty sheet)\n"
                parts.append(sheet_text)
            except Exception as e:
                # Per-sheet isolation: one corrupt sheet doesn't lose the others.
                warnings.append(f"Sheet '{sheet_name}' failed: {type(e).__name__}: {e}")

        content = "\n".join(parts) if parts else "[XLSX: empty workbook]"
        return content, warnings

    except Exception as e:
        # BUG FIX: the log line hard-coded "(unknown)" even though the caller
        # passes the real filename; use lazy %-style logging args.
        logger.error("XLSX parse error for %s: %s", filename, e)
        return "", [f"XLSX parse failed: {type(e).__name__}: {e}"]
    finally:
        # Explicit None check rather than relying on Workbook truthiness.
        if wb is not None:
            try:
                wb.close()
            except Exception:
                pass
|
| 338 |
|
| 339 |
|
| 340 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 341 |
+
# Validation layer
|
| 342 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 343 |
+
def validate_upload(file_path: str | None) -> str:
    """Validate the uploaded file. Returns the resolved file path. Raises gr.Error on failure.

    Checks run in order: presence, existence on disk, non-empty, size limit,
    and ZIP magic-number validity.
    """
    # Guard clauses, cheapest checks first.
    if file_path is None:
        raise gr.Error("β οΈ Please upload a ZIP file first.")

    if not os.path.isfile(file_path):
        raise gr.Error("β Upload failed β file not found on server. Please try again.")

    byte_count = os.path.getsize(file_path)
    if not byte_count:
        raise gr.Error("β The uploaded file is empty (0 bytes).")

    size_mb = byte_count / (1024 ** 2)
    if size_mb > MAX_ZIP_SIZE_MB:
        raise gr.Error(
            f"β File too large: {size_mb:.1f} MB. "
            f"Maximum allowed is {MAX_ZIP_SIZE_MB} MB."
        )

    if not zipfile.is_zipfile(file_path):
        raise gr.Error(
            "β Not a valid ZIP archive. The file may be corrupted, "
            "or it may be a different archive format (tar, rar, 7z)."
        )

    return file_path
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
def check_zip_bomb(zf: zipfile.ZipFile, compressed_size: int) -> list[str]:
    """Check for zip bomb indicators. Returns warnings. Raises gr.Error if malicious.

    Two guards: the decompression ratio (uncompressed / compressed) and an
    absolute cap on the total uncompressed payload.
    """
    notes: list[str] = []

    # Declared uncompressed total across all non-directory entries.
    total_uncompressed = 0
    for entry in zf.infolist():
        if not entry.is_dir():
            total_uncompressed += entry.file_size

    if compressed_size > 0:
        ratio = total_uncompressed / compressed_size
        if ratio > MAX_DECOMPRESSION_RATIO:
            raise gr.Error(
                f"π‘οΈ Zip bomb detected! Decompression ratio is {ratio:.0f}x "
                f"(compressed: {format_size(compressed_size)}, "
                f"uncompressed: {format_size(total_uncompressed)}). "
                f"Maximum allowed ratio is {MAX_DECOMPRESSION_RATIO}x."
            )
        # Soft warning once past half the hard limit.
        if ratio > MAX_DECOMPRESSION_RATIO / 2:
            notes.append(
                f"High decompression ratio ({ratio:.0f}x) β approaching the "
                f"{MAX_DECOMPRESSION_RATIO}x safety limit"
            )

    uncompressed_mb = total_uncompressed / (1024 ** 2)
    if uncompressed_mb > MAX_ZIP_SIZE_MB * 5:
        raise gr.Error(
            f"π‘οΈ Uncompressed content too large: {uncompressed_mb:.0f} MB. "
            f"Maximum is {MAX_ZIP_SIZE_MB * 5} MB."
        )

    return notes
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 402 |
+
# Core parsing engine
|
| 403 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 404 |
+
def parse_zip(file_path: str, progress: gr.Progress) -> tuple[list[ParsedFile], ParseStats]:
    """Parse all files in a ZIP archive with per-file error isolation.

    Args:
        file_path: Path to a ZIP file (expected to exist on disk).
        progress: Gradio progress tracker driving the per-file progress bar.

    Returns:
        (parsed_files, stats) — one ParsedFile per processed entry plus
        aggregate counters. Archive-level failures (corrupt ZIP, zip bomb,
        no files) raise gr.Error; per-file failures are recorded on the
        entry's ParsedFile and processing continues.
    """
    file_size = os.path.getsize(file_path)
    stats = ParseStats()

    try:
        zf = zipfile.ZipFile(file_path, "r")
    except zipfile.BadZipFile:
        raise gr.Error("β ZIP file is corrupted and cannot be opened.")
    except Exception as e:
        raise gr.Error(f"β Failed to open ZIP: {type(e).__name__}: {e}")

    try:
        # Raises gr.Error on a detected bomb; may return soft warnings that
        # are surfaced only after a successful pass (see end of function).
        bomb_warnings = check_zip_bomb(zf, file_size)

        entries = [info for info in zf.infolist() if not info.is_dir()]
        stats.skipped_dirs = len(zf.infolist()) - len(entries)
        stats.total_files = len(entries)
        stats.total_compressed_bytes = file_size

        if stats.total_files == 0:
            raise gr.Error("β ZIP archive contains no files (only directories).")

        truncated = False
        if stats.total_files > MAX_FILES_IN_ZIP:
            gr.Warning(
                f"ZIP contains {stats.total_files} files β "
                f"processing first {MAX_FILES_IN_ZIP} only."
            )
            entries = entries[:MAX_FILES_IN_ZIP]
            truncated = True

        parsed_files: list[ParsedFile] = []

        for i, info in enumerate(progress.tqdm(entries, desc="Parsing documents")):
            category, ext = classify_file(info.filename)
            stats.by_category[category.value] += 1
            stats.total_uncompressed_bytes += info.file_size

            pf = ParsedFile(
                filename=info.filename,
                category=category.value,
                extension=ext or "(none)",
                size_bytes=info.file_size,
                size_display=format_size(info.file_size),
            )

            # Oversized entries are listed but never decompressed; note this
            # path counts toward parse_warnings (not parse_errors) even
            # though pf.error is set.
            file_mb = info.file_size / (1024 ** 2)
            if file_mb > MAX_SINGLE_FILE_MB:
                pf.error = f"Skipped: file too large ({file_mb:.1f} MB > {MAX_SINGLE_FILE_MB} MB limit)"
                pf.warnings.append(pf.error)
                stats.parse_warnings += 1
                parsed_files.append(pf)
                continue

            try:
                raw_data = zf.read(info)
            except RuntimeError as e:
                # zipfile signals encrypted entries via RuntimeError;
                # detected here by message inspection.
                pf.error = f"Cannot read: {e}"
                if "password" in str(e).lower():
                    pf.error = "File is password-protected"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue
            except Exception as e:
                pf.error = f"Read failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                parsed_files.append(pf)
                continue

            # Dispatch to the category-specific parser; each returns
            # (content, warnings) and reports expected failures via warnings.
            try:
                if category == FileCategory.TEXT:
                    content, warnings = parse_text_content(raw_data, info.filename)
                elif category == FileCategory.PDF:
                    content, warnings = parse_pdf_content(raw_data, info.filename)
                elif category == FileCategory.DOCX:
                    content, warnings = parse_docx_content(raw_data, info.filename)
                elif category == FileCategory.XLSX:
                    content, warnings = parse_xlsx_content(raw_data, info.filename)
                elif category == FileCategory.IMAGE:
                    content = ""
                    warnings = []
                    pf.preview = f"[Image: {ext}, {pf.size_display}]"
                else:
                    content = ""
                    warnings = []
                    pf.preview = f"[Binary: {ext}, {pf.size_display}]"

                pf.content = content
                # Text-bearing files get a content-derived preview; image and
                # binary entries keep the placeholder assigned above.
                pf.preview = content[:MAX_PREVIEW_CHARS] if content else pf.preview
                pf.warnings = warnings

                if warnings:
                    stats.parse_warnings += 1
                else:
                    stats.parsed_ok += 1

            except MemoryError:
                pf.error = "Out of memory while parsing this file"
                stats.parse_errors += 1
                logger.error(f"MemoryError parsing {info.filename}")
            except Exception as e:
                # Per-file isolation: record the failure and keep going.
                pf.error = f"Parse failed: {type(e).__name__}: {e}"
                stats.parse_errors += 1
                logger.error(f"Parse error for {info.filename}: {e}")
                traceback.print_exc()

            parsed_files.append(pf)

        # Surface soft zip-bomb warnings only after processing succeeded.
        if bomb_warnings:
            for w in bomb_warnings:
                gr.Warning(w)

        if truncated:
            stats.parse_warnings += 1

        return parsed_files, stats

    finally:
        # Best-effort close; the archive may already be in a bad state.
        try:
            zf.close()
        except Exception:
            pass
|
| 527 |
+
|
| 528 |
+
|
| 529 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 530 |
+
# Output formatters
|
| 531 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 532 |
+
def build_summary(stats: ParseStats, parsed_files: list[ParsedFile]) -> str:
    """Render the archive parse results as a markdown report.

    Args:
        stats: Aggregate counters collected while walking the archive.
        parsed_files: Per-file parse results; only used here to list
            the files that failed.

    Returns:
        Markdown containing optional alert lines, a metrics table, a
        file-type breakdown, and (when present) up to 10 failed files.
    """
    alert_lines = []
    if stats.parse_errors > 0:
        alert_lines.append(
            f"⚠️ **{stats.parse_errors} file(s) failed to parse** — see ❌ markers in the file listing"
        )
    if stats.parse_warnings > 0:
        alert_lines.append(
            f"ℹ️ **{stats.parse_warnings} file(s) had warnings** — see ⚠️ markers in the file listing"
        )
    alert_block = "\n".join(alert_lines) + "\n\n" if alert_lines else ""

    failed = [pf for pf in parsed_files if pf.error]
    error_block = ""
    if failed:
        # Cap the listing at 10 entries so a pathological archive cannot
        # flood the summary panel.
        rows = [f"- `{pf.filename}`: {pf.error}" for pf in failed[:10]]
        if len(failed) > 10:
            rows.append(f"- ... and {len(failed) - 10} more")
        error_block = "\n### ❌ Failed Files\n" + "\n".join(rows) + "\n\n"

    return f"""## 📦 ZIP Archive Summary

{alert_block}| Metric | Value |
|--------|-------|
| **Total files** | {stats.total_files} |
| **Parsed successfully** | {stats.parsed_ok} |
| **With warnings** | {stats.parse_warnings} |
| **Failed** | {stats.parse_errors} |
| **Compressed size** | {format_size(stats.total_compressed_bytes)} |
| **Uncompressed size** | {format_size(stats.total_uncompressed_bytes)} |
| **Directories skipped** | {stats.skipped_dirs} |

### 📂 File Types
| Category | Count |
|----------|-------|
| Text/Code | {stats.by_category.get('text', 0)} |
| PDF | {stats.by_category.get('pdf', 0)} |
| DOCX | {stats.by_category.get('docx', 0)} |
| XLSX | {stats.by_category.get('xlsx', 0)} |
| Image | {stats.by_category.get('image', 0)} |
| Binary | {stats.by_category.get('binary', 0)} |

{error_block}"""
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
def build_full_text(parsed_files: list[ParsedFile]) -> str:
    """Concatenate the extracted text of every parsed file.

    Each file contributes a banner (category emoji + filename, plus any
    warnings); files that produced no content but carry an error get an
    error banner instead. The combined output is capped at
    MAX_FULL_TEXT_CHARS.
    """
    rule = "=" * 70
    sections = []
    for pf in parsed_files:
        if pf.content:
            icon = CATEGORY_EMOJI.get(FileCategory(pf.category), "📄")
            warn_suffix = " ⚠️ " + ", ".join(pf.warnings) if pf.warnings else ""
            sections.append(
                f"\n{rule}\n{icon} {pf.filename}{warn_suffix}\n{rule}\n{pf.content}"
            )
        elif pf.error:
            sections.append(f"\n{rule}\n❌ {pf.filename} — ERROR: {pf.error}\n{rule}")

    if not sections:
        return "(No text content was extracted from any file in the archive.)"

    combined = "\n".join(sections)
    if len(combined) > MAX_FULL_TEXT_CHARS:
        combined = combined[:MAX_FULL_TEXT_CHARS] + "\n\n... [OUTPUT TRUNCATED — too large to display fully]"
    return combined
|
| 604 |
+
|
| 605 |
+
|
| 606 |
+
def build_json(parsed_files: list[ParsedFile]) -> list[dict]:
    """Serialize parse results into a JSON-friendly list of dicts.

    Previews are capped at 1000 characters. The ``error`` and
    ``warnings`` keys are only present when the file actually had them;
    ``status`` is ``"error"``, ``"warning"``, or ``"ok"`` (error wins).
    """

    def _entry(pf) -> dict:
        # Status reflects the worst outcome recorded for the file.
        if pf.error:
            status = "error"
        elif pf.warnings:
            status = "warning"
        else:
            status = "ok"

        record = {
            "filename": pf.filename,
            "category": pf.category,
            "extension": pf.extension,
            "size_bytes": pf.size_bytes,
            "size_display": pf.size_display,
            "preview": pf.preview[:1000],
            "status": status,
        }
        if pf.error:
            record["error"] = pf.error
        if pf.warnings:
            record["warnings"] = pf.warnings
        return record

    return [_entry(pf) for pf in parsed_files]
|
| 625 |
+
|
| 626 |
+
|
| 627 |
+
def build_detail(file_data: list[dict], evt: gr.SelectData) -> str:
    """Render a markdown detail view for the table row the user clicked.

    Args:
        file_data: Per-file dicts (the ``build_json`` payload held in state).
        evt: Gradio selection event carrying the clicked row index.

    Returns:
        Markdown with the file's header/metadata, any error or warnings,
        and a fenced content preview — or a guidance message when no
        valid row can be resolved.
    """
    if not file_data or not isinstance(file_data, list):
        return "ℹ️ Select a file from the **File Listing** tab to see its full preview here."

    # evt.index may be a bare int or a (row, col) pair depending on the
    # component that fired the event.
    try:
        raw = evt.index
        selected = raw[0] if isinstance(raw, (list, tuple)) else raw
    except (TypeError, IndexError):
        return "⚠️ Could not determine selected row. Please click a row in the file listing."

    if not (0 <= selected < len(file_data)):
        return f"⚠️ Row index {selected} is out of range (0–{len(file_data) - 1})."

    item = file_data[selected]

    parts = [
        f"## {CATEGORY_EMOJI.get(item.get('category', ''), '📄')} {item['filename']}\n",
        f"**Category:** {item.get('category', 'unknown')} | **Size:** {item.get('size_display', 'unknown')}\n\n",
    ]

    if item.get("error"):
        parts.append(f"### ❌ Error\n```\n{item['error']}\n```\n")

    if item.get("warnings"):
        warning_lines = "\n".join(f"- {w}" for w in item["warnings"])
        parts.append("### ⚠️ Warnings\n" + warning_lines + "\n\n")

    preview = item.get("preview", "")
    if not preview:
        parts.append("*(No content to preview for this file type.)*")
    elif preview.startswith("["):
        # Bracketed previews are synthetic placeholders such as
        # "[Image: .png, 12.3 KB]" — show them as plain info, not code.
        parts.append(f"### 📄 Info\n{preview}")
    else:
        ext = item.get("extension", "").lstrip(".")
        lang_map = {
            "py": "python", "js": "javascript", "ts": "typescript",
            "json": "json", "yaml": "yaml", "yml": "yaml",
            "html": "html", "htm": "html", "css": "css",
            "sql": "sql", "sh": "bash", "bash": "bash",
            "java": "java", "c": "c", "cpp": "cpp", "go": "go",
            "rs": "rust", "rb": "ruby", "php": "php", "xml": "xml",
            "md": "markdown", "toml": "toml", "csv": "csv",
        }
        fence_lang = lang_map.get(ext, "")
        parts.append(f"### 📄 Content Preview\n```{fence_lang}\n{preview}\n```")

    return "\n".join(parts)
|
| 673 |
+
|
| 674 |
+
|
| 675 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 676 |
+
# Main entry point
|
| 677 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 678 |
+
def run_parse(file_obj, progress=gr.Progress()):
    """Top-level handler: validate → parse → format outputs.

    Returns a 5-tuple ``(summary_md, table_rows, full_text, json_data,
    json_data)`` — the JSON payload is emitted twice because it feeds
    both the JSON viewer and the row-detail state component.

    Raises:
        gr.Error: For all user-facing failures (validation, OOM, or any
            unexpected exception, wrapped with a friendly message).
    """
    try:
        # Gradio may hand us a filepath string or a tempfile-like object.
        path = file_obj if isinstance(file_obj, str) else getattr(file_obj, "name", None)

        progress(0.0, desc="Validating upload...")
        path = validate_upload(path)

        gr.Info(f"📦 Processing ZIP file ({format_size(os.path.getsize(path))})...")

        results, stats = parse_zip(path, progress)

        progress(0.95, desc="Building output...")
        summary_md = build_summary(stats, results)
        rows = [pf.to_table_row() for pf in results]
        text_blob = build_full_text(results)
        payload = build_json(results)

        progress(1.0, desc="Done!")

        # Surface the overall outcome as a toast, worst case first.
        if stats.parse_errors > 0:
            gr.Warning(f"{stats.parse_errors} file(s) failed to parse. See details below.")
        elif stats.parse_warnings > 0:
            gr.Info(f"✅ Parsed {stats.parsed_ok} files with {stats.parse_warnings} warning(s).")
        else:
            gr.Info(f"✅ Successfully parsed all {stats.parsed_ok} files!")

        return summary_md, rows, text_blob, payload, payload

    except gr.Error:
        # Already a user-facing error — let Gradio display it unchanged.
        raise

    except MemoryError:
        logger.error("MemoryError during ZIP processing")
        raise gr.Error(
            "💥 Out of memory! The ZIP file contents are too large to process. "
            "Try a smaller archive or one with fewer/smaller files."
        )

    except Exception as e:
        logger.error(f"Unexpected error: {type(e).__name__}: {e}")
        traceback.print_exc()
        raise gr.Error(
            f"💥 An unexpected error occurred: {type(e).__name__}: {e}\n\n"
            "If this persists, please report it as a bug."
        )
|
| 724 |
+
|
| 725 |
+
|
| 726 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 727 |
+
# Gradio UI
|
| 728 |
+
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 729 |
with gr.Blocks(
|
| 730 |
title="π¦ Document Parser",
|
|
|
|
| 731 |
) as demo:
|
| 732 |
gr.Markdown("""
|
| 733 |
# π¦ Document Parser
|
|
|
|
| 734 |
|
| 735 |
+
Upload a **ZIP file** and this tool extracts & parses text from every supported document inside it.
|
| 736 |
+
|
| 737 |
+
**Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.ts`, `.json`, `.yaml`, `.csv`, `.html`, `.xml`,
|
| 738 |
+
`.pdf`, `.docx`, `.xlsx`, and **40+ more** text/code formats β including `Makefile`, `Dockerfile`, `LICENSE`, etc.
|
| 739 |
+
|
| 740 |
+
**Limits:** Max ZIP size: {max_zip}MB Β· Max files: {max_files} Β· Max single file: {max_file}MB Β· Zip bomb protection enabled
|
| 741 |
+
""".format(max_zip=MAX_ZIP_SIZE_MB, max_files=MAX_FILES_IN_ZIP, max_file=MAX_SINGLE_FILE_MB))
|
| 742 |
|
| 743 |
with gr.Row():
|
| 744 |
with gr.Column(scale=1):
|
|
|
|
| 747 |
file_types=[".zip"],
|
| 748 |
type="filepath",
|
| 749 |
)
|
| 750 |
+
parse_btn = gr.Button(
|
| 751 |
+
"π Parse Documents",
|
| 752 |
+
variant="primary",
|
| 753 |
+
size="lg",
|
| 754 |
+
)
|
| 755 |
|
| 756 |
+
summary_output = gr.Markdown(label="Summary", value="*Upload a ZIP file to get started.*")
|
| 757 |
|
| 758 |
with gr.Tabs():
|
| 759 |
with gr.Tab("π File Listing"):
|
| 760 |
file_table = gr.Dataframe(
|
| 761 |
+
headers=["Status", "Filename", "Extension", "Type", "Size", "Preview"],
|
| 762 |
label="Files in Archive",
|
| 763 |
interactive=False,
|
| 764 |
wrap=True,
|
| 765 |
)
|
| 766 |
with gr.Tab("π Extracted Text"):
|
| 767 |
text_output = gr.Textbox(
|
| 768 |
+
label="Full Extracted Text (all parseable files concatenated)",
|
| 769 |
lines=30,
|
| 770 |
max_lines=100,
|
| 771 |
+
buttons=["copy"],
|
| 772 |
)
|
| 773 |
with gr.Tab("π File Detail"):
|
| 774 |
+
gr.Markdown("*Click a row in the **File Listing** tab, then switch here to see the full preview.*")
|
| 775 |
+
detail_output = gr.Markdown(
|
| 776 |
+
"βΉοΈ Select a file from the **File Listing** tab to see its full preview here."
|
| 777 |
+
)
|
| 778 |
+
with gr.Tab("π JSON Export"):
|
| 779 |
json_output = gr.JSON(label="Structured Parse Results")
|
| 780 |
|
|
|
|
| 781 |
file_data_state = gr.State([])
|
| 782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
    # Both the explicit button click and a fresh file upload trigger the
    # same parse pipeline. They share one concurrency group
    # ("parse_engine") so at most CONCURRENCY_LIMIT parses run at a time,
    # and trigger_mode="once" suppresses re-triggers while a run is
    # already in flight.
    parse_btn.click(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )

    zip_input.upload(
        fn=run_parse,
        inputs=zip_input,
        outputs=[summary_output, file_table, text_output, json_output, file_data_state],
        concurrency_limit=CONCURRENCY_LIMIT,
        concurrency_id="parse_engine",
        trigger_mode="once",
    )

    # Row clicks in the listing populate the File Detail tab; build_detail
    # additionally receives the clicked row via its gr.SelectData parameter,
    # which Gradio injects automatically for .select() handlers.
    file_table.select(
        fn=build_detail,
        inputs=file_data_state,
        outputs=detail_output,
    )
|
| 806 |
|
| 807 |
+
# Queue requests so long-running parses don't block the UI; at most 20
# jobs may wait, and CONCURRENCY_LIMIT run simultaneously.
demo.queue(default_concurrency_limit=CONCURRENCY_LIMIT, max_size=20)

if __name__ == "__main__":
    # BUG FIX: `theme=` and `css=` are `gr.Blocks()` constructor
    # arguments, not `Blocks.launch()` arguments. Passing them to
    # launch() raises `TypeError: launch() got an unexpected keyword
    # argument 'theme'` at startup, so the app never came up. Styling
    # must instead be configured where the Blocks context is created.
    demo.launch(show_error=True)
|