ScottzillaSystems committed on
Commit
61ca336
·
verified ·
1 Parent(s): e561c4d

Add document parser app

Browse files
Files changed (1) hide show
  1. app.py +289 -0
app.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import zipfile
3
+ import os
4
+ import io
5
+ import json
6
+ import tempfile
7
+ import shutil
8
+
9
# Supported text-based extensions
TEXT_EXTS = {
    ".txt", ".md", ".py", ".js", ".ts", ".jsx", ".tsx", ".html", ".css",
    ".json", ".yaml", ".yml", ".csv", ".xml", ".toml", ".cfg", ".ini",
    ".sh", ".bash", ".bat", ".ps1", ".r", ".java", ".c", ".cpp", ".h",
    ".hpp", ".go", ".rs", ".rb", ".php", ".swift", ".kt", ".scala",
    ".sql", ".dockerfile", ".makefile", ".gitignore", ".env", ".log",
}

# Extensions we can parse with special libraries
PDF_EXTS = {".pdf"}
DOCX_EXTS = {".docx"}
XLSX_EXTS = {".xlsx"}
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".webp", ".ico"}

# Well-known text files that have no extension (compared in lower case).
_TEXT_FILENAMES = {"makefile", "dockerfile", "procfile", ".gitignore", ".dockerignore"}


def get_file_type(filename):
    """Categorize an archive member by its file name.

    Args:
        filename: Member path; both "/" and "\\" are accepted as separators.

    Returns:
        A ``(category, ext)`` tuple. ``category`` is one of "text", "pdf",
        "docx", "xlsx", "image" or "binary". ``ext`` is the lower-cased
        extension including the leading dot, or "" when the base name has
        none (which includes dotfiles such as ".env").
    """
    base = filename.replace("\\", "/").rsplit("/", 1)[-1]
    ext = os.path.splitext(base)[1].lower()

    if not ext:
        lowered = base.lower()
        # os.path.splitext treats a leading-dot name such as ".env" as having
        # no extension, so the whole name is checked against TEXT_EXTS too.
        if lowered in _TEXT_FILENAMES or lowered in TEXT_EXTS:
            return "text", ext

    for category, extensions in (
        ("text", TEXT_EXTS),
        ("pdf", PDF_EXTS),
        ("docx", DOCX_EXTS),
        ("xlsx", XLSX_EXTS),
        ("image", IMAGE_EXTS),
    ):
        if ext in extensions:
            return category, ext
    return "binary", ext
41
+
42
+
43
def parse_pdf_content(data):
    """Extract text from PDF bytes using PyMuPDF.

    Args:
        data: Raw bytes of a PDF file.

    Returns:
        The text of every page, each preceded by a "--- Page N ---" header,
        with surrounding whitespace stripped. A bracketed placeholder string
        is returned instead when PyMuPDF is not installed, when parsing
        fails, or when no page yields any text.
    """
    try:
        import fitz  # PyMuPDF; imported lazily so the module loads without it

        doc = fitz.open(stream=data, filetype="pdf")
        try:
            page_texts = [page.get_text() for page in doc]
        finally:
            # Release the document even if a page fails to extract.
            doc.close()
    except ImportError:
        return "[PDF parsing unavailable - PyMuPDF not installed]"
    except Exception as e:
        return f"[PDF parse error: {e}]"

    # Decide emptiness from the page contents alone: the page headers added
    # below would otherwise make the result look non-empty for every PDF.
    if not any(page_text.strip() for page_text in page_texts):
        return "[PDF: no extractable text]"

    return "".join(
        f"\n--- Page {page_num} ---\n{page_text}"
        for page_num, page_text in enumerate(page_texts, start=1)
    ).strip()
58
+
59
+
60
def parse_docx_content(data):
    """Extract paragraph text from DOCX bytes using python-docx.

    Only ``doc.paragraphs`` is read; paragraphs that are empty or
    whitespace-only are skipped.

    Args:
        data: Raw bytes of a .docx file.

    Returns:
        The remaining paragraphs joined with newlines, or a bracketed
        placeholder string when python-docx is not installed, when parsing
        fails, or when no paragraph contains text.
    """
    try:
        from docx import Document  # imported lazily so the module loads without it

        doc = Document(io.BytesIO(data))
        paragraphs = [p.text for p in doc.paragraphs if p.text.strip()]
    except ImportError:
        return "[DOCX parsing unavailable - python-docx not installed]"
    except Exception as e:
        return f"[DOCX parse error: {e}]"

    if not paragraphs:
        return "[DOCX: empty document]"
    return "\n".join(paragraphs)
71
+
72
+
73
def parse_xlsx_content(data, max_rows=50):
    """Render XLSX bytes as a plain-text summary using openpyxl.

    Args:
        data: Raw bytes of an .xlsx file.
        max_rows: Maximum number of rows rendered per sheet. When a sheet
            has more, a "... (more rows exist)" marker is appended.

    Returns:
        One "--- Sheet: NAME ---" section per sheet with cells joined by
        " | " (None cells rendered as ""), stripped of surrounding
        whitespace. A bracketed placeholder string is returned instead when
        openpyxl is not installed, when parsing fails, or when no rendered
        cell holds a value.
    """
    try:
        import openpyxl  # imported lazily so the module loads without it

        wb = openpyxl.load_workbook(io.BytesIO(data), read_only=True)
        try:
            chunks = []
            has_values = False
            for sheet_name in wb.sheetnames:
                ws = wb[sheet_name]
                chunks.append(f"\n--- Sheet: {sheet_name} ---\n")
                for row_count, row in enumerate(ws.iter_rows(values_only=True)):
                    if row_count >= max_rows:  # Limit rows shown
                        chunks.append("\n... (more rows exist)\n")
                        break
                    if any(cell is not None for cell in row):
                        has_values = True
                    chunks.append(
                        " | ".join(str(cell) if cell is not None else "" for cell in row) + "\n"
                    )
        finally:
            # Release the workbook even if a sheet fails to render.
            wb.close()
    except ImportError:
        return "[XLSX parsing unavailable - openpyxl not installed]"
    except Exception as e:
        return f"[XLSX parse error: {e}]"

    # Sheet headers are always emitted, so emptiness is judged by cell values.
    if not has_values:
        return "[XLSX: empty workbook]"
    return "".join(chunks).strip()
95
+
96
+
97
def format_size(size_bytes):
    """Format a byte count as a human-readable string.

    Args:
        size_bytes: Size in bytes.

    Returns:
        "<n> B" below 1024 bytes, otherwise the value with one decimal in
        the largest fitting unit among KB, MB, GB and TB (1024-based).
    """
    if size_bytes < 1024:
        return f"{size_bytes} B"

    value = float(size_bytes)
    for unit in ("KB", "MB", "GB"):
        value /= 1024
        # Compare the rounded value so e.g. 1048575 bytes is shown as
        # "1.0 MB" rather than "1024.0 KB".
        if round(value, 1) < 1024:
            return f"{value:.1f} {unit}"
    return f"{value / 1024:.1f} TB"
105
+
106
+
107
def parse_zip(file_obj):
    """Parse an uploaded ZIP archive and summarize its members.

    Text, PDF, DOCX and XLSX members are decompressed and converted to text.
    Image and other binary members are only listed; their content is not
    read.

    Args:
        file_obj: Path to the archive as a string, an object whose ``name``
            attribute is that path, or None when nothing was uploaded.

    Returns:
        A 4-tuple ``(summary, table_rows, full_text, results)``:
        ``summary`` is a Markdown string (or an error message),
        ``table_rows`` is a list of
        ``[filename, extension, type, size, preview]`` rows,
        ``full_text`` is the concatenated extracted text, and
        ``results`` is a list of per-file dicts. On error the last three
        are ``[]``, ``""`` and ``[]``.
    """
    if file_obj is None:
        return "⚠️ Please upload a ZIP file.", [], "", []

    file_path = file_obj if isinstance(file_obj, str) else file_obj.name

    if not zipfile.is_zipfile(file_path):
        return "❌ The uploaded file is not a valid ZIP archive.", [], "", []

    # Uploaded archives are untrusted: cap how many decompressed bytes are
    # read per member so a zip bomb cannot exhaust memory. The declared
    # file_size in the header is not trusted for this.
    max_member_bytes = 50 * 1024 * 1024

    # Heading icon and bytes-to-text converter for each extracted category.
    extractors = {
        "text": ("📄", lambda raw: raw.decode("utf-8", errors="replace")),
        "pdf": ("📕", parse_pdf_content),
        "docx": ("📘", parse_docx_content),
        "xlsx": ("📊", parse_xlsx_content),
    }

    rule = "=" * 60
    results = []
    table_rows = []
    full_text_parts = []
    stats = {"total_files": 0, "text_files": 0, "pdf_files": 0, "docx_files": 0,
             "xlsx_files": 0, "image_files": 0, "binary_files": 0, "total_size": 0}

    try:
        with zipfile.ZipFile(file_path, "r") as zf:
            for info in zf.infolist():
                if info.is_dir():
                    continue

                stats["total_files"] += 1
                stats["total_size"] += info.file_size
                file_type, ext = get_file_type(info.filename)
                # Count the category up front so members that fail to read
                # still appear in the per-type totals.
                stats[f"{file_type}_files"] += 1

                if file_type == "image":
                    content_preview = f"[Image: {ext}]"
                elif file_type == "binary":
                    content_preview = f"[Binary file: {ext}]"
                else:
                    icon, extract = extractors[file_type]
                    try:
                        with zf.open(info) as member:
                            raw_data = member.read(max_member_bytes + 1)
                    except Exception as e:
                        # Best effort: report the failure and keep going.
                        content_preview = f"[Read error: {e}]"
                    else:
                        if len(raw_data) > max_member_bytes:
                            limit = format_size(max_member_bytes)
                            content_preview = f"[Skipped: content exceeds {limit}]"
                        else:
                            content = extract(raw_data)
                            content_preview = content[:2000]
                            full_text_parts.append(
                                f"\n{rule}\n{icon} {info.filename}\n{rule}\n{content}"
                            )

                size_formatted = format_size(info.file_size)
                results.append({
                    "filename": info.filename,
                    "type": file_type,
                    "extension": ext or "(none)",
                    "size": info.file_size,
                    "size_formatted": size_formatted,
                    "preview": content_preview[:500],
                })

                table_rows.append([
                    info.filename,
                    ext or "(none)",
                    file_type,
                    size_formatted,
                    content_preview[:200].replace("\n", " "),
                ])
    except zipfile.BadZipFile as e:
        # is_zipfile only checks for the end-of-archive record; the central
        # directory can still be corrupt.
        return f"❌ Could not read ZIP archive: {e}", [], "", []

    # Build summary
    summary = (
        "## 📦 ZIP Archive Summary\n"
        "\n"
        "| Metric | Value |\n"
        "|--------|-------|\n"
        f"| **Total files** | {stats['total_files']} |\n"
        f"| **Total size** | {format_size(stats['total_size'])} |\n"
        f"| **Text/Code files** | {stats['text_files']} |\n"
        f"| **PDF files** | {stats['pdf_files']} |\n"
        f"| **DOCX files** | {stats['docx_files']} |\n"
        f"| **XLSX files** | {stats['xlsx_files']} |\n"
        f"| **Image files** | {stats['image_files']} |\n"
        f"| **Binary files** | {stats['binary_files']} |\n"
    )

    full_text = "\n".join(full_text_parts) if full_text_parts else "(No text content extracted)"

    return summary, table_rows, full_text, results
205
+
206
+
207
def select_file_content(file_data_json, evt: gr.SelectData):
    """Render the stored preview of the file whose table row was clicked.

    Args:
        file_data_json: List of per-file dicts with "filename", "type",
            "size_formatted" and "preview" keys.
        evt: Selection event; ``evt.index`` is either a row index or a
            sequence whose first element is the row index.

    Returns:
        A Markdown string with the file's name, type, size and preview, or a
        short hint when no data is loaded or the row cannot be resolved.
    """
    if not file_data_json or not isinstance(file_data_json, list):
        return "Select a file from the table above."

    row_idx = evt.index[0] if isinstance(evt.index, (list, tuple)) else evt.index
    # A missing or non-integer index cannot be compared against the list length.
    if not isinstance(row_idx, int) or not 0 <= row_idx < len(file_data_json):
        return "File not found."

    item = file_data_json[row_idx]
    # The "preview" key may be present but empty (e.g. a zero-byte file).
    preview = item.get("preview") or "(no preview)"
    return (
        f"## 📄 {item['filename']}\n"
        f"**Type:** {item['type']} | **Size:** {item['size_formatted']}\n\n"
        f"```\n{preview}\n```"
    )
217
+
218
+
219
# ─── Gradio UI ───────────────────────────────────────────
# NOTE(review): the container nesting below was reconstructed from a diff view
# that had lost its indentation — confirm the layout against the running app.

with gr.Blocks(
    title="📦 Document Parser",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown(
        "\n"
        "# 📦 Document Parser\n"
        "Upload a **ZIP file** containing documents and this tool will parse and extract text from all supported formats.\n"
        "\n"
        "**Supported formats:** `.txt`, `.md`, `.py`, `.js`, `.json`, `.yaml`, `.csv`, `.html`, `.pdf`, `.docx`, `.xlsx`, and 30+ more text/code formats.\n"
    )

    with gr.Row():
        with gr.Column(scale=1):
            zip_input = gr.File(
                label="Upload ZIP File",
                file_types=[".zip"],
                type="filepath",
            )
            parse_btn = gr.Button("🔍 Parse Documents", variant="primary", size="lg")

    summary_output = gr.Markdown(label="Summary")

    with gr.Tabs():
        with gr.Tab("📋 File Listing"):
            file_table = gr.Dataframe(
                headers=["Filename", "Extension", "Type", "Size", "Preview"],
                label="Files in Archive",
                interactive=False,
                wrap=True,
            )
        with gr.Tab("📝 Extracted Text"):
            text_output = gr.Textbox(
                label="Full Extracted Text",
                lines=30,
                max_lines=100,
                show_copy_button=True,
            )
        with gr.Tab("🔎 File Detail"):
            gr.Markdown("*Click a row in the File Listing tab to see its full preview here.*")
            detail_output = gr.Markdown("Select a file from the table above.")
        with gr.Tab("📊 JSON Data"):
            json_output = gr.JSON(label="Structured Parse Results")

    # Hidden state for file data
    file_data_state = gr.State([])

    def run_parse(file_obj):
        """Run parse_zip and return its results for the five output components.

        The per-file result list is returned twice: once for the JSON view
        and once for the hidden state read by the row-selection handler.
        """
        summary, table, text, data = parse_zip(file_obj)
        return summary, table, text, data, data

    # The button click and the upload event refresh the same components.
    parse_outputs = [summary_output, file_table, text_output, json_output, file_data_state]

    parse_btn.click(
        fn=run_parse,
        inputs=zip_input,
        outputs=parse_outputs,
    )
    zip_input.upload(
        fn=run_parse,
        inputs=zip_input,
        outputs=parse_outputs,
    )

    file_table.select(
        fn=select_file_content,
        inputs=file_data_state,
        outputs=detail_output,
    )

if __name__ == "__main__":
    demo.launch()