mlbench123 committed on
Commit
e128ae3
·
verified ·
1 Parent(s): 209097b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +362 -0
app.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any
6
+ import google.generativeai as genai
7
+ from PIL import Image
8
+ import PyPDF2
9
+ import tempfile
10
+ import traceback
11
+
12
# ==============================================================
# Extraction prompt: instructions plus the exact JSON schema the
# model is asked to return. Passed to Gemini as the first content part.
# ==============================================================
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor with OCR capabilities.
Carefully analyze ALL text content from PDFs, images, and documents.

CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
is in tables, handwritten notes, stamps, or poorly scanned areas.

Extract and structure the data as valid JSON only (no markdown, no commentary):

{
  "poNumber": string | null,
  "shipFrom": string | null,
  "carrierType": string | null,
  "originCarrier": string | null,
  "railCarNumber": string | null,
  "totalQuantity": number | null,
  "totalUnits": string | null,
  "attachments": [string],
  "accountName": string | null,
  "inventories": {
    "items": [
      {
        "quantityShipped": number | null,
        "inventoryUnits": string | null,
        "pcs": number | null,
        "productName": string | null,
        "productCode": string | null,
        "product": {
          "category": string | null,
          "defaultUnits": string | null,
          "unit": number | null,
          "pcs": number | null,
          "mbf": number | null,
          "sf": number | null,
          "pcsHeight": number | null,
          "pcsWidth": number | null,
          "pcsLength": number | null
        },
        "customFields": [string]
      }
    ]
  }
}

EXTRACTION RULES:
1. Extract ALL product line items - create one inventory item per product
2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14
3. Convert BF to MBF: BF ÷ 1000
4. customFields format: "Key||Value" (e.g., "Mill||Tolko")
5. Look for: PO numbers, shipping info, quantities, product codes, dimensions
6. Check headers, footers, stamps, handwritten notes, and table cells
7. If multiple documents, consolidate all items into one JSON
8. Return null for missing fields
9. attachments should list all provided filenames

Return ONLY valid JSON matching this exact structure."""
70
+
71
+
72
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the extractable text of a PDF, one labelled section per page.

    Args:
        pdf_path: Filesystem path of the PDF to read.

    Returns:
        The page texts concatenated in order, each preceded by a
        "--- Page N ---" marker (N is 1-based). Pages for which PyPDF2
        returns no text are skipped. If nothing at all was extracted the
        placeholder "No text extracted from PDF" is returned. On any
        failure the function does not raise; it returns
        "Error extracting PDF text: <reason>" instead.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Collect the per-page chunks and join once, rather than growing
            # a string with += on every page.
            chunks = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                page_text = page.extract_text()
                if page_text:
                    chunks.append(f"\n--- Page {page_num} ---\n{page_text}")
        text = "".join(chunks)
        return text if text.strip() else "No text extracted from PDF"
    except Exception as e:
        # Broad on purpose: the caller embeds the returned string in the
        # document context, so every failure is reported as text.
        return f"Error extracting PDF text: {str(e)}"
85
+
86
+
87
def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
    """Read the given files and collect everything needed for a Gemini request.

    PDFs and images are uploaded with ``genai.upload_file``; PDFs, text/CSV
    files and Word documents additionally contribute extracted text. Paths
    that do not exist are skipped entirely. A failure on one file is recorded
    as text in the context and does not stop the remaining files.

    Args:
        files: Paths of the files to process. May be empty or None.

    Returns:
        A dict with four keys:
            "text_content": one "=== <file name> ===" section per file,
                concatenated into a single string.
            "file_objects": the objects returned by ``genai.upload_file``.
            "attachments": names of all files that existed on disk.
            "file_info": one "File: <name> (Type: <ext>)" line per file.
    """
    processed_data = {
        "text_content": "",
        "file_objects": [],
        "attachments": [],
        "file_info": []
    }

    if not files:
        return processed_data

    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp'}
    # Sections are collected in a list and joined once at the end.
    sections = []

    for file_path in files:
        if not os.path.exists(file_path):
            continue

        file_name = Path(file_path).name
        file_ext = Path(file_path).suffix.lower()
        header = f"\n\n=== {file_name} ===\n"

        processed_data["attachments"].append(file_name)
        processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")

        try:
            # Handle PDFs: text layer for context, upload for visual analysis
            if file_ext == '.pdf':
                text = extract_text_from_pdf(file_path)
                sections.append(f"{header}{text}")

                uploaded_file = genai.upload_file(file_path)
                processed_data["file_objects"].append(uploaded_file)

            # Handle images: upload only, there is no text to extract locally
            elif file_ext in image_extensions:
                uploaded_file = genai.upload_file(file_path)
                processed_data["file_objects"].append(uploaded_file)
                sections.append(
                    f"\n\n=== {file_name} (Image) ===\n[Image uploaded for visual analysis]"
                )

            # Handle text files
            elif file_ext in ['.txt', '.csv']:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                    text = f.read()
                sections.append(f"{header}{text}")

            # Handle Word documents (basic text extraction)
            elif file_ext in ['.doc', '.docx']:
                try:
                    import docx
                    doc = docx.Document(file_path)
                    lines = [para.text for para in doc.paragraphs]
                    # doc.paragraphs does not include text inside tables,
                    # so table rows are read explicitly, one line per row.
                    for table in doc.tables:
                        lines.extend(
                            " | ".join(cell.text.strip() for cell in row.cells)
                            for row in table.rows
                        )
                    text = "\n".join(lines)
                    sections.append(f"{header}{text}")
                except ImportError:
                    sections.append(
                        f"{header}[Word document - install python-docx for text extraction]"
                    )
                except Exception as e:
                    sections.append(f"{header}Error reading Word doc: {str(e)}")

            # Anything else is still listed as an attachment; say explicitly
            # that its content was not read.
            else:
                sections.append(f"{header}[Unsupported file type - content not read]")

        except Exception as e:
            sections.append(f"{header}Error processing: {str(e)}")

    processed_data["text_content"] = "".join(sections)
    return processed_data
148
+
149
+
150
def _extract_json_text(raw: str) -> str:
    """Return the JSON portion of a model reply, without Markdown fences or commentary."""
    cleaned = raw.strip()
    if cleaned.startswith("```"):
        cleaned = cleaned[3:]
        # Optional language tag right after the opening fence ("json", "JSON", ...)
        if cleaned[:4].lower() == "json":
            cleaned = cleaned[4:]
    if cleaned.endswith("```"):
        cleaned = cleaned[:-3]
    cleaned = cleaned.strip()

    # If the reply still starts with prose, keep only the outermost {...} span.
    if not cleaned.startswith(("{", "[")):
        start = cleaned.find("{")
        end = cleaned.rfind("}")
        if start != -1 and end > start:
            cleaned = cleaned[start:end + 1]
    return cleaned


def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.0-flash-exp") -> Dict[str, Any]:
    """Ask Gemini to turn the prepared documents into structured JSON.

    Args:
        processed_data: Dict providing "text_content", "attachments" and
            "file_objects" (the shape built by process_files_for_gemini).
        api_key: Gemini API key. Empty, blank or None is rejected before any
            API call is made.
        model_name: Name of the Gemini model to use.

    Returns:
        A dict that always contains "success" and never raises:
            success: {"success": True, "data": <parsed JSON>,
                "raw_response": <cleaned reply text>,
                "files_processed": <number of attachments>}
            reply is not JSON: {"success": False, "error", "raw_response",
                "suggestion"}
            any other failure: {"success": False, "error", "traceback"}
    """
    if not api_key or not api_key.strip():
        return {
            "success": False,
            "error": "Gemini API key not provided"
        }

    # Kept outside the try so the JSON error handler can report the raw reply
    # without probing locals().
    raw_text = None

    try:
        # Configure Gemini
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)

        # Build multimodal prompt: instructions, extracted text, file names
        content_parts = [
            EXTRACTION_PROMPT,
            f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
            f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
            "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
        ]

        # Add all uploaded files
        content_parts.extend(processed_data["file_objects"])

        # Low temperature: extraction should be repeatable, not creative
        generation_config = genai.types.GenerationConfig(
            temperature=0.2,
            max_output_tokens=8000,
        )

        response = model.generate_content(
            content_parts,
            generation_config=generation_config
        )

        raw_text = response.text
        response_text = _extract_json_text(raw_text)

        # Parse JSON
        extracted_data = json.loads(response_text)

        return {
            "success": True,
            "data": extracted_data,
            "raw_response": response_text,
            # Count every file that was fed in, not only the uploaded ones;
            # text and Word files contribute through the context instead.
            "files_processed": len(processed_data["attachments"])
        }

    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(e)}",
            "raw_response": raw_text if raw_text is not None else "No response",
            "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Extraction error: {str(e)}",
            "traceback": traceback.format_exc()
        }
223
+
224
+
225
def process_documents(files, api_key, model_choice):
    """Gradio callback: run the full upload -> extract -> format pipeline.

    Args:
        files: Uploaded files from the Gradio file input; each entry is
            either a path string or an object exposing the path as ``.name``.
        api_key: Gemini API key typed by the user.
        model_choice: Gemini model name selected in the UI.

    Returns:
        A 3-tuple of strings ``(status, json_text, preview)``. On failure
        ``json_text`` is "{}" and ``preview`` carries the diagnostic text
        (raw model reply or error message). The function never raises.
    """
    if not files:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"

    if not api_key or not api_key.strip():
        return "❌ Error: Please enter your Gemini API key", "{}", "API key missing"

    try:
        # Tolerate whitespace pasted along with the key
        api_key = api_key.strip()

        # process_files_for_gemini uploads PDFs and images through genai, so
        # the client must be configured with this caller's key before then.
        # Otherwise uploads run unauthenticated, or with whatever key an
        # earlier request left configured.
        genai.configure(api_key=api_key)

        # Get file paths
        file_paths = [f.name if hasattr(f, 'name') else f for f in files]

        status_msg = f"📄 Processing {len(file_paths)} file(s)...\n"

        # Process files
        processed_data = process_files_for_gemini(file_paths)
        status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"

        # Extract with Gemini
        status_msg += "🤖 Extracting data with Gemini AI...\n"
        result = extract_with_gemini(processed_data, api_key, model_choice)

        if not result.get("success"):
            error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
            if 'suggestion' in result:
                error_msg += f"\n💡 {result['suggestion']}\n"
            if 'traceback' in result:
                # Truncated so the status box stays readable
                error_msg += f"\nDebug info:\n{result['traceback'][:500]}"

            raw_resp = result.get('raw_response', 'No response')
            return error_msg, "{}", f"Raw Response:\n{raw_resp[:1000]}"

        json_output = json.dumps(result["data"], indent=2)
        status_msg += f"✅ Extraction successful! Processed {result.get('files_processed', 0)} files.\n"

        # Format display output
        display_text = "=== EXTRACTED DATA ===\n\n" + json_output

        return status_msg, json_output, display_text

    except Exception as e:
        # Last-resort boundary: Gradio should always get three strings back
        error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:500]}"
        return error_msg, "{}", error_msg
270
+
271
+
272
+ # ==============================================================
273
+ # Gradio Interface
274
+ # ==============================================================
275
+
276
def create_interface():
    """Build and return the Gradio Blocks UI for the document extractor.

    The left column holds the inputs (API key, model, files, submit button);
    the right column holds the three outputs filled by process_documents
    (status, JSON, preview).

    Returns:
        The configured ``gr.Blocks`` instance; the caller launches it.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # indentation was lost - confirm the tips block belongs outside the row.
    with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
        gr.Markdown("""
        # 📄 Shipping Document Data Extractor

        Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.

        **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
        """)

        with gr.Row():
            with gr.Column(scale=2):
                api_key_input = gr.Textbox(
                    label="🔑 Gemini API Key",
                    placeholder="Enter your Google Gemini API key (AIza...)",
                    type="password",
                    info="Get your key from https://aistudio.google.com/apikey"
                )

                model_choice = gr.Dropdown(
                    choices=["gemini-2.0-flash-exp", "gemini-1.5-pro", "gemini-1.5-flash"],
                    value="gemini-2.0-flash-exp",
                    label="Model Selection",
                    info="Latest model recommended for best results"
                )

                file_input = gr.File(
                    label="📎 Upload Documents",
                    file_count="multiple",
                    # Includes .webp so the upload filter matches the image
                    # types process_files_for_gemini accepts.
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp", ".txt", ".csv", ".doc", ".docx"]
                )

                submit_btn = gr.Button("🚀 Extract Data", variant="primary", size="lg")

            with gr.Column(scale=3):
                status_output = gr.Textbox(
                    label="📊 Status",
                    lines=4,
                    max_lines=8
                )

                json_output = gr.Code(
                    label="📋 JSON Output (Copy this)",
                    language="json",
                    lines=15
                )

                display_output = gr.Textbox(
                    label="👁️ Preview",
                    lines=10,
                    max_lines=15
                )

        gr.Markdown("""
        ### 💡 Tips:
        - Upload multiple files for batch processing
        - For images: ensure text is clear and well-lit
        - For PDFs: both text-based and scanned PDFs work
        - The AI will analyze visual content even if text extraction fails
        """)

        # Button action
        submit_btn.click(
            fn=process_documents,
            inputs=[file_input, api_key_input, model_choice],
            outputs=[status_output, json_output, display_output]
        )

        # Examples: only registered when the sample file is really present,
        # so the UI never offers an example that points at a missing file.
        example_files = ["example1.pdf"]
        if all(os.path.exists(path) for path in example_files):
            gr.Examples(
                examples=[
                    [example_files, "your-api-key-here"],
                ],
                inputs=[file_input, api_key_input],
                label="Example Usage"
            )

    return demo
354
+
355
+
356
if __name__ == "__main__":
    # Script entry point: build the UI, then serve it on all interfaces at
    # port 7860 without creating a public Gradio share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = create_interface()
    demo.launch(**launch_options)