Ayesha-Majeed committed on
Commit
886d641
·
verified Β·
1 Parent(s): e2f5d23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +314 -142
app.py CHANGED
@@ -1,185 +1,357 @@
1
  import gradio as gr
2
  import json
 
3
  from pathlib import Path
4
  from typing import List, Dict, Any
 
5
  from PIL import Image
6
  import PyPDF2
7
- import pytesseract
8
- import google.generativeai as genai
9
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
10
 
 
 
11
 
12
# ==================== Configure Gemini API ====================
import os  # local import: the key now comes from the environment (see below)

# SECURITY FIX: the API key was hard-coded in source (and therefore published
# with the repository). Read it from the environment instead; set the
# GEMINI_API_KEY variable in the deployment secrets.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if GEMINI_API_KEY:
    genai.configure(api_key=GEMINI_API_KEY)

# Prompt sent to Gemini: target JSON schema for shipping documents plus
# extraction rules. The model is instructed to answer with bare JSON.
EXTRACTION_PROMPT = """You are a shipping document data extraction specialist. Extract structured data from the provided shipping/logistics documents.
Extract the following fields into a JSON format:
{
"poNumber": "Purchase Order Number",
"shipFrom": "Origin/Ship From Location",
"carrierType": "Transportation type (RAIL/TRUCK/etc)",
"originCarrier": "Carrier name (CN/CPRS/etc)",
"railCarNumber": "Rail car identifier",
"totalQuantity": "Total quantity as number",
"totalUnits": "Unit type (UNIT/MBF/MSFT/etc)",
"accountName": "Customer/Account name",
"inventories": {
"items": [
{
"quantityShipped": "Quantity as number",
"inventoryUnits": "Unit type",
"productName": "Full product description",
"productCode": "Product code/SKU",
"product": {
"category": "Product category (OSB/Lumber/etc)",
"unit": "Unit count as number",
"pcs": "Pieces per unit",
"mbf": "Thousand board feet (if applicable)",
"sf": "Square feet (if applicable)",
"pcsHeight": "Height in inches",
"pcsWidth": "Width in inches",
"pcsLength": "Length in feet"
},
"customFields": [
"Mill||Mill Name",
"Vendor||Vendor Name"
]
}
]
}
}
IMPORTANT INSTRUCTIONS:
1. Extract ALL products/items found in the document
2. Convert text numbers to actual numbers (e.g., "54" → 54)
3. Parse dimensions carefully, Do NOT convert units
4. Calculate MBF/SF when possible from dimensions and piece count
5. If a field is not found, use null
6. For multiple products, create separate items
7. Extract custom fields like Mill, Vendor
Return ONLY valid JSON, no markdown formatting or explanations."""
62
-
63
# ==================== Utility functions ====================
def extract_text_from_pdf(pdf_file) -> str:
    """Return the concatenated text of every page in *pdf_file*.

    Failures are reported in-band as an "Error extracting PDF text: ..."
    string, since callers treat the result as document text either way.
    """
    try:
        reader = PyPDF2.PdfReader(pdf_file)
        pages = [page.extract_text() + "\n" for page in reader.pages]
        return "".join(pages)
    except Exception as exc:
        return f"Error extracting PDF text: {str(exc)}"
73
 
74
def convert_pdf_to_images(pdf_file) -> List[Image.Image]:
    """Rasterize *pdf_file* into one PIL image per page.

    Requires the optional ``pdf2image`` package; on any failure the error is
    printed and an empty list is returned (best-effort, never raises).
    """
    try:
        from pdf2image import convert_from_path
        return convert_from_path(pdf_file)
    except Exception as exc:
        print(f"Error converting PDF to images: {exc}")
        return []
 
 
 
 
 
82
 
83
def extract_text_from_image(img: Image.Image) -> str:
    """OCR *img* with pytesseract; return "" if OCR fails.

    Errors are printed rather than raised so one bad image does not abort
    a multi-file batch.
    """
    try:
        return pytesseract.image_to_string(img)
    except Exception as exc:
        print(f"Error extracting text from image: {exc}")
        return ""
90
 
91
def process_files(files: List[str]) -> Dict[str, Any]:
    """Read every uploaded file and aggregate its text and images.

    Returns a dict with:
      files         -- per-file metadata ({filename, type, content})
      combined_text -- all extracted text, one "--- name ---" section per file
      images        -- PIL images (PDF pages and uploaded pictures)
    """
    result: Dict[str, Any] = {"files": [], "combined_text": "", "images": []}

    for file_path in files:
        path = Path(file_path)
        file_name = path.name
        file_ext = path.suffix.lower()
        entry = {"filename": file_name, "type": file_ext, "content": ""}

        try:
            if file_ext == '.pdf':
                text = extract_text_from_pdf(file_path)
                entry["content"] = text
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                # Page images let the model see layout that text extraction misses.
                result["images"].extend(convert_pdf_to_images(file_path))
            elif file_ext in ['.jpg', '.jpeg', '.png', '.bmp', '.gif']:
                img = Image.open(file_path)
                result["images"].append(img)
                text = extract_text_from_image(img)
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                entry["content"] = f"Image file: {file_name}"
            elif file_ext in ['.txt']:
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                result["combined_text"] += f"\n--- {file_name} ---\n{text}\n"
                entry["content"] = text
            result["files"].append(entry)
        except Exception as exc:
            # Record the failure in place of the content; the file still
            # appears in the listing.
            entry["content"] = f"Error processing file: {str(exc)}"
            result["files"].append(entry)

    return result
130
 
131
def extract_with_gemini(processed_data: Dict[str, Any]) -> Dict[str, Any]:
    """Ask Gemini to turn the aggregated document data into structured JSON.

    Sends the extraction prompt, the combined text, and up to five images to
    the model; returns the parsed JSON dict, or {"error": ...} on any failure.
    """
    try:
        model = genai.GenerativeModel('models/gemini-2.5-flash')
        parts = [EXTRACTION_PROMPT]
        combined = processed_data["combined_text"]
        if combined:
            parts.append(f"\nDocument Text:\n{combined}")
        # Cap at five images to keep the request size reasonable.
        parts.extend(processed_data["images"][:5])

        reply = model.generate_content(parts)
        raw = reply.text.strip()
        # Strip Markdown code fences the model sometimes adds.
        raw = raw.replace("```json", "").replace("```", "")
        return json.loads(raw)
    except Exception as exc:
        return {"error": str(exc)}
148
-
149
# ==================== Gradio function ====================
def gradio_extraction(uploaded_files):
    """Gradio handler: copy uploads to temp, extract, return (json_str, path).

    Writes the extracted JSON to ``output.json`` and returns both the
    pretty-printed JSON string and that file path for download.
    """
    file_paths = []
    for upload in uploaded_files:
        src_path = Path(upload.name)
        dst_path = Path(tempfile.gettempdir()) / src_path.name
        # Copy the upload into a stable temp location before processing.
        with open(src_path, "rb") as src, open(dst_path, "wb") as dst:
            dst.write(src.read())
        file_paths.append(str(dst_path))

    extracted_data = extract_with_gemini(process_files(file_paths))

    with open("output.json", "w", encoding="utf-8") as f:
        json.dump(extracted_data, f, indent=2)

    return json.dumps(extracted_data, indent=2), "output.json"
170
-
171
-
172
# ==================== Gradio Interface ====================
# FIX: launching at import time blocks any importer of this module; guard the
# launch so the server only starts when the file is run as a script (Gradio /
# HF Spaces execute app.py as __main__, so deployed behavior is unchanged).
iface = gr.Interface(
    fn=gradio_extraction,
    inputs=gr.File(file_types=[".pdf", ".jpg", ".jpeg", ".png", ".bmp", ".txt"], file_count="multiple"),
    outputs=[
        gr.Textbox(label="Extracted JSON", lines=15, max_lines=30),
        gr.File(label="Download JSON"),
    ],
    title="Shipping Document Text Extractor",
    description="Upload PDFs or images of shipping/logistics documents and get structured JSON output.",
    theme=gr.themes.Base(primary_hue="blue"),
)

if __name__ == "__main__":
    iface.launch()
 
1
  import gradio as gr
2
  import json
3
+ import os
4
  from pathlib import Path
5
  from typing import List, Dict, Any
6
+ import google.generativeai as genai
7
  from PIL import Image
8
  import PyPDF2
 
 
9
  import tempfile
10
+ import traceback
11
+
12
# ==============================================================
# API Configuration
# ==============================================================
# SECURITY FIX: the Gemini key was hard-coded here (and therefore published
# with the repository). Read it from the environment instead; set the
# GEMINI_API_KEY variable in the Space/host secrets.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
16
# ==============================================================
# Enhanced extraction prompt with better instructions
# ==============================================================
# BUG FIX: the schema example contained the malformed line
#   "pcs": "pcs": "Pieces per unit",
# (duplicated key token — not valid JSON), which undermines the instruction
# that the model must match this exact structure.
EXTRACTION_PROMPT = """You are an expert shipping-document data extractor with OCR capabilities.
Carefully analyze ALL text content from PDFs, images, and documents.

CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
is in tables, handwritten notes, stamps, or poorly scanned areas.

Extract and structure the data as valid JSON only (no markdown, no commentary):

{
"poNumber": string | null,
"shipFrom": string | null,
"carrierType": string | null,
"originCarrier": string | null,
"railCarNumber": string | null,
"totalQuantity": number | null,
"totalUnits": string | null,
"attachments": [string],
"accountName": string | null,
"inventories": {
"items": [
{
"quantityShipped": "Quantity as number, no of packages",
"inventoryUnits": string | null,
"pcs": number | null,
"productName": string | null,
"productCode": string | null,
"product": {
"category": "Product category (OSB/Lumber/etc)",
"defaultUnits": string | null,
"unit": "Unit type from document (MBF, FBM, SF, UNIT etc.)",
"pcs": "Pieces per unit",
"mbf": number | null,
"sf": number | null,
"pcsHeight": number | null,
"pcsWidth": number | null,
"pcsLength": number | null
},
"customFields": [string]
}
]
}
}

EXTRACTION RULES:
1. Extract ALL product line items - create one inventory item per product
2. Parse dimensions: "2X6X14" → pcsHeight=2, pcsWidth=6, pcsLength=14
3. BF = totalQuantity
4. Convert BF to MBF: BF ÷ 1000
5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
7. Check headers, footers, stamps, handwritten notes, and table cells
8. If multiple documents, consolidate all items into one JSON
9. Return null for missing fields
10. attachments should list all provided filenames

Return ONLY valid JSON matching this exact structure."""
75
 
76
+
77
def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract text from a PDF file, labelling each page.

    Returns the concatenated page text, a placeholder string when the PDF
    yields no text at all, or an in-band "Error extracting PDF text: ..."
    message on failure.
    """
    try:
        with open(pdf_path, 'rb') as fh:
            reader = PyPDF2.PdfReader(fh)
            chunks = []
            for page_num, page in enumerate(reader.pages):
                page_text = page.extract_text()
                if page_text:
                    chunks.append(f"\n--- Page {page_num + 1} ---\n{page_text}")
            text = "".join(chunks)
        return text if text.strip() else "No text extracted from PDF"
    except Exception as exc:
        return f"Error extracting PDF text: {str(exc)}"
 
90
 
91
+
92
def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
    """Collect local text and upload binary documents to Gemini.

    Returns a dict with:
      text_content -- locally extracted text, one "=== name ===" section per file
      file_objects -- handles returned by genai.upload_file (PDFs/images)
      attachments  -- names of all processed files
      file_info    -- human-readable "File: name (Type: ext)" strings
    """
    bundle: Dict[str, Any] = {
        "text_content": "",
        "file_objects": [],
        "attachments": [],
        "file_info": [],
    }
    if not files:
        return bundle

    for file_path in files:
        # Silently skip paths that vanished between upload and processing.
        if not os.path.exists(file_path):
            continue

        path = Path(file_path)
        file_name = path.name
        file_ext = path.suffix.lower()
        bundle["attachments"].append(file_name)
        bundle["file_info"].append(f"File: {file_name} (Type: {file_ext})")

        try:
            if file_ext == '.pdf':
                # Local text extraction plus a Gemini upload for visual analysis.
                text = extract_text_from_pdf(file_path)
                bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
                bundle["file_objects"].append(genai.upload_file(file_path))
            elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
                bundle["file_objects"].append(genai.upload_file(file_path))
                bundle["text_content"] += f"\n\n=== {file_name} (Image) ===\n[Image uploaded for visual analysis]"
            elif file_ext in ['.txt', '.csv']:
                with open(file_path, 'r', encoding='utf-8', errors='ignore') as fh:
                    text = fh.read()
                bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
            elif file_ext in ['.doc', '.docx']:
                # Best-effort Word support; python-docx is optional.
                try:
                    import docx
                    document = docx.Document(file_path)
                    text = "\n".join([para.text for para in document.paragraphs])
                    bundle["text_content"] += f"\n\n=== {file_name} ===\n{text}"
                except ImportError:
                    bundle["text_content"] += f"\n\n=== {file_name} ===\n[Word document - install python-docx for text extraction]"
                except Exception as exc:
                    bundle["text_content"] += f"\n\n=== {file_name} ===\nError reading Word doc: {str(exc)}"
        except Exception as exc:
            # Record the failure inline so downstream prompting still sees
            # which file went wrong.
            bundle["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(exc)}"

    return bundle
153
 
154
+
155
def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.5-flash") -> Dict[str, Any]:
    """Run the multimodal extraction prompt through Gemini.

    Returns {"success": True, "data": ..., "raw_response": ...,
    "files_processed": n} on success, or {"success": False, "error": ...}
    (optionally with "raw_response"/"suggestion"/"traceback") on failure.
    """
    if not api_key or api_key.strip() == "":
        return {
            "success": False,
            "error": "Gemini API key not provided"
        }

    try:
        genai.configure(api_key=api_key)
        model = genai.GenerativeModel(model_name)

        # Prompt + extracted text + attachment list, followed by the files
        # uploaded to Gemini for visual analysis.
        request = [
            EXTRACTION_PROMPT,
            f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
            f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
            "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
        ]
        request.extend(processed_data["file_objects"])

        # Low temperature for deterministic extraction.
        config = genai.types.GenerationConfig(
            temperature=0.2,
            max_output_tokens=8000,
        )
        response = model.generate_content(request, generation_config=config)

        raw = response.text.strip()
        # Strip a surrounding Markdown code fence, if the model added one.
        if raw.startswith("```json"):
            raw = raw[7:]
        elif raw.startswith("```"):
            raw = raw[3:]
        if raw.endswith("```"):
            raw = raw[:-3]
        raw = raw.strip()

        return {
            "success": True,
            "data": json.loads(raw),
            "raw_response": raw,
            "files_processed": len(processed_data["file_objects"])
        }

    except json.JSONDecodeError as exc:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(exc)}",
            "raw_response": response.text if 'response' in locals() else "No response",
            "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
        }
    except Exception as exc:
        return {
            "success": False,
            "error": f"Extraction error: {str(exc)}",
            "traceback": traceback.format_exc()
        }
228
 
 
 
 
 
229
 
230
def process_documents(files):
    """Main Gradio handler.

    Returns a 3-tuple of strings: (status log, JSON output, preview text).
    Errors are reported through the same tuple rather than raised.
    """
    if not files or len(files) == 0:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"

    api_key = GEMINI_API_KEY
    # NOTE(review): extract_with_gemini defaults to "gemini-2.5-flash" but
    # this handler pins "gemini-2.0-flash" — confirm which model is intended.
    model_choice = "gemini-2.0-flash"

    if not api_key or api_key.strip() == "":
        return "❌ Error: API key not configured in code", "{}", "API key missing"

    try:
        # Gradio may hand back file objects or plain path strings.
        file_paths = [f.name if hasattr(f, 'name') else f for f in files]

        status_msg = f"📄 Processing {len(file_paths)} file(s)...\n"

        processed_data = process_files_for_gemini(file_paths)
        status_msg += f"✓ Files loaded: {', '.join(processed_data['attachments'])}\n"

        status_msg += "🤖 Extracting data with Gemini AI...\n"
        result = extract_with_gemini(processed_data, api_key, model_choice)

        if result.get("success"):
            json_output = json.dumps(result["data"], indent=2)
            status_msg += f"✅ Extraction successful! Processed {result.get('files_processed', 0)} files.\n"
            display_text = "=== EXTRACTED DATA ===\n\n" + json_output
            return status_msg, json_output, display_text

        error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
        if 'suggestion' in result:
            error_msg += f"\n💡 {result['suggestion']}\n"
        if 'traceback' in result:
            error_msg += f"\nDebug info:\n{result['traceback'][:500]}"

        raw_resp = result.get('raw_response', 'No response')
        return error_msg, "{}", f"Raw Response:\n{raw_resp[:1000]}"

    except Exception as exc:
        error_msg = f"❌ Unexpected error: {str(exc)}\n{traceback.format_exc()[:500]}"
        return error_msg, "{}", error_msg
279
+
280
+
281
# ==============================================================
# Gradio Interface
# ==============================================================

def create_interface():
    """Assemble the Gradio Blocks UI; returns the demo without launching it."""
    with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
        gr.Markdown("""
# 📄 Shipping Document Data Extractor

Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.

**Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
""")

        with gr.Row():
            with gr.Column(scale=2):
                # Multi-file uploader; extensions match the branches handled
                # in process_files_for_gemini.
                file_input = gr.File(
                    label="📎 Upload Documents",
                    file_count="multiple",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
                )

                submit_btn = gr.Button("🚀 Extract Data", variant="primary", size="lg")

            with gr.Column(scale=3):
                # Progress / error log written by process_documents.
                status_output = gr.Textbox(
                    label="📊 Status",
                    lines=4,
                    max_lines=8
                )

                json_output = gr.Code(
                    label="📋 JSON Output (Copy this)",
                    language="json",
                    lines=15
                )

                display_output = gr.Textbox(
                    label="👁️ Preview",
                    lines=10,
                    max_lines=15
                )

        gr.Markdown("""
### 💡 Tips:
- Upload multiple files for batch processing
- For images: ensure text is clear and well-lit
- For PDFs: both text-based and scanned PDFs work
- The AI will analyze visual content even if text extraction fails
""")

        # Button action: outputs map 1:1 to process_documents' return tuple.
        submit_btn.click(
            fn=process_documents,
            inputs=[file_input],
            outputs=[status_output, json_output, display_output]
        )

        # Examples
        # NOTE(review): "example1.pdf" must exist next to app.py or Gradio
        # will warn/fail at startup — confirm the file ships with the app.
        gr.Examples(
            examples=[
                [["example1.pdf"]],
            ],
            inputs=[file_input],
            label="Example Usage"
        )

    return demo
349
+
350
+
351
if __name__ == "__main__":
    # Bind to all interfaces on the standard Hugging Face Spaces port;
    # no public share link.
    app = create_interface()
    app.launch(server_name="0.0.0.0", server_port=7860, share=False)