mlbench123 committed on
Commit
ec91976
Β·
verified Β·
1 Parent(s): e478996

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +241 -188
app.py CHANGED
@@ -2,25 +2,25 @@ import gradio as gr
2
  import json
3
  import os
4
  from pathlib import Path
5
- from typing import List, Dict, Any
6
- import google.generativeai as genai
 
7
  from PIL import Image
8
  import PyPDF2
9
- import tempfile
10
- import traceback
 
 
 
 
 
 
11
 
12
  # ==============================================================
13
- # API Configuration - Add your key here
14
- # ==============================================================
15
- GEMINI_API_KEY = "AIzaSyDbIO57s0DlXMXRoKHKKrJNUcKytwbee-g"
16
- # ==============================================================
17
- # Enhanced extraction prompt with better instructions
18
  # ==============================================================
19
- EXTRACTION_PROMPT = """You are an expert shipping-document data extractor with OCR capabilities.
20
- Carefully analyze ALL text content from PDFs, images, and documents.
21
-
22
- CRITICAL: Look at both the text AND the visual layout of documents. Sometimes important data
23
- is in tables, handwritten notes, stamps, or poorly scanned areas.
24
 
25
  Extract and structure the data as valid JSON only (no markdown, no commentary):
26
 
@@ -60,238 +60,297 @@ Extract and structure the data as valid JSON only (no markdown, no commentary):
60
  }
61
 
62
  EXTRACTION RULES:
63
- 1. Extract ALL product line items - create one inventory item per product
64
- 2. Parse dimensions: "2X6X14" β†’ pcsHeight=2, pcsWidth=6, pcsLength=14
65
- 3. BF = totalQuantity
66
  4. Convert BF to MBF: BF Γ· 1000
67
  5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
68
  6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
69
- 7. Check headers, footers, stamps, handwritten notes, and table cells
70
- 8. If multiple documents, consolidate all items into one JSON
71
- 9. Return null for missing fields
72
- 10.attachments should list all provided filenames
73
 
74
  Return ONLY valid JSON matching this exact structure."""
75
 
76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  def extract_text_from_pdf(pdf_path: str) -> str:
78
- """Extract text from PDF with better error handling"""
79
  try:
80
- with open(pdf_path, 'rb') as file:
81
  pdf_reader = PyPDF2.PdfReader(file)
82
  text = ""
83
  for page_num, page in enumerate(pdf_reader.pages):
84
  page_text = page.extract_text()
85
  if page_text:
86
- text += f"\n--- Page {page_num + 1} ---\n{page_text}"
87
- return text if text.strip() else "No text extracted from PDF"
88
  except Exception as e:
89
  return f"Error extracting PDF text: {str(e)}"
90
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- def process_files_for_gemini(files: List[str]) -> Dict[str, Any]:
93
- """Process files and prepare for Gemini multimodal input"""
94
  processed_data = {
95
  "text_content": "",
96
- "file_objects": [],
97
  "attachments": [],
98
- "file_info": []
99
  }
100
-
101
  if not files:
102
  return processed_data
103
-
104
  for file_path in files:
105
  if not os.path.exists(file_path):
106
  continue
107
-
108
  file_name = Path(file_path).name
109
  file_ext = Path(file_path).suffix.lower()
110
-
111
  processed_data["attachments"].append(file_name)
112
  processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")
113
-
114
  try:
115
- # Handle PDFs
116
- if file_ext == '.pdf':
117
- text = extract_text_from_pdf(file_path)
118
  processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
119
-
120
- # Upload PDF to Gemini for visual analysis
121
- uploaded_file = genai.upload_file(file_path)
122
- processed_data["file_objects"].append(uploaded_file)
123
-
124
- # Handle images
125
- elif file_ext in ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.webp']:
126
- # Upload image to Gemini
127
- uploaded_file = genai.upload_file(file_path)
128
- processed_data["file_objects"].append(uploaded_file)
129
- processed_data["text_content"] += f"\n\n=== {file_name} (Image) ===\n[Image uploaded for visual analysis]"
130
-
131
- # Handle text files
132
- elif file_ext in ['.txt', '.csv']:
133
- with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
134
- text = f.read()
135
  processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
136
-
137
- # Handle Word documents (basic text extraction)
138
- elif file_ext in ['.doc', '.docx']:
139
- try:
140
- import docx
141
- doc = docx.Document(file_path)
142
- text = "\n".join([para.text for para in doc.paragraphs])
143
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"
144
- except ImportError:
145
- processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Word document - install python-docx for text extraction]"
146
- except Exception as e:
147
- processed_data["text_content"] += f"\n\n=== {file_name} ===\nError reading Word doc: {str(e)}"
148
-
149
  except Exception as e:
150
  processed_data["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(e)}"
151
-
152
  return processed_data
153
 
154
 
155
- def extract_with_gemini(processed_data: Dict[str, Any], api_key: str, model_name: str = "gemini-2.0-flash") -> Dict[str, Any]:
156
- """Extract structured data using Gemini with enhanced multimodal processing"""
157
-
158
- if not api_key or api_key.strip() == "":
159
- return {
160
- "success": False,
161
- "error": "Gemini API key not provided"
162
- }
163
-
 
 
 
 
164
  try:
165
- # Configure Gemini
166
- genai.configure(api_key=api_key)
167
-
168
- # Use the latest model with vision capabilities
169
- model = genai.GenerativeModel(model_name)
170
-
171
- # Build multimodal prompt
172
- content_parts = [
173
- EXTRACTION_PROMPT,
174
- f"\n\nDOCUMENT CONTEXT:\n{processed_data['text_content']}\n",
175
- f"\nATTACHMENTS: {json.dumps(processed_data['attachments'])}\n",
176
- "\nNow analyze the uploaded files carefully (including visual content) and extract the data as JSON:"
177
- ]
178
-
179
- # Add all uploaded files
180
- content_parts.extend(processed_data["file_objects"])
181
-
182
- # Generate with higher temperature for better extraction
183
- generation_config = genai.types.GenerationConfig(
184
- temperature=0.2,
185
- max_output_tokens=8000,
186
  )
187
-
188
- response = model.generate_content(
189
- content_parts,
190
- generation_config=generation_config
 
 
 
 
191
  )
192
-
193
- response_text = response.text.strip()
194
-
195
- # Clean markdown code blocks
196
- if response_text.startswith("```json"):
197
- response_text = response_text[7:]
198
- elif response_text.startswith("```"):
199
- response_text = response_text[3:]
200
- if response_text.endswith("```"):
201
- response_text = response_text[:-3]
202
-
203
- response_text = response_text.strip()
204
-
205
- # Parse JSON
206
- extracted_data = json.loads(response_text)
207
-
208
  return {
209
  "success": True,
210
  "data": extracted_data,
211
- "raw_response": response_text,
212
- "files_processed": len(processed_data["file_objects"])
213
  }
214
-
215
  except json.JSONDecodeError as e:
216
  return {
217
  "success": False,
218
  "error": f"JSON parsing error: {str(e)}",
219
- "raw_response": response.text if 'response' in locals() else "No response",
220
- "suggestion": "The AI returned non-JSON text. Try again or check the raw response."
 
 
 
221
  }
222
  except Exception as e:
223
  return {
224
  "success": False,
225
  "error": f"Extraction error: {str(e)}",
226
- "traceback": traceback.format_exc()
227
  }
228
 
229
 
 
 
 
 
230
  def process_documents(files):
231
- """Main Gradio processing function"""
232
-
233
  if not files or len(files) == 0:
234
  return "❌ Error: Please upload at least one file", "{}", "No files provided"
235
-
236
- # Use the hardcoded API key and default model
237
- api_key = GEMINI_API_KEY
238
- model_choice = "gemini-2.0-flash"
239
-
240
- if not api_key or api_key.strip() == "":
241
- return "❌ Error: API key not configured in code", "{}", "API key missing"
242
-
243
  try:
244
- # Get file paths
245
- file_paths = [f.name if hasattr(f, 'name') else f for f in files]
246
-
247
  status_msg = f"πŸ“„ Processing {len(file_paths)} file(s)...\n"
248
-
249
- # Process files
250
- processed_data = process_files_for_gemini(file_paths)
251
  status_msg += f"βœ“ Files loaded: {', '.join(processed_data['attachments'])}\n"
252
-
253
- # Extract with Gemini
254
- status_msg += "πŸ€– Extracting data with Gemini AI...\n"
255
- result = extract_with_gemini(processed_data, api_key, model_choice)
256
-
 
 
 
 
 
 
 
 
 
 
257
  if result.get("success"):
258
  json_output = json.dumps(result["data"], indent=2)
259
- status_msg += f"βœ… Extraction successful! Processed {result.get('files_processed', 0)} files.\n"
260
-
261
- # Format display output
262
- display_text = "=== EXTRACTED DATA ===\n\n"
263
- display_text += json_output
264
-
265
  return status_msg, json_output, display_text
266
- else:
267
- error_msg = f"❌ Extraction failed:\n{result.get('error', 'Unknown error')}\n"
268
- if 'suggestion' in result:
269
- error_msg += f"\nπŸ’‘ {result['suggestion']}\n"
270
- if 'traceback' in result:
271
- error_msg += f"\nDebug info:\n{result['traceback'][:500]}"
272
-
273
- raw_resp = result.get('raw_response', 'No response')
274
- return error_msg, "{}", f"Raw Response:\n{raw_resp[:1000]}"
275
-
 
276
  except Exception as e:
277
- error_msg = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:500]}"
278
  return error_msg, "{}", error_msg
279
 
280
 
281
  # ==============================================================
282
- # Gradio Interface
283
  # ==============================================================
284
 
285
  def create_interface():
286
  with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
287
  gr.Markdown("""
288
  # πŸ“„ Shipping Document Data Extractor
289
-
290
- Upload PDFs, images, Word docs, or text files to extract structured shipping data using Google Gemini AI.
291
-
 
292
  **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
293
  """)
294
-
295
  with gr.Row():
296
  with gr.Column(scale=2):
297
  file_input = gr.File(
@@ -299,62 +358,56 @@ def create_interface():
299
  file_count="multiple",
300
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
301
  )
302
-
303
- # Add example button here
304
  gr.Markdown("**Try with example:**")
305
  example_btn = gr.Button("πŸ“„ Load Example PDF", size="sm", variant="secondary")
306
-
307
  submit_btn = gr.Button("πŸš€ Extract Data", variant="primary", size="lg")
308
-
309
  with gr.Column(scale=3):
310
  status_output = gr.Textbox(
311
  label="πŸ“Š Status",
312
  lines=4,
313
  max_lines=8
314
  )
315
-
316
  json_output = gr.Code(
317
  label="πŸ“‹ JSON Output (Copy this)",
318
  language="json",
319
  lines=15
320
  )
321
-
322
  display_output = gr.Textbox(
323
  label="πŸ‘οΈ Preview",
324
  lines=10,
325
  max_lines=15
326
  )
327
-
328
  gr.Markdown("""
329
- ### πŸ’‘ Tips:
330
- - Upload multiple files for batch processing
331
- - For images: ensure text is clear and well-lit
332
- - For PDFs: both text-based and scanned PDFs work
333
- - The AI will analyze visual content even if text extraction fails
334
  """)
335
-
336
  submit_btn.click(
337
  fn=process_documents,
338
  inputs=[file_input],
339
  outputs=[status_output, json_output, display_output]
340
  )
341
-
342
  def load_example():
 
343
  example_path = "example1.pdf"
344
  if os.path.exists(example_path):
345
- # Return list of file paths for multiple file input
346
  return [example_path]
347
- else:
348
- # If example doesn't exist, return empty list
349
- print(f"Warning: Example file '{example_path}' not found")
350
- return []
351
-
352
  example_btn.click(
353
  fn=load_example,
354
  inputs=None,
355
  outputs=file_input
356
  )
357
-
358
 
359
  return demo
360
 
@@ -365,4 +418,4 @@ if __name__ == "__main__":
365
  server_name="0.0.0.0",
366
  server_port=7860,
367
  share=False
368
- )
 
2
  import json
3
  import os
4
  from pathlib import Path
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ import traceback
7
+
8
  from PIL import Image
9
  import PyPDF2
10
+
11
+ # Open-source OCR + PDF rendering
12
+ import pytesseract
13
+ from pdf2image import convert_from_path
14
+
15
+ # Open-source model inference via Hugging Face
16
+ from huggingface_hub import InferenceClient
17
+
18
 
19
  # ==============================================================
20
+ # Extraction prompt (same schema you used; updated wording for OCR-first)
 
 
 
 
21
  # ==============================================================
22
+ EXTRACTION_PROMPT = """You are an expert shipping-document data extractor.
23
+ You will be given OCR/text extracted from shipping documents (PDFs/images/docs).
 
 
 
24
 
25
  Extract and structure the data as valid JSON only (no markdown, no commentary):
26
 
 
60
  }
61
 
62
  EXTRACTION RULES:
63
+ 1. Extract ALL product line items - create one inventory item per product line
64
+ 2. Parse dimensions: "2X6X14" β†’ pcsHeight=2, pcsWidth=6, pcsLength=14 (numbers only)
65
+ 3. BF = totalQuantity (if total board-feet is present)
66
  4. Convert BF to MBF: BF Γ· 1000
67
  5. customFields format: "Key||Value" (e.g., "Mill||Tolko")
68
  6. Look for: PO numbers, shipping info, quantities, product codes, dimensions
69
+ 7. If multiple documents, consolidate all items into one JSON
70
+ 8. Return null for missing fields
71
+ 9. attachments should list all provided filenames
 
72
 
73
  Return ONLY valid JSON matching this exact structure."""
74
 
75
 
76
+ # ==============================================================
77
+ # Utilities: JSON extraction/cleaning
78
+ # ==============================================================
79
+
80
+ def _strip_code_fences(s: str) -> str:
81
+ s = s.strip()
82
+ if s.startswith("```"):
83
+ # remove opening fence line
84
+ parts = s.split("\n", 1)
85
+ if len(parts) == 2:
86
+ s = parts[1]
87
+ if s.endswith("```"):
88
+ s = s[:-3]
89
+ return s.strip()
90
+
91
def _extract_first_json_object(s: str) -> str:
    """Pull the outermost JSON object out of a model response.

    Tolerates stray prose before/after the object and surrounding code
    fences. Raises ``json.JSONDecodeError`` when no ``{...}`` span exists.
    """
    cleaned = _strip_code_fences(s)

    # Heuristic: the outermost object runs from the first '{' to the last '}'.
    opening = cleaned.find("{")
    closing = cleaned.rfind("}")
    if opening == -1 or closing == -1 or closing <= opening:
        raise json.JSONDecodeError("No JSON object found in response", cleaned, 0)
    return cleaned[opening:closing + 1].strip()
104
+
105
+
106
+ # ==============================================================
107
+ # Text extraction: PDFs, images, docs
108
+ # ==============================================================
109
+
110
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the embedded text layer of a PDF (text-based PDFs only).

    Scanned PDFs typically produce an empty string here; the OCR fallback
    handles those. Any failure is reported as an error string instead of
    raising, so batch processing can continue.
    """
    try:
        with open(pdf_path, "rb") as handle:
            reader = PyPDF2.PdfReader(handle)
            chunks = []
            for page_no, page in enumerate(reader.pages, start=1):
                extracted = page.extract_text()
                if extracted:
                    chunks.append(f"\n--- Page {page_no} (PDF text) ---\n{extracted}")
            return "".join(chunks).strip()
    except Exception as e:
        return f"Error extracting PDF text: {str(e)}"
123
 
124
def ocr_image(image: Image.Image) -> str:
    """Run Tesseract OCR over a PIL image and return the recognized text.

    The image is normalized to RGB first; any failure (bad image,
    missing tesseract binary) is reported as an error string rather
    than raised.
    """
    try:
        rgb = image if image.mode == "RGB" else image.convert("RGB")
        return pytesseract.image_to_string(rgb)
    except Exception as e:
        return f"Error performing OCR on image: {str(e)}"
132
+
133
def extract_text_from_pdf_with_ocr(pdf_path: str, dpi: int = 250) -> str:
    """Extract text from a PDF, falling back to OCR for scanned documents.

    Strategy:
      1) Pull the embedded text layer via PyPDF2.
      2) If that looks empty or too short, rasterize each page with
         pdf2image and OCR it with Tesseract.

    ``dpi`` controls rasterization quality for the OCR pass.
    """
    embedded = extract_text_from_pdf(pdf_path)
    # Trust the embedded layer only when it is non-trivial and not an
    # error report from the extractor.
    looks_usable = (
        bool(embedded)
        and len(embedded) >= 50
        and "Error extracting PDF text" not in embedded
    )
    if looks_usable:
        return embedded

    # OCR fallback for scanned PDFs.
    try:
        rendered = convert_from_path(pdf_path, dpi=dpi)
        chunks = [
            f"\n--- Page {page_no} (OCR) ---\n{ocr_image(page_img)}"
            for page_no, page_img in enumerate(rendered, start=1)
        ]
        merged = "\n".join(chunks).strip()
        if merged:
            return merged
        return embedded or "No text extracted from PDF (OCR empty)"
    except Exception as e:
        # convert_from_path needs poppler; surface an actionable hint.
        return (
            f"Error rendering PDF for OCR: {str(e)}\n"
            f"Hint: On Hugging Face Spaces, add poppler-utils in packages.txt."
        )
160
+
161
def extract_text_from_docx(docx_path: str) -> str:
    """Extract paragraph text from a .docx file.

    python-docx is imported lazily so the app still runs without it;
    any failure (missing package, unreadable file) yields an error
    string instead of an exception.
    """
    try:
        import docx

        paragraphs = docx.Document(docx_path).paragraphs
        return "\n".join(p.text for p in paragraphs if p.text).strip()
    except Exception as e:
        return f"Error reading Word doc: {str(e)}"
169
+
170
 
171
def process_files_for_extraction(files: List[str]) -> Dict[str, Any]:
    """Extract text locally from every uploaded file (no remote upload).

    Dispatches on extension: PDFs go through the text+OCR pipeline,
    images through Tesseract OCR, txt/csv are read directly, and
    doc/docx through python-docx. Unknown extensions are noted rather
    than silently dropped.

    Returns a dict with:
      - "text_content": concatenated per-file text, each section headed
        by ``=== <filename> ===``
      - "attachments": filenames that existed on disk
      - "file_info": human-readable one-liners per file

    Missing paths are skipped; per-file errors are recorded in the text
    instead of aborting the batch.
    """
    processed_data: Dict[str, Any] = {
        "text_content": "",
        "attachments": [],
        "file_info": [],
    }

    if not files:
        return processed_data

    for file_path in files:
        if not os.path.exists(file_path):
            continue

        path = Path(file_path)  # parse the path once
        file_name = path.name
        file_ext = path.suffix.lower()

        processed_data["attachments"].append(file_name)
        processed_data["file_info"].append(f"File: {file_name} (Type: {file_ext})")

        try:
            if file_ext == ".pdf":
                text = extract_text_from_pdf_with_ocr(file_path)
                processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"

            elif file_ext in [".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"]:
                # Close the image handle promptly: PIL opens the file lazily
                # and the original leaked one descriptor per image.
                with Image.open(file_path) as img:
                    text = ocr_image(img)
                processed_data["text_content"] += f"\n\n=== {file_name} (OCR) ===\n{text}"

            elif file_ext in [".txt", ".csv"]:
                with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
                    processed_data["text_content"] += f"\n\n=== {file_name} ===\n{f.read()}"

            elif file_ext in [".doc", ".docx"]:
                text = extract_text_from_docx(file_path)
                processed_data["text_content"] += f"\n\n=== {file_name} ===\n{text}"

            else:
                processed_data["text_content"] += f"\n\n=== {file_name} ===\n[Unsupported file type: {file_ext}]"

        except Exception as e:
            # Best-effort batch: record the failure and keep going.
            processed_data["text_content"] += f"\n\n=== {file_name} ===\nError processing: {str(e)}"

    return processed_data
217
 
218
 
219
+ # ==============================================================
220
+ # Open-source model extraction via Hugging Face Inference API
221
+ # ==============================================================
222
+
223
def extract_with_hf_llm(
    processed_data: Dict[str, Any],
    model_id: Optional[str] = None,
) -> Dict[str, Any]:
    """Structure the extracted document text into JSON via an HF-hosted LLM.

    Environment knobs:
      - HF_TOKEN: optional Space Secret for better rate limits.
      - HF_MODEL: override the default instruct model without code edits.

    Returns a result dict: on success ``success=True`` plus "data",
    "raw_response" and "model"; on failure ``success=False`` plus
    "error" and mode-specific diagnostic keys.
    """
    try:
        token = os.getenv("HF_TOKEN", "").strip() or None
        chosen_model = (
            model_id
            or (os.getenv("HF_MODEL", "").strip() or None)
            or "Qwen/Qwen2.5-7B-Instruct"
        )

        client = InferenceClient(model=chosen_model, token=token)

        user_prompt = "".join([
            EXTRACTION_PROMPT,
            "\n\nDOCUMENT TEXT (OCR + extracted text):\n",
            processed_data.get("text_content", ""),
            "\n\nATTACHMENTS:\n",
            json.dumps(processed_data.get("attachments", [])),
            "\n\nReturn ONLY valid JSON.",
        ])

        completion = client.chat_completion(
            messages=[
                {"role": "system", "content": "You extract structured data and return strict JSON only."},
                {"role": "user", "content": user_prompt},
            ],
            temperature=0.1,
            max_tokens=3000,
        )

        raw = ""
        if completion and completion.choices:
            raw = completion.choices[0].message.content
        raw = (raw or "").strip()

        parsed = json.loads(_extract_first_json_object(raw))

        return {
            "success": True,
            "data": parsed,
            "raw_response": raw,
            "model": chosen_model,
        }

    except json.JSONDecodeError as e:
        return {
            "success": False,
            "error": f"JSON parsing error: {str(e)}",
            "raw_response": raw if "raw" in locals() else "",
            "suggestion": (
                "Model returned non-JSON or malformed JSON. "
                "Try again or switch HF_MODEL to a different instruct model."
            ),
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Extraction error: {str(e)}",
            "traceback": traceback.format_exc(),
        }
285
 
286
 
287
+ # ==============================================================
288
+ # Main Gradio function
289
+ # ==============================================================
290
+
291
def process_documents(files):
    """Gradio callback: run the OCR/text -> LLM pipeline over uploaded files.

    Returns a (status, json_string, preview) tuple matching the three
    output components of the UI.
    """
    if not files or len(files) == 0:
        return "❌ Error: Please upload at least one file", "{}", "No files provided"

    try:
        paths = [item.name if hasattr(item, "name") else item for item in files]
        status = f"📄 Processing {len(paths)} file(s)...\n"

        # Local extraction (PDF text + OCR).
        extracted = process_files_for_extraction(paths)
        status += f"✓ Files loaded: {', '.join(extracted['attachments'])}\n"
        status += "🧾 Extracting text (PDF text + OCR where needed)...\n"

        # Bail out early when extraction produced essentially nothing.
        usable_text = (extracted.get("text_content") or "").strip()
        if len(usable_text) < 30:
            msg = (
                "❌ No usable text could be extracted.\n"
                "If PDFs are scanned, ensure OCR dependencies are installed (tesseract-ocr + poppler-utils).\n"
            )
            return msg, "{}", msg

        # LLM structuring step.
        status += "🤖 Structuring to JSON with open-source model (HF Inference API)...\n"
        outcome = extract_with_hf_llm(extracted)

        if outcome.get("success"):
            pretty_json = json.dumps(outcome["data"], indent=2)
            status += f"✅ Extraction successful! Model: {outcome.get('model')}\n"
            return status, pretty_json, "=== EXTRACTED DATA ===\n\n" + pretty_json

        # Model/parsing failure: surface the error plus any diagnostics.
        failure = f"❌ Extraction failed:\n{outcome.get('error', 'Unknown error')}\n"
        if "suggestion" in outcome:
            failure += f"\n💡 {outcome['suggestion']}\n"
        if "traceback" in outcome:
            failure += f"\nDebug info:\n{outcome['traceback'][:800]}\n"

        raw = outcome.get("raw_response", "No response")
        return failure, "{}", f"Raw Response:\n{raw[:1500]}"

    except Exception as e:
        crash = f"❌ Unexpected error: {str(e)}\n{traceback.format_exc()[:800]}"
        return crash, "{}", crash
337
 
338
 
339
  # ==============================================================
340
+ # Gradio Interface (kept essentially the same)
341
  # ==============================================================
342
 
343
  def create_interface():
344
  with gr.Blocks(theme=gr.themes.Soft(), title="Document Data Extractor") as demo:
345
  gr.Markdown("""
346
  # πŸ“„ Shipping Document Data Extractor
347
+
348
+ Upload PDFs, images, Word docs, or text files to extract structured shipping data.
349
+
350
+ **Pipeline:** Local OCR/Text extraction β†’ Open-source LLM (HF Inference API) β†’ JSON
351
  **Supported formats:** PDF, JPG, PNG, DOCX, TXT, CSV
352
  """)
353
+
354
  with gr.Row():
355
  with gr.Column(scale=2):
356
  file_input = gr.File(
 
358
  file_count="multiple",
359
  file_types=[".pdf", ".jpg", ".jpeg", ".png", ".gif", ".bmp", ".txt", ".csv", ".doc", ".docx"]
360
  )
361
+
 
362
  gr.Markdown("**Try with example:**")
363
  example_btn = gr.Button("πŸ“„ Load Example PDF", size="sm", variant="secondary")
364
+
365
  submit_btn = gr.Button("πŸš€ Extract Data", variant="primary", size="lg")
366
+
367
  with gr.Column(scale=3):
368
  status_output = gr.Textbox(
369
  label="πŸ“Š Status",
370
  lines=4,
371
  max_lines=8
372
  )
373
+
374
  json_output = gr.Code(
375
  label="πŸ“‹ JSON Output (Copy this)",
376
  language="json",
377
  lines=15
378
  )
379
+
380
  display_output = gr.Textbox(
381
  label="πŸ‘οΈ Preview",
382
  lines=10,
383
  max_lines=15
384
  )
385
+
386
  gr.Markdown("""
387
+ ### πŸ’‘ Notes
388
+ - For scanned PDFs: OCR requires **tesseract-ocr** and **poppler-utils** (see packages.txt).
389
+ - For better throughput, set **HF_TOKEN** in Space Secrets.
390
+ - You can switch models by setting **HF_MODEL** (e.g., `mistralai/Mistral-7B-Instruct-v0.3`).
 
391
  """)
392
+
393
  submit_btn.click(
394
  fn=process_documents,
395
  inputs=[file_input],
396
  outputs=[status_output, json_output, display_output]
397
  )
398
+
399
  def load_example():
400
+ # In Spaces, example file should be in repo root
401
  example_path = "example1.pdf"
402
  if os.path.exists(example_path):
 
403
  return [example_path]
404
+ return []
405
+
 
 
 
406
  example_btn.click(
407
  fn=load_example,
408
  inputs=None,
409
  outputs=file_input
410
  )
 
411
 
412
  return demo
413
 
 
418
  server_name="0.0.0.0",
419
  server_port=7860,
420
  share=False
421
+ )