PDF_Upload_Vision

Sleeping

App Files Community

Seth0330 committed on Jun 6, 2025

Commit

e6cd773

verified ·

1 Parent(s): 0592d14

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -8

app.py CHANGED Viewed

@@ -6,7 +6,7 @@ import re
 import os
 import time
-from main import extract_key_phrases, score_sentences, summarize_text  # read_pdf removed
 st.set_page_config(page_title="PDF Tools", layout="wide")
@@ -114,9 +114,83 @@ def fallback_supplier(text):
     return None
 def get_extraction_prompt(model_choice, txt):
-    # (no change, reuse as before)
     return (
-        # [--- omitted for brevity; keep as is ---]
         "\nInvoice Text:\n"
         f"{txt}"
     )
@@ -130,7 +204,6 @@ def extract_invoice_info(model_choice, text):
     if not data:
         return None
-    # (no change, reuse as before)
     if model_choice.startswith("DeepSeek"):
         header = {k: v for k, v in data.items() if k != "line_items"}
         items = data.get("line_items", [])
@@ -165,8 +238,11 @@ UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")  # Set this in your environment
 def extract_text_from_pdf_unstract(pdf_file):
     headers = {"unstract-key": UNSTRACT_API_KEY}
-    # Step 1: POST /whisper with the PDF
-    files = {"file": pdf_file}
     whisper_url = f"{UNSTRACT_BASE}/whisper"
     with st.spinner("Uploading and processing PDF with Unstract..."):
         r = requests.post(whisper_url, files=files, headers=headers)
@@ -180,7 +256,7 @@ def extract_text_from_pdf_unstract(pdf_file):
     # Step 2: Poll /whisper-status until processed
     status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
-    for i in range(30):  # Wait up to ~30 x 2 = 60 seconds
         status_r = requests.get(status_url, headers=headers)
         if status_r.status_code != 200:
             st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
@@ -200,7 +276,12 @@ def extract_text_from_pdf_unstract(pdf_file):
     if r.status_code != 200:
         st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
         return None
-    return r.json().get("result_text") or r.text
 # --------- INVOICE EXTRACTOR UI ---------
 st.title("Invoice Extractor")

 import os
 import time
+from main import extract_key_phrases, score_sentences, summarize_text  # Only if still needed for later
 st.set_page_config(page_title="PDF Tools", layout="wide")
     return None
 def get_extraction_prompt(model_choice, txt):
     return (
+        "You are an expert invoice parser. "
+        "Extract data according to the visible table structure and column headers in the invoice. "
+        "For every line item, only extract fields that correspond to the table columns for that row (do not include header/shipment fields in line items). "
+        "Merge all multi-line content within a single cell into that field (especially for the 'description' and 'notes'). "
+        "Shipment/invoice-level fields such as CAR NUMBER, SHIPPING POINT, SHIPMENT NUMBER, CURRENCY, etc., must go ONLY into the 'invoice_header', not as line item fields.\n"
+        "Use this schema:\n"
+        '{\n'
+        '  "invoice_header": {\n'
+        '    "car_number": "string or null",\n'
+        '    "shipment_number": "string or null",\n'
+        '    "shipping_point": "string or null",\n'
+        '    "currency": "string or null",\n'
+        '    "invoice_number": "string or null",\n'
+        '    "invoice_date": "string or null",\n'
+        '    "order_number": "string or null",\n'
+        '    "customer_order_number": "string or null",\n'
+        '    "our_order_number": "string or null",\n'
+        '    "sales_order_number": "string or null",\n'
+        '    "purchase_order_number": "string or null",\n'
+        '    "order_date": "string or null",\n'
+        '    "supplier_name": "string or null",\n'
+        '    "supplier_address": "string or null",\n'
+        '    "supplier_phone": "string or null",\n'
+        '    "supplier_email": "string or null",\n'
+        '    "supplier_tax_id": "string or null",\n'
+        '    "customer_name": "string or null",\n'
+        '    "customer_address": "string or null",\n'
+        '    "customer_phone": "string or null",\n'
+        '    "customer_email": "string or null",\n'
+        '    "customer_tax_id": "string or null",\n'
+        '    "ship_to_name": "string or null",\n'
+        '    "ship_to_address": "string or null",\n'
+        '    "bill_to_name": "string or null",\n'
+        '    "bill_to_address": "string or null",\n'
+        '    "remit_to_name": "string or null",\n'
+        '    "remit_to_address": "string or null",\n'
+        '    "tax_id": "string or null",\n'
+        '    "tax_registration_number": "string or null",\n'
+        '    "vat_number": "string or null",\n'
+        '    "payment_terms": "string or null",\n'
+        '    "payment_method": "string or null",\n'
+        '    "payment_reference": "string or null",\n'
+        '    "bank_account_number": "string or null",\n'
+        '    "iban": "string or null",\n'
+        '    "swift_code": "string or null",\n'
+        '    "total_before_tax": "string or null",\n'
+        '    "tax_amount": "string or null",\n'
+        '    "tax_rate": "string or null",\n'
+        '    "shipping_charges": "string or null",\n'
+        '    "discount": "string or null",\n'
+        '    "total_due": "string or null",\n'
+        '    "amount_paid": "string or null",\n'
+        '    "balance_due": "string or null",\n'
+        '    "due_date": "string or null",\n'
+        '    "invoice_status": "string or null",\n'
+        '    "reference_number": "string or null",\n'
+        '    "project_code": "string or null",\n'
+        '    "department": "string or null",\n'
+        '    "contact_person": "string or null",\n'
+        '    "notes": "string or null",\n'
+        '    "additional_info": "string or null"\n'
+        '  },\n'
+        '  "line_items": [\n'
+        '    {\n'
+        '      "quantity": "string or null",\n'
+        '      "units": "string or null",\n'
+        '      "description": "string or null",\n'
+        '      "footage": "string or null",\n'
+        '      "price": "string or null",\n'
+        '      "amount": "string or null",\n'
+        '      "notes": "string or null"\n'
+        '    }\n'
+        '  ]\n'
+        '}'
+        "\nIf a field is missing for a line item or header, use null. "
+        "Do not invent fields. Do not add any header or shipment data to any line item. Return ONLY the JSON object, no explanation.\n"
         "\nInvoice Text:\n"
         f"{txt}"
     )
     if not data:
         return None
     if model_choice.startswith("DeepSeek"):
         header = {k: v for k, v in data.items() if k != "line_items"}
         items = data.get("line_items", [])
 def extract_text_from_pdf_unstract(pdf_file):
     headers = {"unstract-key": UNSTRACT_API_KEY}
+    pdf_bytes = pdf_file.read()
+    filename = pdf_file.name if hasattr(pdf_file, "name") else "uploaded.pdf"
+    files = {
+        "file": (filename, io.BytesIO(pdf_bytes), "application/pdf")
+    }
     whisper_url = f"{UNSTRACT_BASE}/whisper"
     with st.spinner("Uploading and processing PDF with Unstract..."):
         r = requests.post(whisper_url, files=files, headers=headers)
     # Step 2: Poll /whisper-status until processed
     status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
+    for i in range(30):  # Wait up to 60s (2s x 30)
         status_r = requests.get(status_url, headers=headers)
         if status_r.status_code != 200:
             st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
     if r.status_code != 200:
         st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
         return None
+    # Unstract sometimes returns JSON, sometimes raw text
+    try:
+        data = r.json()
+        return data.get("result_text") or r.text
+    except Exception:
+        return r.text
 # --------- INVOICE EXTRACTOR UI ---------
 st.title("Invoice Extractor")