PDF_Upload_Vision

Sleeping

App Files Files Community

Seth0330 committed on Jun 6, 2025

Commit

2682cc6

verified ·

1 Parent(s): b3c1ec9

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -15

app.py CHANGED Viewed

@@ -4,8 +4,9 @@ import json
 import re
 import os
 import time
-from main import extract_key_phrases, score_sentences, summarize_text  # Optional, if you use these
 st.set_page_config(page_title="PDF Tools", layout="wide")
@@ -231,29 +232,42 @@ def extract_invoice_info(model_choice, text):
             itm.setdefault(k, None)
     return {"invoice_header": hdr, "line_items": items}
-# --------- UNSTRACT API PDF-TO-TEXT HELPER ---------
 UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
 UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")  # Set this in your environment!
-def extract_text_from_pdf_unstract(pdf_file):
     headers = {
         "unstract-key": UNSTRACT_API_KEY,
-        "Content-Type": "text/plain",  # Matches your working Postman code!
     }
-    pdf_bytes = pdf_file.read()
     url = f"{UNSTRACT_BASE}/whisper"
-    with st.spinner("Uploading and processing PDF with Unstract..."):
-        r = requests.post(url, headers=headers, data=pdf_bytes)
         if r.status_code != 202:
-            st.error(f"Unstract: Error uploading PDF: {r.status_code} - {r.text}")
             return None
         whisper_hash = r.json().get("whisper_hash")
         if not whisper_hash:
             st.error("Unstract: No whisper_hash received.")
             return None
-    # Step 2: Poll /whisper-status until processed
     status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
     for i in range(30):  # Wait up to 60s (2s x 30)
         status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
@@ -269,7 +283,6 @@ def extract_text_from_pdf_unstract(pdf_file):
         st.error("Unstract: Timeout waiting for OCR to finish.")
         return None
-    # Step 3: GET /whisper-retrieve?whisper_hash=...&text_only=true
     retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
     r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
     if r.status_code != 200:
@@ -282,14 +295,17 @@ def extract_text_from_pdf_unstract(pdf_file):
         return r.text
 # --------- INVOICE EXTRACTOR UI ---------
-st.title("Invoice Extractor")
 mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
-inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
 extracted_info = None
-if st.button("Extract") and inv_pdf:
-    with st.spinner("Extracting text from PDF using Unstract..."):
-        text = extract_text_from_pdf_unstract(inv_pdf)
     if text:
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info:

 import re
 import os
 import time
+import mimetypes
+from main import extract_key_phrases, score_sentences, summarize_text  # Optional
 st.set_page_config(page_title="PDF Tools", layout="wide")
             itm.setdefault(k, None)
     return {"invoice_header": hdr, "line_items": items}
+# --------- File type/content-type detection ---------
def get_content_type(filename):
    """Return the Content-Type header value to send when uploading *filename*.

    Only the file name is inspected; the file contents are never read.

    Args:
        filename: Name of the uploaded file (e.g. ``"invoice.pdf"``).

    Returns:
        ``"text/plain"`` for names ending in ``.pdf`` (case-insensitive),
        otherwise the MIME type guessed by ``mimetypes.guess_type``, or
        ``"application/octet-stream"`` when the name is empty/``None`` or no
        type can be guessed.
    """
    # A missing name carries no extension information; fall back to the
    # generic binary type instead of failing on ``None.lower()``.
    if not filename:
        return "application/octet-stream"
    # Special case for PDF (Unstract quirk): PDFs are deliberately sent as
    # text/plain. Require a real ".pdf" suffix so that an extension-less file
    # that merely happens to be named "pdf" is not mistaken for one.
    if filename.lower().endswith(".pdf"):
        return "text/plain"
    mime, _ = mimetypes.guess_type(filename)
    if mime is None:
        return "application/octet-stream"
    return mime
+# --------- UNSTRACT API Multi-file PDF/Doc/Image-to-Text ---------
 UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
 UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")  # Set this in your environment!
+def extract_text_from_unstract(uploaded_file):
+    filename = getattr(uploaded_file, "name", "uploaded_file")
+    file_bytes = uploaded_file.read()
+    content_type = get_content_type(filename)
     headers = {
         "unstract-key": UNSTRACT_API_KEY,
+        "Content-Type": content_type,
     }
     url = f"{UNSTRACT_BASE}/whisper"
+    with st.spinner("Uploading and processing document with Unstract..."):
+        r = requests.post(url, headers=headers, data=file_bytes)
         if r.status_code != 202:
+            st.error(f"Unstract: Error uploading file: {r.status_code} - {r.text}")
             return None
         whisper_hash = r.json().get("whisper_hash")
         if not whisper_hash:
             st.error("Unstract: No whisper_hash received.")
             return None
     status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
     for i in range(30):  # Wait up to 60s (2s x 30)
         status_r = requests.get(status_url, headers={"unstract-key": UNSTRACT_API_KEY})
         st.error("Unstract: Timeout waiting for OCR to finish.")
         return None
     retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
     r = requests.get(retrieve_url, headers={"unstract-key": UNSTRACT_API_KEY})
     if r.status_code != 200:
         return r.text
 # --------- INVOICE EXTRACTOR UI ---------
+st.title("Invoice/Document Extractor")
 mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
+inv_file = st.file_uploader(
+    "Invoice or Document File",
+    type=["pdf", "docx", "xlsx", "xls", "png", "jpg", "jpeg", "tiff"]
+)
 extracted_info = None
+if st.button("Extract") and inv_file:
+    with st.spinner("Extracting text from document using Unstract..."):
+        text = extract_text_from_unstract(inv_file)
     if text:
         extracted_info = extract_invoice_info(mdl, text)
         if extracted_info: