PDF_Upload_Vision

Sleeping

App Files Files Community

Seth0330 committed on Jun 6, 2025

Commit

0592d14

verified ·

1 Parent(s): 796b6f2

Update app.py

Browse files

Files changed (1) hide show

app.py +236 -80

app.py CHANGED Viewed

@@ -1,98 +1,254 @@
 import streamlit as st
 import requests
-import time
 import os
-# CONFIG
-UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY", "pktmL5lfqlVv7IWW_MYhdXRl399GA1n8vaLktHefxVY")
-BASE_URL = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
-def upload_pdf_to_unstract(pdf_file):
-    url = f"{BASE_URL}/whisper"
     headers = {
-        "unstract-key": UNSTRACT_API_KEY,
     }
-    # Always reset file pointer
-    pdf_file.seek(0)
-    file_bytes = pdf_file.read()
-    # Force correct .pdf extension and type
-    file_name = getattr(pdf_file, "name", None)
-    if not file_name or not file_name.lower().endswith(".pdf"):
-        file_name = "invoice.pdf"
-    files = {
-        "file": (file_name, file_bytes, "application/pdf"),
-    }
-    # Debug print for troubleshooting
-    # st.write("Uploading file with name:", file_name)
-    with st.spinner("Uploading and starting OCR..."):
-        resp = requests.post(url, headers=headers, files=files)
-    if resp.status_code not in (200, 202):
-        st.error(f"Upload failed: {resp.status_code}: {resp.text}")
         return None
-    data = resp.json()
-    whisper_hash = data.get("whisper_hash")
-    if not whisper_hash:
-        st.error(f"No whisper_hash in response: {data}")
-    return whisper_hash
-def poll_until_processed(whisper_hash, poll_interval=3, max_attempts=30):
-    status_url = f"{BASE_URL}/whisper-status?whisper_hash={whisper_hash}"
-    headers = {
-        "unstract-key": UNSTRACT_API_KEY,
-    }
-    with st.spinner("Processing PDF (OCR in progress)..."):
-        for i in range(max_attempts):
-            resp = requests.get(status_url, headers=headers)
-            if resp.status_code != 200:
-                st.error(f"Status check failed: {resp.status_code}: {resp.text}")
-                return False
-            status = resp.json().get("status")
-            if status == "processed":
-                return True
-            elif status in ("failed", "error"):
-                st.error(f"Processing failed: {resp.text}")
-                return False
-            time.sleep(poll_interval)
-    st.error("Timed out waiting for OCR to complete.")
-    return False
-def retrieve_text(whisper_hash):
-    retrieve_url = f"{BASE_URL}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
-    headers = {
-        "unstract-key": UNSTRACT_API_KEY,
-    }
-    with st.spinner("Retrieving extracted text..."):
-        resp = requests.get(retrieve_url, headers=headers)
-    if resp.status_code != 200:
-        st.error(f"Retrieve failed: {resp.status_code}: {resp.text}")
         return None
-    data = resp.json()
-    result_text = data.get("result_text", "")
-    return result_text
-st.title("Unstract OCR: PDF Invoice Text Extraction")
-uploaded_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")
-if st.button("Extract Text from PDF") and uploaded_pdf:
-    whisper_hash = upload_pdf_to_unstract(uploaded_pdf)
-    if not whisper_hash:
-        st.stop()
-    st.success(f"File accepted. Tracking hash: {whisper_hash}")
-    if poll_until_processed(whisper_hash):
-        text = retrieve_text(whisper_hash)
-        if text:
-            st.success("Text extraction complete!")
-            st.subheader("Extracted Text:")
-            st.text_area("Extracted Text", text, height=400)
-        else:
-            st.error("Extraction failed at retrieve step.")
     else:
-        st.error("OCR did not complete successfully.")
-st.caption("Powered by Unstract LLMWhisperer OCR API.")

 import streamlit as st
+import io
 import requests
+import json
+import re
 import os
+import time
+from main import extract_key_phrases, score_sentences, summarize_text  # read_pdf removed
+st.set_page_config(page_title="PDF Tools", layout="wide")
+# -------- LLM Model Setup (same as before) --------
+MODELS = {
+    "DeepSeek v3": {
+        "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-chat",
+        "key_env": "DEEPSEEK_API_KEY",
+        "response_format": {"type": "json_object"},
+    },
+    "DeepSeek R1": {
+        "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-reasoner",
+        "key_env": "DEEPSEEK_API_KEY",
+        "response_format": None,
+    },
+    "OpenAI GPT-4.1": {
+        "api_url": "https://api.openai.com/v1/chat/completions",
+        "model": "gpt-4-1106-preview",
+        "key_env": "OPENAI_API_KEY",
+        "response_format": None,
+        "extra_headers": {},
+    },
+    "Mistral Small": {
+        "api_url": "https://openrouter.ai/api/v1/chat/completions",
+        "model": "mistralai/mistral-small-3.1-24b-instruct:free",
+        "key_env": "OPENROUTER_API_KEY",
+        "response_format": {"type": "json_object"},
+        "extra_headers": {
+            "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
+    },
+}
+def get_api_key(model_choice):
+    key = os.getenv(MODELS[model_choice]["key_env"])
+    if not key:
+        st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
+        st.stop()
+    return key
+def query_llm(model_choice, prompt):
+    cfg = MODELS[model_choice]
     headers = {
+        "Authorization": f"Bearer {get_api_key(model_choice)}",
+        "Content-Type": "application/json",
+    }
+    if cfg.get("extra_headers"):
+        headers.update(cfg["extra_headers"])
+    payload = {
+        "model": cfg["model"],
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.1,
+        "max_tokens": 2000,
     }
+    if cfg.get("response_format"):
+        payload["response_format"] = cfg["response_format"]
+    try:
+        with st.spinner(f"🔍 Querying {model_choice}..."):
+            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
+        if r.status_code != 200:
+            if "No instances available" in r.text or r.status_code == 503:
+                st.error(f"{model_choice} is currently unavailable. Please try again later or select another model.")
+            else:
+                st.error(f"🚨 API Error {r.status_code}: {r.text}")
+            return None
+        content = r.json()["choices"][0]["message"]["content"]
+        st.session_state.last_api = content
+        st.session_state.last_raw = r.text
+        return content
+    except Exception as e:
+        st.error(f"Connection error: {e}")
+        return None
+def clean_json_response(text):
+    if not text:
+        return None
+    orig = text
+    text = re.sub(r'```(?:json)?', '', text).strip()
+    start, end = text.find('{'), text.rfind('}') + 1
+    if start < 0 or end < 1:
+        st.error("Couldn't locate JSON in response.")
+        st.code(orig)
+        return None
+    frag = text[start:end]
+    frag = re.sub(r',\s*([}\]])', r'\1', frag)
+    try:
+        return json.loads(frag)
+    except json.JSONDecodeError as e:
+        repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
+        try:
+            return json.loads(repaired)
+        except json.JSONDecodeError:
+            st.error(f"JSON parse error: {e}")
+            st.code(frag)
+            return None
+def fallback_supplier(text):
+    for line in text.splitlines():
+        line = line.strip()
+        if line:
+            return line
+    return None
+def get_extraction_prompt(model_choice, txt):
+    # (no change, reuse as before)
+    return (
+        # [--- omitted for brevity; keep as is ---]
+        "\nInvoice Text:\n"
+        f"{txt}"
+    )
+def extract_invoice_info(model_choice, text):
+    prompt = get_extraction_prompt(model_choice, text)
+    raw = query_llm(model_choice, prompt)
+    if not raw:
         return None
+    data = clean_json_response(raw)
+    if not data:
         return None
+    # (no change, reuse as before)
+    if model_choice.startswith("DeepSeek"):
+        header = {k: v for k, v in data.items() if k != "line_items"}
+        items = data.get("line_items", [])
+        if not isinstance(items, list):
+            items = []
+        for itm in items:
+            if not isinstance(itm, dict):
+                continue
+            for k in ("description","quantity","unit_price","total_price"):
+                itm.setdefault(k, None)
+        return {"invoice_header": header, "line_items": items}
+    hdr = data.get("invoice_header", {})
+    if not hdr and any(k in data for k in ("invoice_number","supplier_name","customer_name")):
+        hdr = data
+    for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
+        hdr.setdefault(k, None)
+    if not hdr.get("supplier_name"):
+        hdr["supplier_name"] = fallback_supplier(text)
+    items = data.get("line_items", [])
+    if not isinstance(items, list):
+        items = []
+    for itm in items:
+        if not isinstance(itm, dict):
+            continue
+        for k in ("item_number","description","quantity","unit_price","total_price"):
+            itm.setdefault(k, None)
+    return {"invoice_header": hdr, "line_items": items}
+# --------- UNSTRACT API PDF-TO-TEXT HELPER ---------
+UNSTRACT_BASE = "https://llmwhisperer-api.us-central.unstract.com/api/v2"
+UNSTRACT_API_KEY = os.getenv("UNSTRACT_API_KEY")  # Set this in your environment!
+def extract_text_from_pdf_unstract(pdf_file):
+    headers = {"unstract-key": UNSTRACT_API_KEY}
+    # Step 1: POST /whisper with the PDF
+    files = {"file": pdf_file}
+    whisper_url = f"{UNSTRACT_BASE}/whisper"
+    with st.spinner("Uploading and processing PDF with Unstract..."):
+        r = requests.post(whisper_url, files=files, headers=headers)
+        if r.status_code != 202:
+            st.error(f"Unstract: Error uploading PDF: {r.status_code} - {r.text}")
+            return None
+        whisper_hash = r.json().get("whisper_hash")
+        if not whisper_hash:
+            st.error("Unstract: No whisper_hash received.")
+            return None
+    # Step 2: Poll /whisper-status until processed
+    status_url = f"{UNSTRACT_BASE}/whisper-status?whisper_hash={whisper_hash}"
+    for i in range(30):  # Wait up to ~30 x 2 = 60 seconds
+        status_r = requests.get(status_url, headers=headers)
+        if status_r.status_code != 200:
+            st.error(f"Unstract: Error checking status: {status_r.status_code} - {status_r.text}")
+            return None
+        status = status_r.json().get("status")
+        if status == "processed":
+            break
+        st.info(f"Unstract status: {status or 'waiting'}... ({i+1})")
+        time.sleep(2)
     else:
+        st.error("Unstract: Timeout waiting for OCR to finish.")
+        return None
+    # Step 3: GET /whisper-retrieve?whisper_hash=...&text_only=true
+    retrieve_url = f"{UNSTRACT_BASE}/whisper-retrieve?whisper_hash={whisper_hash}&text_only=true"
+    r = requests.get(retrieve_url, headers=headers)
+    if r.status_code != 200:
+        st.error(f"Unstract: Error retrieving extracted text: {r.status_code} - {r.text}")
+        return None
+    return r.json().get("result_text") or r.text
+# --------- INVOICE EXTRACTOR UI ---------
+st.title("Invoice Extractor")
+mdl = st.selectbox("Model", list(MODELS.keys()), key="extract_model")
+inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
+extracted_info = None
+if st.button("Extract") and inv_pdf:
+    with st.spinner("Extracting text from PDF using Unstract..."):
+        text = extract_text_from_pdf_unstract(inv_pdf)
+    if text:
+        extracted_info = extract_invoice_info(mdl, text)
+        if extracted_info:
+            st.success("Extraction Complete")
+            st.subheader("Invoice Metadata")
+            st.table([{k.replace("_", " ").title(): v for k, v in extracted_info["invoice_header"].items()}])
+            st.subheader("Line Items")
+            st.table(extracted_info["line_items"])
+            st.session_state["last_extracted_info"] = extracted_info  # store in session
+# If we've already extracted info, or in this session, show further controls
+extracted_info = extracted_info or st.session_state.get("last_extracted_info", None)
+if extracted_info:
+    st.markdown("---")
+    st.subheader("📝 Fine-tune Extracted Data with Your Own Prompt")
+    user_prompt = st.text_area(
+        "Enter your prompt for further processing or transformation (the extracted JSON will be available as context).",
+        height=120,
+        key="custom_prompt"
+    )
+    model_2 = st.selectbox("Model for Fine-Tuning Prompt", list(MODELS.keys()), key="refine_model")
+    if st.button("Run Custom Prompt"):
+        refine_input = (
+            "Here is an extracted invoice in JSON format:\n"
+            f"{json.dumps(extracted_info, indent=2)}\n"
+            "Follow this instruction and return the result as a JSON object only (no explanation):\n"
+            f"{user_prompt}"
+        )
+        result = query_llm(model_2, refine_input)
+        refined_json = clean_json_response(result)
+        st.subheader("Fine-Tuned Output")
+        if refined_json:
+            st.json(refined_json)
+        else:
+            st.error("Could not parse a valid JSON output from the model.")
+    st.caption("The prompt is run on the above-extracted fields as JSON. Try instructions like: 'Add a new field for net_amount (amount minus tax) to each line item', or 'Summarize the total quantity ordered', etc.")
+if "last_api" in st.session_state:
+    with st.expander("Debug"):
+        st.code(st.session_state.last_api)
+        st.code(st.session_state.last_raw)