PDF_Upload_Vision

Sleeping

App Files Files Community

Seth0330 committed on May 21, 2025

Commit

e7adc3a

verified ·

1 Parent(s): 835898b

Update app.py

Browse files

Files changed (1) hide show

app.py +88 -145

app.py CHANGED Viewed

@@ -4,34 +4,28 @@ import requests
 import json
 import re
 import os
-from datetime import datetime
 from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
-# Configure Streamlit
-st.set_page_config(
-    page_title="PDF Tools – Summarizer & Invoice Extractor",
-    layout="wide",
-)
-# Model configurations
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model_name": "deepseek-chat",
-        "api_key_env": "DEEPSEEK_API_KEY",
         "response_format": {"type": "json_object"},
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model_name": "deepseek-reasoner",
-        "api_key_env": "DEEPSEEK_API_KEY",
         "response_format": None,
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
-        "model_name": "meta-llama/llama-4-maverick:free",
-        "api_key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
@@ -40,8 +34,8 @@ MODELS = {
     },
     "Mistral Small": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
-        "model_name": "mistralai/mistral-small-3.1-24b-instruct:free",
-        "api_key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
@@ -51,9 +45,9 @@ MODELS = {
 }
 def get_api_key(model_choice):
-    key = os.environ.get(MODELS[model_choice]["api_key_env"])
     if not key:
-        st.error(f"❌ {MODELS[model_choice]['api_key_env']} not set")
         st.stop()
     return key
@@ -65,44 +59,44 @@ def query_llm(model_choice, prompt):
     }
     if cfg.get("extra_headers"):
         headers.update(cfg["extra_headers"])
     payload = {
-        "model": cfg["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
     if cfg.get("response_format"):
         payload["response_format"] = cfg["response_format"]
     try:
         with st.spinner(f"🔍 Querying {model_choice}..."):
-            resp = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
-        if resp.status_code != 200:
-            st.error(f"🚨 API Error {resp.status_code}: {resp.text}")
             return None
-        content = resp.json()["choices"][0]["message"]["content"]
-        st.session_state.last_api_response = content
-        st.session_state.last_api_raw = resp.text
         return content
     except Exception as e:
-        st.error(f"Connection failed: {e}")
         return None
 def clean_json_response(text):
     if not text:
         return None
-    original = text
-    # strip any ``` fences
     text = re.sub(r'```(?:json)?', '', text).strip()
-    # locate outermost JSON braces
     start = text.find('{')
     end = text.rfind('}') + 1
     if start < 0 or end < 1:
-        st.error("Couldn't locate JSON in response.")
-        st.code(original)
         return None
     fragment = text[start:end]
     try:
         return json.loads(fragment)
     except json.JSONDecodeError as e:
@@ -110,152 +104,101 @@ def clean_json_response(text):
         st.code(fragment)
         return None
-def get_extraction_prompt(model_choice, text):
-    if model_choice == "DeepSeek v3":
-        return (
-            "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
-            "{\n"
-            '  "invoice_number": "string",\n'
-            '  "invoice_date": "YYYY-MM-DD",\n'
-            '  "po_number": "string or null",\n'
-            '  "invoice_value": "string with currency symbol",\n'
-            '  "line_items": [\n'
-            "    { \"description\": \"string\", \"quantity\": \"number or string\", "
-            "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }\n"
-            "  ]\n"
-            "}\n"
-            "Rules:\n"
-            "1. Use null for missing fields\n"
-            "2. Do not include any extra text\n\n"
-            "Invoice Text:\n"
-            + text
-        )
-    elif model_choice == "DeepSeek R1":
         return (
-            "Please extract invoice info from the text below and return only raw json:\n"
-            "{ \"invoice_number\": \"string or null\", \"invoice_date\": \"YYYY-MM-DD or null\", "
-            "\"po_number\": \"string or null\", \"invoice_value\": \"string with currency or null\", "
-            "\"line_items\": [{ \"description\": \"string\", \"quantity\": \"number or string\", "
-            "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }] }\n"
-            "Invoice Text:\n"
-            + text
         )
-    else:  # Llama & Mistral
         return (
-            "You are given the text of an invoice. Extract the invoice information and return ONLY a valid json object "
-            "formatted exactly as below (nothing else):\n"
-            "{\n"
-            '  "invoice_header": {\n'
-            '    "invoice_number": "string",\n'
-            '    "invoice_date": "YYYY-MM-DD",\n'
-            '    "po_number": "string or null",\n'
-            '    "invoice_value": "string with currency symbol",\n'
-            '    "supplier_name": "string or null",\n'
-            '    "customer_name": "string or null"\n'
-            '  },\n'
-            '  "line_items": [\n'
-            '    {\n'
-            '      "item_number": "string or null",\n'
-            '      "description": "string",\n'
-            '      "quantity": number,\n'
-            '      "unit_price": "string with currency symbol",\n'
-            '      "total_price": "string with currency symbol"\n'
-            '    }\n'
-            '  ]\n'
-            "}\n"
-            "Rules:\n"
-            "1. Date: YYYY-MM-DD\n"
-            "2. Use null for missing values\n"
-            "3. Currency values must include a symbol or code\n"
-            "4. No extra keys or explanatory text\n"
-            "5. Output must start with '{' and end with '}'\n\n"
-            "Invoice Text:\n"
-            + text
         )
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
     raw = query_llm(model_choice, prompt)
-    if raw is None:
         return None
-    if not raw.strip():
-        st.error("Empty response from API.")
-        st.code(st.session_state.last_api_raw)
-        return None
     data = clean_json_response(raw)
     if not data:
         return None
-    # normalize fields
-    if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-        hdr = data.setdefault("invoice_header", {})
-        for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
-            hdr.setdefault(k, None)
-        items = data.setdefault("line_items", [])
         for itm in items:
-            for k in ["item_number", "description", "quantity", "unit_price", "total_price"]:
-                itm.setdefault(k, None)
     else:
-        for k in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
-            data.setdefault(k, None)
-        items = data.setdefault("line_items", [])
         for itm in items:
-            for k in ["description", "quantity", "unit_price", "total_price"]:
-                itm.setdefault(k, None)
     return data
-# ---- UI ----
-tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
-    st.title("PDF to Bullet-Point Summarizer")
-    pdf = st.file_uploader("Upload PDF", type="pdf")
-    pct = st.slider("Summarization (%)", 1, 100, 20)
     if st.button("Summarize") and pdf:
         txt = read_pdf(io.BytesIO(pdf.getvalue()))
         keys = extract_key_phrases(txt)
-        scores = score_sentences(txt, keys)
-        n = max(1, len(scores) * pct // 100)
-        summary = summarize_text(scores, num_points=n)
-        st.subheader("Summary")
-        st.markdown(summary)
 with tab2:
     st.title("Invoice Extractor")
-    mdl = st.selectbox("Model", list(MODELS.keys()))
-    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
     if st.button("Extract") and inv_pdf:
         txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
-        info = extract_invoice_info(mdl, txt)
         if info:
-            st.success("Extraction Complete")
-            if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
-                h = info["invoice_header"]
-                c1, c2, c3 = st.columns(3)
-                c1.metric("Invoice #", h["invoice_number"])
-                c1.metric("Supplier", h["supplier_name"])
-                c2.metric("Date", h["invoice_date"])
-                c2.metric("Customer", h["customer_name"])
-                c3.metric("PO #", h["po_number"])
-                c3.metric("Total", h["invoice_value"])
-                st.subheader("Line Items")
                 st.table(info["line_items"])
             else:
-                c1, c2 = st.columns(2)
-                c1.metric("Invoice #", info["invoice_number"])
-                c1.metric("PO #", info["po_number"])
-                c2.metric("Date", info["invoice_date"])
-                c2.metric("Value", info["invoice_value"])
-                st.subheader("Line Items")
                 st.table(info["line_items"])
-    if "last_api_response" in st.session_state:
         with st.expander("Debug"):
-            st.write("Raw assistant content:")
-            st.code(st.session_state.last_api_response)
-            st.write("Full HTTP response:")
-            st.code(st.session_state.last_api_raw)

 import json
 import re
 import os
 from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
+st.set_page_config(page_title="PDF Tools", layout="wide")
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-chat",
+        "key_env": "DEEPSEEK_API_KEY",
         "response_format": {"type": "json_object"},
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-reasoner",
+        "key_env": "DEEPSEEK_API_KEY",
         "response_format": None,
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
+        "model": "meta-llama/llama-4-maverick:free",
+        "key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
     },
     "Mistral Small": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
+        "model": "mistralai/mistral-small-3.1-24b-instruct:free",
+        "key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
 }
 def get_api_key(model_choice):
+    key = os.getenv(MODELS[model_choice]["key_env"])
     if not key:
+        st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
         st.stop()
     return key
     }
     if cfg.get("extra_headers"):
         headers.update(cfg["extra_headers"])
     payload = {
+        "model": cfg["model"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
     if cfg.get("response_format"):
         payload["response_format"] = cfg["response_format"]
     try:
         with st.spinner(f"🔍 Querying {model_choice}..."):
+            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
+        if r.status_code != 200:
+            st.error(f"🚨 API Error {r.status_code}: {r.text}")
             return None
+        content = r.json()["choices"][0]["message"]["content"]
+        st.session_state.last_api = content
+        st.session_state.last_raw = r.text
         return content
     except Exception as e:
+        st.error(f"Connection error: {e}")
         return None
 def clean_json_response(text):
     if not text:
         return None
+    orig = text
+    # strip fences
     text = re.sub(r'```(?:json)?', '', text).strip()
+    # grab braces
     start = text.find('{')
     end = text.rfind('}') + 1
     if start < 0 or end < 1:
+        st.error("Couldn't locate JSON")
+        st.code(orig)
         return None
     fragment = text[start:end]
+    # remove stray trailing commas before } or ]
+    fragment = re.sub(r',\s*([}\]])', r'\1', fragment)
     try:
         return json.loads(fragment)
     except json.JSONDecodeError as e:
         st.code(fragment)
         return None
def fallback_supplier(text):
    """Guess a supplier name as the first non-blank line of the invoice text.

    This is a heuristic only: it returns whatever text appears first in the
    document, with surrounding whitespace removed.

    Args:
        text: Raw invoice text. May be empty or None.

    Returns:
        The first line that is non-empty after stripping, or None when the
        text is missing or contains only whitespace.
    """
    if not text:
        return None
    # Stop at the first hit instead of stripping every line of the document.
    return next(
        (stripped for stripped in map(str.strip, text.splitlines()) if stripped),
        None,
    )
def get_extraction_prompt(model_choice, txt):
    """Build the invoice-extraction prompt for the selected model.

    Models whose name starts with "DeepSeek" are asked for a flat object;
    every other model is asked for an object with a nested
    ``invoice_header``. Both prompts request compact single-line json and
    end with the invoice text.

    Args:
        model_choice: Display name of the selected model.
        txt: Invoice text appended to the end of the prompt.

    Returns:
        The complete prompt string.
    """
    wants_flat_schema = model_choice.startswith("DeepSeek")
    if wants_flat_schema:
        instruction = (
            "Extract full invoice info below and RETURN ONLY a valid json object "
            "(compact, single line) with these fields:\n"
        )
        schema = (
            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD","po_number":"string|null",'
            '"invoice_value":"string with currency","line_items":[{"description":"string","quantity":"number",'
            '"unit_price":"string with currency","total_price":"string with currency"}]}\n'
        )
        rules = "Use null for missing fields. NO extra text.\n\n"
    else:
        instruction = (
            "You are given invoice text. Extract data and RETURN ONLY a compact json object "
            "(one line) exactly like this:\n"
        )
        schema = (
            '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
            '"po_number":"string|null","invoice_value":"string with currency",'
            '"supplier_name":"string|null","customer_name":"string|null"},'
            '"line_items":[{"item_number":"string|null","description":"string","quantity":number,'
            '"unit_price":"string with currency","total_price":"string with currency"}]}\n'
        )
        rules = "Use null for missing. NO explanations or extra keys.\n\n"
    return f"{instruction}{schema}{rules}Invoice Text:\n{txt}"
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
     raw = query_llm(model_choice, prompt)
+    if not raw:
         return None
     data = clean_json_response(raw)
     if not data:
         return None
+    # normalize header + fallback supplier
+    if model_choice in ("Llama 4 Mavericks","Mistral Small"):
+        hdr = data.setdefault("invoice_header",{})
+        for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
+            hdr.setdefault(k,None)
+        if not hdr.get("supplier_name"):
+            hdr["supplier_name"] = fallback_supplier(text)
+        items = data.setdefault("line_items",[])
         for itm in items:
+            for k in ("item_number","description","quantity","unit_price","total_price"):
+                itm.setdefault(k,None)
     else:
+        for k in ("invoice_number","invoice_date","po_number","invoice_value"):
+            data.setdefault(k,None)
+        items = data.setdefault("line_items",[])
         for itm in items:
+            for k in ("description","quantity","unit_price","total_price"):
+                itm.setdefault(k,None)
     return data
# UI: two tabs, one for summarising a PDF and one for extracting invoice data.
tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])

with tab1:
    st.title("PDF → Bullet-Point Summarizer")
    pdf = st.file_uploader("Upload PDF", type="pdf")
    pct = st.slider("Summarization %", 1, 100, 20)
    # The button is evaluated first so the widget is always rendered.
    if st.button("Summarize") and pdf:
        pdf_bytes = io.BytesIO(pdf.getvalue())
        txt = read_pdf(pdf_bytes)
        keys = extract_key_phrases(txt)
        scores = score_sentences(txt, keys)
        # Keep at least one point regardless of the chosen percentage.
        n = max(1, len(scores) * pct // 100)
        summary = summarize_text(scores, num_points=n)
        st.markdown(summary)

with tab2:
    st.title("Invoice Extractor")
    mdl = st.selectbox("Model", list(MODELS))
    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
    if st.button("Extract") and inv_pdf:
        invoice_bytes = io.BytesIO(inv_pdf.getvalue())
        txt = read_pdf(invoice_bytes)
        info = extract_invoice_info(mdl, txt)
        if info:
            st.success("Done")
            if mdl in ("Llama 4 Mavericks", "Mistral Small"):
                # These models return header fields under "invoice_header".
                h = info["invoice_header"]
                c1, c2, c3 = st.columns(3)
                c1.metric("Invoice #", h["invoice_number"])
                c1.metric("Supplier", h["supplier_name"])
                c2.metric("Date", h["invoice_date"])
                c2.metric("Customer", h["customer_name"])
                c3.metric("PO #", h["po_number"])
                c3.metric("Total", h["invoice_value"])
            else:
                # Other models return header fields at the top level.
                c1, c2 = st.columns(2)
                c1.metric("Invoice #", info["invoice_number"])
                c1.metric("PO #", info["po_number"])
                c2.metric("Date", info["invoice_date"])
                c2.metric("Value", info["invoice_value"])
            st.table(info["line_items"])
    # Show the last stored model output and raw HTTP body, if any.
    if "last_api" in st.session_state:
        with st.expander("Debug"):
            st.code(st.session_state.last_api)
            st.code(st.session_state.last_raw)