PDF_Upload

Sleeping

App Files Files Community

Seth0330 committed on May 21, 2025

Commit

7e5da41

verified ·

1 Parent(s): 0ce55d2

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -99

app.py CHANGED Viewed

@@ -20,13 +20,13 @@ MODELS = {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
         "model_name": "deepseek-chat",
         "api_key_env": "DEEPSEEK_API_KEY",
-        "response_format": {"type": "json_object"}
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
         "model_name": "deepseek-reasoner",
         "api_key_env": "DEEPSEEK_API_KEY",
-        "response_format": None
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
@@ -35,8 +35,8 @@ MODELS = {
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
-            "X-Title": "Invoice Extractor"
-        }
     },
     "Mistral Small": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
@@ -45,160 +45,197 @@ MODELS = {
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
-            "X-Title": "Invoice Extractor"
-        }
-    }
 }
 # Removed (pre-change) side of the diff.
 # Returns the API key for `model_choice`, read from the environment variable
 # named by MODELS[model_choice]["api_key_env"].
 def get_api_key(model_choice):
-    api_key = os.environ.get(MODELS[model_choice]["api_key_env"])
     # An unset or empty variable is reported in the UI and st.stop() is called.
-    if not api_key:
-        st.error(f"❌ {MODELS[model_choice]['api_key_env']} environment variable not set!")
         st.stop()
-    return api_key
 # Removed (pre-change) side of the diff.
 # POSTs `prompt` as a single user message to the endpoint configured for
 # `model_choice` and returns the assistant message content as a string.
 # Returns None on a non-200 status or on a requests-level exception.
 def query_llm(model_choice, prompt):
-    config = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
     # Per-model extra headers are merged in only when the config has a truthy value.
-    if config.get("extra_headers"):
-        headers.update(config["extra_headers"])
     payload = {
-        "model": config["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
     # A response_format of None (or a missing key) is left out of the payload.
-    if config.get("response_format"):
-        payload["response_format"] = config["response_format"]
     try:
         # In this version the status check, body parsing and session_state
         # writes all run inside the spinner context.
-        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
-            resp = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
-            if resp.status_code != 200:
-                st.error(f"🚨 API Error {resp.status_code}: {resp.text}")
-                return None
-            content = resp.json()["choices"][0]["message"]["content"]
             # Both the extracted content and the raw body are kept for the debug expander.
-            st.session_state.last_api_response = content
-            st.session_state.last_api_response_raw = resp.text
-            return content
     # NOTE(review): only RequestException is handled here; a KeyError/IndexError
     # from an unexpected response shape on the indexing line above would propagate.
-    except requests.exceptions.RequestException as e:
-        st.error(f"🌐 Connection Failed: {e}")
         return None
 # Removed (pre-change) side of the diff.
 # Returns the parsed JSON value found between the first '{' and the last '}'
 # of `text`, or None when `text` is falsy, no braces are found, or parsing fails.
 def clean_json_response(text):
-    """Strip code fences and extract a valid JSON segment."""
     if not text:
         return None
     # Keep the untouched input so it can be shown if no JSON is located.
     original = text
-    # Remove any ``` or ```json fences
-    text = re.sub(r'```(?:json)?', '', text)
-    text = text.strip()
-    # Find the JSON object boundaries
     start = text.find('{')
     # rfind returns -1 when '}' is absent, so end == 0 means "not found".
     end = text.rfind('}') + 1
-    if start == -1 or end == 0:
-        st.error("Failed to locate JSON in the response.")
         st.code(original)
         return None
-    json_str = text[start:end]
     try:
-        return json.loads(json_str)
     except json.JSONDecodeError as e:
         # On a parse failure the extracted substring (not the full text) is displayed.
-        st.error(f"JSON decode error: {e}")
-        st.code(json_str)
         return None
 # Removed (pre-change) side of the diff.
 # Selects a prompt by model name. In this version every branch returns the
 # literal placeholder "..." and the `text` argument is never used.
 def get_extraction_prompt(model_choice, text):
-    # (Prompts abbreviated here for readability—use your existing prompt definitions)
     if model_choice == "DeepSeek v3":
-        return "..."  # your DeepSeek v3 prompt
     elif model_choice == "DeepSeek R1":
-        return "..."  # your DeepSeek R1 prompt
-    else:
-        return "..."  # generic Llama/Mistral prompt
 # Removed (pre-change) side of the diff.
 # Builds the prompt, queries the model, parses the reply with
 # clean_json_response, and fills in missing keys with None.
 # Returns the parsed dict, or None when the query or the parse yields a falsy value.
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
-    result = query_llm(model_choice, prompt)
     # `not result` treats both None and an empty string as failure.
-    if not result:
         return None
-    data = clean_json_response(result)
     # An empty object ({}) is falsy and is therefore also reported as None.
     if not data:
         return None
-    # Normalize structure
     # These two models use a nested layout: "invoice_header" plus "line_items".
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-        header = data.setdefault("invoice_header", {})
-        for key in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
-            header.setdefault(key, None)
         items = data.setdefault("line_items", [])
-        for item in items:
-            for key in ["item_number", "description", "quantity", "unit_price", "total_price"]:
-                item.setdefault(key, None)
     # All other models use a flat layout with the header fields at the top level.
     else:
-        for key in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
-            data.setdefault(key, None)
         items = data.setdefault("line_items", [])
-        for item in items:
-            for key in ["description", "quantity", "unit_price", "total_price"]:
-                item.setdefault(key, None)
     return data
-# ---- UI Layout ----
 # Removed (pre-change) side of the diff: top-level Streamlit script with two tabs.
 # read_pdf, extract_key_phrases, score_sentences and summarize_text are called
 # here but are not defined in the visible part of the diff.
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
-    st.title("PDF to Bullet Point Summarizer 🗟")
-    pdf_file = st.file_uploader("Upload PDF", type="pdf")
-    scale = st.slider("Summarization extent (%)", 1, 100, 20)
     # The branch runs only when the button result is truthy and a file is present.
-    if st.button("Generate Summary") and pdf_file:
-        text = read_pdf(io.BytesIO(pdf_file.getvalue()))
-        phrases = extract_key_phrases(text)
-        scores = score_sentences(text, phrases)
         # Slider percentage of len(scores), floor-divided, with a minimum of 1.
-        count = max(1, len(scores) * scale // 100)
-        summary = summarize_text(scores, num_points=count)
-        st.subheader("Summary:")
-        st.markdown(summary)
 with tab2:
-    st.title("📋 Invoice Extractor from PDF")
-    model_choice = st.selectbox("Select AI Model", list(MODELS.keys()))
-    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")
-    if st.button("Extract Invoice Information") and invoice_pdf:
-        invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))
-        invoice_data = extract_invoice_info(model_choice, invoice_text)
-        if invoice_data:
-            st.success("Extraction Complete!")
             # Nested layout: header fields are read from invoice_data["invoice_header"].
-            if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-                hdr = invoice_data["invoice_header"]
                 c1, c2, c3 = st.columns(3)
-                c1.metric("Invoice #", hdr.get("invoice_number"))
-                c1.metric("Supplier", hdr.get("supplier_name"))
-                c2.metric("Date", hdr.get("invoice_date"))
-                c2.metric("Customer", hdr.get("customer_name"))
-                c3.metric("PO #", hdr.get("po_number"))
-                c3.metric("Total", hdr.get("invoice_value"))
                 st.subheader("Line Items")
-                st.table(invoice_data["line_items"])
             # Flat layout: header fields are read from the top level of invoice_data.
             else:
                 c1, c2 = st.columns(2)
-                c1.metric("Invoice #", invoice_data.get("invoice_number"))
-                c1.metric("PO #", invoice_data.get("po_number"))
-                c2.metric("Date", invoice_data.get("invoice_date"))
-                c2.metric("Value", invoice_data.get("invoice_value"))
                 st.subheader("Line Items")
-                st.table(invoice_data["line_items"])
     # The debug expander is shown whenever session_state holds "last_api_response".
     if "last_api_response" in st.session_state:
-        with st.expander("Debug Information"):
-            st.write("Extracted content (raw string):")
             st.code(st.session_state.last_api_response)
-            st.write("Full HTTP response text:")
             # The raw body is read with a default, so a missing key shows "No response".
-            st.code(st.session_state.get("last_api_response_raw", "No response"))

         "api_url": "https://api.deepseek.com/v1/chat/completions",
         "model_name": "deepseek-chat",
         "api_key_env": "DEEPSEEK_API_KEY",
+        "response_format": {"type": "json_object"},
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
         "model_name": "deepseek-reasoner",
         "api_key_env": "DEEPSEEK_API_KEY",
+        "response_format": None,
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
     },
     "Mistral Small": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
+    },
 }
 # Added (post-change) side of the diff.
 # Returns the API key for `model_choice`, read from the environment variable
 # named by MODELS[model_choice]["api_key_env"].
 def get_api_key(model_choice):
+    key = os.environ.get(MODELS[model_choice]["api_key_env"])
     # An unset or empty variable is reported in the UI and st.stop() is called.
+    if not key:
+        st.error(f"❌ {MODELS[model_choice]['api_key_env']} not set")
         st.stop()
+    return key
 # Added (post-change) side of the diff.
 # POSTs `prompt` as a single user message to the endpoint configured for
 # `model_choice` and returns the assistant message content as a string.
 # Returns None on a non-200 status or when any exception is raised in the try body.
 def query_llm(model_choice, prompt):
+    cfg = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
     # Per-model extra headers are merged in only when the config has a truthy value.
+    if cfg.get("extra_headers"):
+        headers.update(cfg["extra_headers"])
     payload = {
+        "model": cfg["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
     # A response_format of None (or a missing key) is left out of the payload.
+    if cfg.get("response_format"):
+        payload["response_format"] = cfg["response_format"]
     try:
         # Only the HTTP call runs inside the spinner; the status check and
         # parsing below happen after the spinner context has exited.
+        with st.spinner(f"🔍 Querying {model_choice}..."):
+            resp = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
+        if resp.status_code != 200:
+            st.error(f"🚨 API Error {resp.status_code}: {resp.text}")
+            return None
+        content = resp.json()["choices"][0]["message"]["content"]
         # Both values are stored on success only; an error path leaves any
         # earlier values in session_state unchanged.
+        st.session_state.last_api_response = content
+        st.session_state.last_api_raw = resp.text
+        return content
     # NOTE(review): this handler also catches errors from parsing/indexing the
     # response body above, yet reports every failure as "Connection failed".
+    except Exception as e:
+        st.error(f"Connection failed: {e}")
         return None
 # Added (post-change) side of the diff.
 # Returns the parsed JSON value found between the first '{' and the last '}'
 # of `text`, or None when `text` is falsy, no braces are found, or parsing fails.
 def clean_json_response(text):
     if not text:
         return None
     # Keep the untouched input so it can be shown if no JSON is located.
     original = text
+    # remove any ``` fences
+    text = re.sub(r'```(?:json)?', '', text).strip()
+    # find outer braces
     start = text.find('{')
     # rfind returns -1 when '}' is absent, so end < 1 means "not found".
     end = text.rfind('}') + 1
+    if start < 0 or end < 1:
+        st.error("Couldn't locate JSON in response.")
         st.code(original)
         return None
     # If the last '}' precedes the first '{', this slice is empty and the
     # json.loads call below raises JSONDecodeError, which is handled.
+    fragment = text[start:end]
     try:
+        return json.loads(fragment)
     except json.JSONDecodeError as e:
         # On a parse failure the extracted substring (not the full text) is displayed.
+        st.error(f"JSON parse error: {e}")
+        st.code(fragment)
         return None
 # Added (post-change) side of the diff.
 # Returns the extraction prompt for `model_choice` with the invoice `text`
 # appended after an "Invoice Text:" marker. "DeepSeek v3" and "DeepSeek R1"
 # have dedicated prompts; every other model name gets the nested-layout prompt.
 def get_extraction_prompt(model_choice, text):
+    # NOTE: every prompt below includes the word "json" in lowercase
     # NOTE(review): presumably required by the providers' json_object response
     # mode configured in MODELS; confirm against the provider documentation.
     if model_choice == "DeepSeek v3":
         # Flat layout: header fields at the top level plus a "line_items" list.
         # NOTE(review): the line-item schema is the literal placeholder {...}, so
         # the item field names filled in by extract_invoice_info are not named here.
+        return (
+            "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
+            "{\n"
+            '  "invoice_number": "string",\n'
+            '  "invoice_date": "YYYY-MM-DD",\n'
+            '  "po_number": "string or null",\n'
+            '  "invoice_value": "string with currency symbol",\n'
+            '  "line_items": [\n'
+            "    {...}\n"
+            "  ]\n"
+            "}\n"
+            "Rules:\n"
+            "1. Use null for missing fields\n"
+            "2. Do not include any additional text\n\n"
+            "Invoice Text:\n"
+            + text
+        )
     elif model_choice == "DeepSeek R1":
         # NOTE(review): the whole schema is the literal placeholder {...} in this prompt.
+        return (
+            "Please extract invoice info from the text below and return only raw json:\n"
+            "{...}\n"
+            "Invoice Text:\n"
+            + text
+        )
+    else:  # Llama / Mistral
         # Nested layout: "invoice_header" and "line_items", both shown as placeholders.
+        return (
+            "Extract complete invoice information and return a valid json object with these fields:\n"
+            "{\n"
+            '  "invoice_header": {...},\n'
+            '  "line_items": [...]\n'
+            "}\n"
+            "Rules:\n"
+            "1. Return ONLY json\n"
+            "2. Date format YYYY-MM-DD\n"
+            "3. Currency values with symbol\n"
+            "4. Do not include any explanations\n\n"
+            "Invoice Text:\n"
+            + text
+        )
 # Added (post-change) side of the diff.
 # Builds the prompt, queries the model, parses the reply with
 # clean_json_response, and fills in missing keys with None.
 # Returns the parsed dict, or None when the query fails, the reply is blank,
 # or the parse yields a falsy value.
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
+    raw = query_llm(model_choice, prompt)
     # None means query_llm has already reported the failure.
+    if raw is None:
         return None
     # A whitespace-only reply gets its own message plus the raw HTTP body,
     # which query_llm stored in session_state before returning.
+    if not raw.strip():
+        st.error("Empty response from API.")
+        st.code(st.session_state.last_api_raw)
+        return None
+    data = clean_json_response(raw)
     # An empty object ({}) is falsy and is therefore also reported as None.
     if not data:
         return None
+    # normalize
     # NOTE(review): setdefault only fills absent keys; an explicit null for
     # "invoice_header" or "line_items" would be kept as None and break the
     # calls and loops below. Confirm whether the models can return that.
     # These two models use a nested layout: "invoice_header" plus "line_items".
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
+        hdr = data.setdefault("invoice_header", {})
+        for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
+            hdr.setdefault(k, None)
         items = data.setdefault("line_items", [])
+        for itm in items:
+            for k in ["item_number", "description", "quantity", "unit_price", "total_price"]:
+                itm.setdefault(k, None)
     # All other models use a flat layout with the header fields at the top level.
     else:
+        for k in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
+            data.setdefault(k, None)
         items = data.setdefault("line_items", [])
+        for itm in items:
+            for k in ["description", "quantity", "unit_price", "total_price"]:
+                itm.setdefault(k, None)
     return data
+# ---- UI ----
 # Added (post-change) side of the diff: top-level Streamlit script with two tabs.
 # read_pdf, extract_key_phrases, score_sentences and summarize_text are called
 # here but are not defined in the visible part of the diff.
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
+    st.title("PDF to Bullet Point Summarizer")
+    pdf = st.file_uploader("Upload PDF", type="pdf")
+    pct = st.slider("Summarization (%)", 1, 100, 20)
     # The branch runs only when the button result is truthy and a file is present.
+    if st.button("Summarize") and pdf:
+        txt = read_pdf(io.BytesIO(pdf.getvalue()))
+        keys = extract_key_phrases(txt)
+        scores = score_sentences(txt, keys)
         # Slider percentage of len(scores), floor-divided, with a minimum of 1.
+        n = max(1, len(scores) * pct // 100)
+        bullet = summarize_text(scores, num_points=n)
+        st.subheader("Summary")
+        st.markdown(bullet)
 with tab2:
+    st.title("Invoice Extractor")
+    mdl = st.selectbox("Model", list(MODELS.keys()))
+    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
+    if st.button("Extract") and inv_pdf:
+        txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
+        info = extract_invoice_info(mdl, txt)
+        if info:
+            st.success("Done")
             # Direct indexing below relies on extract_invoice_info having filled
             # every header key via setdefault.
+            if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
+                h = info["invoice_header"]
                 c1, c2, c3 = st.columns(3)
+                c1.metric("Invoice #", h["invoice_number"])
+                c1.metric("Supplier", h["supplier_name"])
+                c2.metric("Date", h["invoice_date"])
+                c2.metric("Customer", h["customer_name"])
+                c3.metric("PO #", h["po_number"])
+                c3.metric("Total", h["invoice_value"])
                 st.subheader("Line Items")
+                st.table(info["line_items"])
             # Flat layout: header fields are read from the top level of info.
             else:
                 c1, c2 = st.columns(2)
+                c1.metric("Invoice #", info["invoice_number"])
+                c1.metric("PO #", info["po_number"])
+                c2.metric("Date", info["invoice_date"])
+                c2.metric("Value", info["invoice_value"])
                 st.subheader("Line Items")
+                st.table(info["line_items"])
     # The debug expander is shown whenever session_state holds "last_api_response".
     if "last_api_response" in st.session_state:
+        with st.expander("Debug"):
+            st.write("Raw assistant content:")
             st.code(st.session_state.last_api_response)
+            st.write("Full HTTP response:")
             # last_api_raw is read without a default; query_llm writes it right
             # after last_api_response, so the guard above covers it as well.
+            st.code(st.session_state.last_api_raw)