PDF_Upload

Sleeping

App Files Files Community

Seth0330 committed on May 30, 2025

Commit

bec67ce

verified ·

1 Parent(s): b3585ca

Update app.py

Browse files

Files changed (1) hide show

app.py +151 -392

app.py CHANGED Viewed

@@ -1,450 +1,209 @@
 import streamlit as st
-from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 import io
 import requests
 import json
 import re
 import os
-from datetime import datetime
-# Configure Streamlit
-st.set_page_config(
-    page_title="PDF Tools - Summarizer & Invoice Extractor",
-    layout="wide",
-)
-# Model Configuration for Invoice Extractor
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model_name": "deepseek-chat",
-        "api_key_env": "DEEPSEEK_API_KEY",
-        "response_format": {"type": "json_object"}
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model_name": "deepseek-reasoner",
-        "api_key_env": "DEEPSEEK_API_KEY",
-        "response_format": None
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
-        "model_name": "meta-llama/llama-4-maverick:free",
-        "api_key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
-            "X-Title": "Invoice Extractor"
-        }
-    }
 }
 def get_api_key(model_choice):
-    """Get the appropriate API key based on model choice"""
-    api_key_env = MODELS[model_choice]["api_key_env"]
-    api_key = os.environ.get(api_key_env)
-    if not api_key:
-        st.error(f"❌ `{api_key_env}` environment variable not set!")
         st.stop()
-    return api_key
 def query_llm(model_choice, prompt):
-    """Call the appropriate API based on model choice"""
-    config = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
-    if "extra_headers" in config:
-        headers.update(config["extra_headers"])
     payload = {
-        "model": config["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
-    if config["response_format"]:
-        payload["response_format"] = config["response_format"]
     try:
-        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
-            response = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
-            if response.status_code != 200:
-                st.error(f"🚨 API Error {response.status_code}: {response.text}")
-                return None
-            try:
-                content = response.json()["choices"][0]["message"]["content"]
-                st.session_state.last_api_response = content
-                st.session_state.last_api_response_raw = response.text
-                return content
-            except KeyError as e:
-                st.error(f"KeyError in response: {e}\nFull response: {response.json()}")
-                return None
-    except requests.exceptions.RequestException as e:
-        st.error(f"🌐 Connection Failed: {str(e)}")
         return None
 def clean_json_response(text):
-    """Improved JSON extraction with comprehensive error handling"""
     if not text:
         return None
-    # First attempt to parse directly
     try:
-        data = json.loads(text)
-        return data
-    except json.JSONDecodeError:
-        pass
-    # Try to extract JSON from potential markdown
-    json_match = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
-    if json_match:
         try:
-            return json.loads(json_match.group(1))
         except json.JSONDecodeError:
-            pass
-    # Try to find any JSON-like structure
-    try:
-        start_idx = text.find('{')
-        end_idx = text.rfind('}') + 1
-        if start_idx != -1 and end_idx != 0:
-            return json.loads(text[start_idx:end_idx])
-    except:
-        pass
-    # Final fallback - manual reconstruction
-    try:
-        if '"invoice_header":' in text and '"line_items":' in text:
-            header_part = text.split('"line_items":')[0]
-            line_items_part = text.split('"line_items":')[1]
-            # Ensure proper closing of JSON
-            if not header_part.strip().endswith('{'):
-                header_part += '{'
-            if not line_items_part.strip().endswith('}}'):
-                line_items_part = line_items_part.split('}')[0] + ']}}'
-            reconstructed = header_part + '"line_items":' + line_items_part
-            return json.loads(reconstructed)
-    except Exception as e:
-        st.warning(f"Could not fully reconstruct JSON: {str(e)}")
-        return None
     return None
# Instruction templates, one per prompt flavour. The invoice text is appended
# directly after the trailing "Invoice Text:" line.
_PROMPT_DEEPSEEK_V3 = """Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
{
  "invoice_number": "string",
  "invoice_date": "YYYY-MM-DD",
  "po_number": "string or null",
  "invoice_value": "string with currency symbol",
  "line_items": [
    {
      "description": "string",
      "quantity": "number or string",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Include all line items found in the invoice
4. For line items, quantity can be number or string, prices should include currency
5. Do not include any explanations or notes
Invoice Text:
"""

_PROMPT_DEEPSEEK_R1 = """Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
{
  "invoice_number": "string or null",
  "invoice_date": "YYYY-MM-DD or null",
  "po_number": "string or null",
  "invoice_value": "string with currency or null",
  "line_items": [
    {
      "description": "string",
      "quantity": "number or string",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Invoice Text:
"""

_PROMPT_HEADER_STYLE = """Extract complete invoice information and return a VALID JSON object with these fields:
{
  "invoice_header": {
    "invoice_number": "string",
    "invoice_date": "YYYY-MM-DD",
    "po_number": "string or null",
    "invoice_value": "string with currency",
    "supplier_name": "string or null",
    "customer_name": "string or null"
  },
  "line_items": [
    {
      "item_number": "string or null",
      "description": "string",
      "quantity": "number",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Date format must be YYYY-MM-DD
4. All currency values must include currency symbol or code
5. Include all line items found in the invoice
6. For line items, quantity should be a number, prices as strings with currency
7. Do not include any explanations or notes
Invoice Text:
"""


def get_extraction_prompt(model_choice, text):
    """Build the extraction prompt for ``model_choice`` with ``text`` appended.

    "DeepSeek v3" and "DeepSeek R1" each get their own flat-schema
    instructions; any other model name gets the nested
    ``invoice_header`` / ``line_items`` instructions.
    """
    if model_choice == "DeepSeek v3":
        instructions = _PROMPT_DEEPSEEK_V3
    elif model_choice == "DeepSeek R1":
        instructions = _PROMPT_DEEPSEEK_R1
    else:  # Llama 4 Mavericks
        instructions = _PROMPT_HEADER_STYLE
    return instructions + text
def format_currency(value):
    """Format a currency value for display.

    Args:
        value: A number, a preformatted string, or a missing value.

    Returns:
        ``"$1,234.50"``-style text for ints and floats (including zero),
        ``"N/A"`` for missing values (``None``, ``""``, other falsy
        non-numbers), and any other value unchanged.
    """
    # Numbers are checked first so that a genuine zero amount is shown as
    # "$0.00" instead of being mistaken for a missing value. Booleans are
    # not treated as amounts.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"${value:,.2f}"
    if not value:
        return "N/A"
    return value
def display_line_items(line_items, model_choice="DeepSeek v3"):
    """Render invoice line items with Streamlit.

    "Llama 4 Mavericks" results are shown with ``st.table``; every other
    model gets one row of four columns per item. An info message is shown
    instead when ``line_items`` is empty.
    """
    if not line_items:
        st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
        return

    st.subheader("📋 Line Items")

    if model_choice != "Llama 4 Mavericks":
        # Column layout for DeepSeek models: header row, item rows, divider.
        header_cells = st.columns([4, 2, 2, 2])
        with st.container():
            titles = ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
            for cell, title in zip(header_cells, titles):
                cell.write(title)
            for entry in line_items:
                row_cells = st.columns([4, 2, 2, 2])
                row_cells[0].write(entry.get("description", "N/A"))
                row_cells[1].write(entry.get("quantity", "N/A"))
                row_cells[2].write(format_currency(entry.get("unit_price", "N/A")))
                row_cells[3].write(format_currency(entry.get("total_price", "N/A")))
            st.divider()
        return

    # Table layout for Llama, numbered from 1.
    table_rows = [
        {
            "#": position,
            "Description": entry.get("description", "N/A"),
            "Quantity": entry.get("quantity", 0),
            "Unit Price": entry.get("unit_price", "N/A"),
            "Total Price": entry.get("total_price", "N/A"),
        }
        for position, entry in enumerate(line_items, 1)
    ]
    st.table(table_rows)
def display_invoice_data(model_choice, invoice_data):
    """Render extracted invoice data with Streamlit widgets.

    Args:
        model_choice: Model name; "Llama 4 Mavericks" data is read from the
            nested ``invoice_header`` layout, every other model from the flat
            layout.
        invoice_data: Parsed invoice dict; nothing is rendered when falsy.
    """
    if not invoice_data:
        return
    if model_choice == "Llama 4 Mavericks":
        # Display header information
        st.subheader("Invoice Summary")
        header = invoice_data.get("invoice_header", {})
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Invoice Number", header.get("invoice_number", "Not found"))
            st.metric("Supplier", header.get("supplier_name", "Not found"))
        with col2:
            st.metric("Invoice Date", header.get("invoice_date", "Not found"))
            st.metric("Customer", header.get("customer_name", "Not found"))
        with col3:
            st.metric("PO Number", header.get("po_number", "Not found"))
            st.metric("Total Value", header.get("invoice_value", "Not found"))
        # Display line items
        display_line_items(invoice_data.get("line_items", []), model_choice)
        # Calculate and display a total when the header does not provide one
        if not header.get("invoice_value"):
            try:
                # str() lets numeric totals (e.g. 12.5) be summed as well as
                # strings such as "$12.50".
                total = sum(
                    float(re.sub(r'[^\d.]', '', str(item.get("total_price", "0"))))
                    for item in invoice_data.get("line_items", [])
                    if item.get("total_price")
                )
            except (AttributeError, TypeError, ValueError):
                # Best effort: skip the calculated total when a price cannot
                # be parsed or an item is not a mapping.
                pass
            else:
                st.metric("Calculated Total", f"${total:,.2f}")
    else:
        # Display for DeepSeek models
        st.success("Information extracted successfully!")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Invoice Number", invoice_data.get("invoice_number", "Not found"))
            st.metric("PO Number", invoice_data.get("po_number", "Not found"))
        with col2:
            st.metric("Invoice Date", invoice_data.get("invoice_date", "Not found"))
            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))
        # Display line items for both DeepSeek models
        display_line_items(invoice_data.get("line_items", []), model_choice)
 def extract_invoice_info(model_choice, text):
-    """Extract structured data from pasted text"""
     prompt = get_extraction_prompt(model_choice, text)
-    result = query_llm(model_choice, prompt)
-    if not result:
         return None
-    parsed_data = clean_json_response(result)
-    if not parsed_data:
-        st.error("Failed to parse JSON. Raw response:")
-        st.code(result)
         return None
-    # Normalize data structure based on model
-    if model_choice == "Llama 4 Mavericks":
-        if "invoice_header" not in parsed_data:
-            parsed_data["invoice_header"] = {}
-        if "line_items" not in parsed_data:
-            parsed_data["line_items"] = []
-        # Set default values for header if missing
-        header_fields = ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]
-        for field in header_fields:
-            if field not in parsed_data["invoice_header"]:
-                parsed_data["invoice_header"][field] = None
-        # Validate line items structure
-        for item in parsed_data["line_items"]:
-            item_fields = ["item_number", "description", "quantity", "unit_price", "total_price"]
-            for field in item_fields:
-                if field not in item:
-                    item[field] = None if field != "quantity" else 0
-                    if field == "quantity" and not isinstance(item[field], (int, float)):
-                        try:
-                            item[field] = float(item[field])
-                        except (ValueError, TypeError):
-                            item[field] = 0
-    else:  # DeepSeek models
-        # Ensure all required fields exist
-        for field in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
-            if field not in parsed_data:
-                parsed_data[field] = None
-        # Ensure line_items exists and has proper structure
-        if "line_items" not in parsed_data:
-            parsed_data["line_items"] = []
-        else:
-            for item in parsed_data["line_items"]:
-                item_fields = ["description", "quantity", "unit_price", "total_price"]
-                for field in item_fields:
-                    if field not in item:
-                        item[field] = None if field != "quantity" else 0
-    return parsed_data
# Page layout: one tab per tool.
tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])

# ---------------------------------------------------------------- Summarizer
with tab1:
    st.title("PDF to Bullet Point Summarizer 🗟 🔏")

    # Inputs: the document, how much of it to keep, and the trigger button.
    uploaded_file = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")
    summary_scale = st.slider(
        "Select the extent of summarization (%)",
        min_value=1,
        max_value=100,
        value=20,
        key="summary_scale",
    )
    submit_button = st.button("Generate Summary", key="summary_button")

    if submit_button and uploaded_file is not None:
        with st.spinner('Processing...'):
            text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
            key_phrases = extract_key_phrases(text)
            sentence_scores = score_sentences(text, key_phrases)
            # Keep ``summary_scale`` percent of the scored sentences, at least one.
            total_sentences = len(list(sentence_scores.keys()))
            num_points = max(1, total_sentences * summary_scale // 100)
            summary = summarize_text(sentence_scores, num_points=num_points)
            st.subheader("Here's the summary: ")
            st.markdown(summary)

# --------------------------------------------------------- Invoice extractor
with tab2:
    st.title("📋 Invoice Extractor from PDF")
    st.write("Upload an invoice PDF to extract key details")

    model_choice = st.selectbox(
        "Select AI Model",
        list(MODELS.keys()),
        index=0,
        help="Choose which AI model to use for extraction",
        key="model_choice",
    )
    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")
    extract_clicked = st.button("Extract Invoice Information", key="invoice_button")

    if extract_clicked and invoice_pdf is not None:
        with st.spinner('Reading PDF...'):
            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))

        # The status container reports progress and the final outcome.
        with st.status("Processing...", expanded=True) as status:
            st.write(f"🤖 Querying {model_choice} API...")
            invoice_data = extract_invoice_info(model_choice, invoice_text)
            if not invoice_data:
                status.update(label="❌ Extraction Failed", state="error")
                st.error("Failed to extract information. Try simplifying the text.")
            else:
                status.update(label="✅ Extraction Complete!", state="complete")
                display_invoice_data(model_choice, invoice_data)

        # Debug information is rendered outside the status container.
        if invoice_data and "last_api_response" in st.session_state:
            with st.expander("Debug Information"):
                st.write("API Response:")
                st.json(st.session_state.last_api_response)
                st.write("Raw API Response:")
                st.code(st.session_state.get("last_api_response_raw", "No response"))

 import streamlit as st
 import io
 import requests
 import json
 import re
 import os
+from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
+st.set_page_config(page_title="PDF Tools", layout="wide")
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-chat",
+        "key_env": "DEEPSEEK_API_KEY",
+        "response_format": {"type": "json_object"},
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
+        "model": "deepseek-reasoner",
+        "key_env": "DEEPSEEK_API_KEY",
+        "response_format": None,
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
+        "model": "meta-llama/llama-4-maverick:free",
+        "key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
+    },
+    "Mistral Small": {
+        "api_url": "https://ezofisai.services.ai.azure.com/api/projects/firstProject",
+        "model": "mistral-small-2503",
+        "key_env": "AZUREMIST_API_KEY",
+        "response_format": {"type": "json_object"},
+        "extra_headers": {
+            "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
+    },
 }
 def get_api_key(model_choice):
+    key = os.getenv(MODELS[model_choice]["key_env"])
+    if not key:
+        st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
         st.stop()
+    return key
 def query_llm(model_choice, prompt):
+    cfg = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
+    if cfg.get("extra_headers"):
+        headers.update(cfg["extra_headers"])
     payload = {
+        "model": cfg["model"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
+    if cfg.get("response_format"):
+        payload["response_format"] = cfg["response_format"]
     try:
+        with st.spinner(f"🔍 Querying {model_choice}..."):
+            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
+        if r.status_code != 200:
+            st.error(f"🚨 API Error {r.status_code}: {r.text}")
+            return None
+        content = r.json()["choices"][0]["message"]["content"]
+        st.session_state.last_api = content
+        st.session_state.last_raw = r.text
+        return content
+    except Exception as e:
+        st.error(f"Connection error: {e}")
         return None
 def clean_json_response(text):
     if not text:
         return None
+    orig = text
+    # strip ``` fences
+    text = re.sub(r'```(?:json)?', '', text).strip()
+    # find outer braces
+    start, end = text.find('{'), text.rfind('}') + 1
+    if start < 0 or end < 1:
+        st.error("Couldn't locate JSON in response.")
+        st.code(orig)
+        return None
+    frag = text[start:end]
+    # remove stray trailing commas
+    frag = re.sub(r',\s*([}\]])', r'\1', frag)
     try:
+        return json.loads(frag)
+    except json.JSONDecodeError as e:
+        # attempt to insert missing commas between adjacent fields
+        repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
         try:
+            return json.loads(repaired)
         except json.JSONDecodeError:
+            st.error(f"JSON parse error: {e}")
+            st.code(frag)
+            return None
def fallback_supplier(text):
    """Return the first non-blank line of ``text``, stripped.

    Used as a stand-in supplier name when the model does not return one.

    Args:
        text: Invoice text; ``None`` or an empty string is accepted.

    Returns:
        The first line that is non-empty after stripping whitespace, or
        ``None`` when there is no such line.
    """
    if not text:
        # Nothing was extracted from the document, so there is nothing to
        # fall back on.
        return None
    stripped_lines = (line.strip() for line in text.splitlines())
    return next((line for line in stripped_lines if line), None)
def get_extraction_prompt(model_choice, txt):
    """Build the extraction prompt for ``model_choice`` with ``txt`` appended.

    Model names starting with "DeepSeek" get the flat schema; every other
    model gets the nested ``invoice_header`` / ``line_items`` schema.
    """
    if model_choice.startswith("DeepSeek"):
        lead = "Extract full invoice info and RETURN ONLY a single-line json object with fields:\n"
        schema = "".join([
            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD",',
            '"po_number":"string|null","invoice_value":"string with currency",',
            '"line_items":[{"description":"string","quantity":"number","unit_price":"string with currency","total_price":"string with currency"}]}\n',
        ])
        closing = "Use null for missing. NO extra text.\n\n"
    else:
        lead = "Extract invoice data and RETURN ONLY a compact, one-line json object exactly:\n"
        schema = "".join([
            '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",',
            '"po_number":"string|null","invoice_value":"string with currency",',
            '"supplier_name":"string|null","customer_name":"string|null"},',
            '"line_items":[{"item_number":"string|null","description":"string","quantity":number,',
            '"unit_price":"string with currency","total_price":"string with currency"}]}\n',
        ])
        closing = "Use null for missing. NO extras.\n\n"
    return lead + schema + closing + f"Invoice Text:\n{txt}"
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
+    raw = query_llm(model_choice, prompt)
+    if not raw:
         return None
+    data = clean_json_response(raw)
+    if not data:
         return None
+    # normalize + supplier fallback
+    if model_choice in ("Llama 4 Mavericks","Mistral Small"):
+        hdr = data.setdefault("invoice_header", {})
+        for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
+            hdr.setdefault(k, None)
+        if not hdr.get("supplier_name"):
+            hdr["supplier_name"] = fallback_supplier(text)
+        items = data.setdefault("line_items", [])
+        for itm in items:
+            for k in ("item_number","description","quantity","unit_price","total_price"):
+                itm.setdefault(k, None)
+    else:
+        for k in ("invoice_number","invoice_date","po_number","invoice_value"):
+            data.setdefault(k, None)
+        items = data.setdefault("line_items", [])
+        for itm in items:
+            for k in ("description","quantity","unit_price","total_price"):
+                itm.setdefault(k, None)
+    return data
# ---- UI ----
# Page layout: one tab per tool.
tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])

with tab1:
    st.title("PDF → Bullet Points")
    pdf = st.file_uploader("Upload PDF", type="pdf")
    pct = st.slider("Summarization %", 1, 100, 20)
    summarize_clicked = st.button("Summarize")
    if summarize_clicked and pdf:
        txt = read_pdf(io.BytesIO(pdf.getvalue()))
        scores = score_sentences(txt, extract_key_phrases(txt))
        # Keep ``pct`` percent of the scored sentences, at least one.
        n = max(1, len(scores)*pct//100)
        st.markdown(summarize_text(scores, num_points=n))

with tab2:
    st.title("Invoice Extractor")
    mdl = st.selectbox("Model", list(MODELS.keys()))
    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
    extract_clicked = st.button("Extract")
    if extract_clicked and inv_pdf:
        txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
        info = extract_invoice_info(mdl, txt)
        if info:
            st.success("Extraction Complete")
            if mdl in ("Llama 4 Mavericks","Mistral Small"):
                # Nested layout: three columns with two metrics each.
                h = info["invoice_header"]
                c1, c2, c3 = st.columns(3)
                layout = (
                    (c1, "Invoice #", "invoice_number"),
                    (c1, "Supplier", "supplier_name"),
                    (c2, "Date", "invoice_date"),
                    (c2, "Customer", "customer_name"),
                    (c3, "PO #", "po_number"),
                    (c3, "Total", "invoice_value"),
                )
                for column, label, field in layout:
                    column.metric(label, h[field])
            else:
                # Flat layout: two columns with two metrics each.
                c1, c2 = st.columns(2)
                layout = (
                    (c1, "Invoice #", "invoice_number"),
                    (c1, "PO #", "po_number"),
                    (c2, "Date", "invoice_date"),
                    (c2, "Value", "invoice_value"),
                )
                for column, label, field in layout:
                    column.metric(label, info[field])
            st.subheader("Line Items")
            st.table(info["line_items"])
    # Shown whenever a previous API reply is stored in the session state.
    if "last_api" in st.session_state:
        with st.expander("Debug"):
            st.code(st.session_state.last_api)
            st.code(st.session_state.last_raw)