PDF_Upload

Sleeping

App Files Files Community

Seth0330 committed on May 29, 2025

Commit

b3585ca

verified ·

1 Parent(s): 572e346

Update app.py

Browse files

Files changed (1) hide show

app.py +393 -156

app.py CHANGED Viewed

@@ -1,213 +1,450 @@
 import streamlit as st
 import io
 import requests
 import json
 import re
 import os
-from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
-st.set_page_config(page_title="PDF Tools", layout="wide")
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model": "deepseek-chat",
-        "key_env": "DEEPSEEK_API_KEY",
-        "response_format": {"type": "json_object"},
     },
     "DeepSeek R1": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
-        "model": "deepseek-reasoner",
-        "key_env": "DEEPSEEK_API_KEY",
-        "response_format": None,
     },
     "Llama 4 Mavericks": {
         "api_url": "https://openrouter.ai/api/v1/chat/completions",
-        "model": "meta-llama/llama-4-maverick:free",
-        "key_env": "OPENROUTER_API_KEY",
         "response_format": {"type": "json_object"},
         "extra_headers": {
             "HTTP-Referer": "https://huggingface.co",
-            "X-Title": "Invoice Extractor",
-        },
-    },
-    "Mistral Small": {
-        # Update these two fields with your Azure values:
-        "api_url": "https://ezofisai.services.ai.azure.com/api/projects/firstProject",
-        "model": "mistral-small-2503",  # this is not used by Azure, just for completeness
-        "key_env": "AZUREMIST_API_KEY",
-        "response_format": {"type": "json_object"},
-        # No extra_headers needed for Azure
-    },
 }
 def get_api_key(model_choice):
-    key = os.getenv(MODELS[model_choice]["key_env"])
-    if not key:
-        st.error(f"❌ {MODELS[model_choice]['key_env']} not set")
         st.stop()
-    return key
 def query_llm(model_choice, prompt):
-    cfg = MODELS[model_choice]
     headers = {
         "Content-Type": "application/json",
     }
-    # Azure OpenAI (Mistral Small) needs api-key header instead of Authorization
-    if model_choice == "mistral-small-2503":
-        headers["api-key"] = get_api_key(model_choice)
-    else:
-        headers["Authorization"] = f"Bearer {get_api_key(model_choice)}"
-    if cfg.get("extra_headers"):
-        headers.update(cfg["extra_headers"])
     payload = {
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
-    # Only non-Azure APIs need "model" in payload
-    if model_choice != "mistral-small-2503":
-        payload["model"] = cfg["model"]
-    if cfg.get("response_format"):
-        payload["response_format"] = cfg["response_format"]
     try:
-        with st.spinner(f"🔍 Querying {model_choice}..."):
-            r = requests.post(cfg["api_url"], headers=headers, json=payload, timeout=90)
-        if r.status_code != 200:
-            st.error(f"🚨 API Error {r.status_code}: {r.text}")
-            return None
-        content = r.json()["choices"][0]["message"]["content"]
-        st.session_state.last_api = content
-        st.session_state.last_raw = r.text
-        return content
-    except Exception as e:
-        st.error(f"Connection error: {e}")
         return None
 def clean_json_response(text):
     if not text:
         return None
-    orig = text
-    # strip ``` fences
-    text = re.sub(r'```(?:json)?', '', text).strip()
-    # find outer braces
-    start, end = text.find('{'), text.rfind('}') + 1
-    if start < 0 or end < 1:
-        st.error("Couldn't locate JSON in response.")
-        st.code(orig)
-        return None
-    frag = text[start:end]
-    # remove stray trailing commas
-    frag = re.sub(r',\s*([}\]])', r'\1', frag)
     try:
-        return json.loads(frag)
-    except json.JSONDecodeError as e:
-        # attempt to insert missing commas between adjacent fields
-        repaired = re.sub(r'"\s*"\s*(?="[^"]+"\s*:)', '","', frag)
         try:
-            return json.loads(repaired)
         except json.JSONDecodeError:
-            st.error(f"JSON parse error: {e}")
-            st.code(frag)
-            return None
-def fallback_supplier(text):
-    for line in text.splitlines():
-        line = line.strip()
-        if line:
-            return line
     return None
-def get_extraction_prompt(model_choice, txt):
-    if model_choice.startswith("DeepSeek"):
-        return (
-            "Extract full invoice info and RETURN ONLY a single-line json object with fields:\n"
-            '{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
-            '"po_number":"string|null","invoice_value":"string with currency",'
-            '"line_items":[{"description":"string","quantity":"number","unit_price":"string with currency","total_price":"string with currency"}]}\n'
-            "Use null for missing. NO extra text.\n\n"
-            f"Invoice Text:\n{txt}"
-        )
     else:
-        return (
-            "Extract invoice data and RETURN ONLY a compact, one-line json object exactly:\n"
-            '{"invoice_header":{"invoice_number":"string","invoice_date":"YYYY-MM-DD",'
-            '"po_number":"string|null","invoice_value":"string with currency",'
-            '"supplier_name":"string|null","customer_name":"string|null"},'
-            '"line_items":[{"item_number":"string|null","description":"string","quantity":number,'
-            '"unit_price":"string with currency","total_price":"string with currency"}]}\n'
-            "Use null for missing. NO extras.\n\n"
-            f"Invoice Text:\n{txt}"
-        )
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
-    raw = query_llm(model_choice, prompt)
-    if not raw:
         return None
-    data = clean_json_response(raw)
-    if not data:
         return None
-    # normalize + supplier fallback
-    if model_choice in ("Llama 4 Mavericks","Mistral Small"):
-        hdr = data.setdefault("invoice_header", {})
-        for k in ("invoice_number","invoice_date","po_number","invoice_value","supplier_name","customer_name"):
-            hdr.setdefault(k, None)
-        if not hdr.get("supplier_name"):
-            hdr["supplier_name"] = fallback_supplier(text)
-        items = data.setdefault("line_items", [])
-        for itm in items:
-            for k in ("item_number","description","quantity","unit_price","total_price"):
-                itm.setdefault(k, None)
-    else:
-        for k in ("invoice_number","invoice_date","po_number","invoice_value"):
-            data.setdefault(k, None)
-        items = data.setdefault("line_items", [])
-        for itm in items:
-            for k in ("description","quantity","unit_price","total_price"):
-                itm.setdefault(k, None)
-    return data
-# ---- UI ----
-tab1, tab2 = st.tabs(["PDF Summarizer","Invoice Extractor"])
 with tab1:
-    st.title("PDF → Bullet Points")
-    pdf = st.file_uploader("Upload PDF", type="pdf")
-    pct = st.slider("Summarization %", 1, 100, 20)
-    if st.button("Summarize") and pdf:
-        txt = read_pdf(io.BytesIO(pdf.getvalue()))
-        keys = extract_key_phrases(txt)
-        scores = score_sentences(txt, keys)
-        n = max(1, len(scores)*pct//100)
-        st.markdown(summarize_text(scores, num_points=n))
 with tab2:
-    st.title("Invoice Extractor")
-    mdl = st.selectbox("Model", list(MODELS.keys()))
-    inv_pdf = st.file_uploader("Invoice PDF", type="pdf")
-    if st.button("Extract") and inv_pdf:
-        txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
-        info = extract_invoice_info(mdl, txt)
-        if info:
-            st.success("Extraction Complete")
-            if mdl in ("Llama 4 Mavericks","Mistral Small"):
-                h=info["invoice_header"]
-                c1,c2,c3 = st.columns(3)
-                c1.metric("Invoice #", h["invoice_number"]); c1.metric("Supplier", h["supplier_name"])
-                c2.metric("Date", h["invoice_date"]);    c2.metric("Customer", h["customer_name"])
-                c3.metric("PO #", h["po_number"]);        c3.metric("Total", h["invoice_value"])
-                st.subheader("Line Items"); st.table(info["line_items"])
             else:
-                c1,c2 = st.columns(2)
-                c1.metric("Invoice #", info["invoice_number"]); c1.metric("PO #", info["po_number"])
-                c2.metric("Date", info["invoice_date"]);        c2.metric("Value", info["invoice_value"])
-                st.subheader("Line Items"); st.table(info["line_items"])
-    if "last_api" in st.session_state:
-        with st.expander("Debug"):
-            st.code(st.session_state.last_api)
-            st.code(st.session_state.last_raw)

 import streamlit as st
+from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 import io
 import requests
 import json
 import re
 import os
+from datetime import datetime
# Configure Streamlit
st.set_page_config(
    layout="wide",
    page_title="PDF Tools - Summarizer & Invoice Extractor",
)

# Both DeepSeek entries are served from the same chat-completions endpoint.
_DEEPSEEK_API_URL = "https://api.deepseek.com/v1/chat/completions"

# Model configuration for the Invoice Extractor, keyed by the label shown in
# the model select box.  Per entry:
#   api_url         - endpoint the request is POSTed to
#   model_name      - value sent as "model" in the request payload
#   api_key_env     - environment variable that holds the API key
#   response_format - copied into the payload when truthy
#   extra_headers   - optional headers merged into the request headers
MODELS = {
    "DeepSeek v3": {
        "api_url": _DEEPSEEK_API_URL,
        "model_name": "deepseek-chat",
        "api_key_env": "DEEPSEEK_API_KEY",
        "response_format": {"type": "json_object"},
    },
    "DeepSeek R1": {
        "api_url": _DEEPSEEK_API_URL,
        "model_name": "deepseek-reasoner",
        "api_key_env": "DEEPSEEK_API_KEY",
        "response_format": None,
    },
    "Llama 4 Mavericks": {
        "api_url": "https://openrouter.ai/api/v1/chat/completions",
        "model_name": "meta-llama/llama-4-maverick:free",
        "api_key_env": "OPENROUTER_API_KEY",
        "response_format": {"type": "json_object"},
        "extra_headers": {
            "HTTP-Referer": "https://huggingface.co",
            "X-Title": "Invoice Extractor",
        },
    },
}
 def get_api_key(model_choice):
     """Return the API key configured for *model_choice*.

     The environment variable to read is named by the model's
     ``api_key_env`` entry in ``MODELS``.  When that variable is unset or
     empty, an error is shown and ``st.stop()`` is called.
     """
     env_var = MODELS[model_choice]["api_key_env"]
     key = os.getenv(env_var)
     if key:
         return key
     st.error(f"❌ `{env_var}` environment variable not set!")
     st.stop()
     return key
 def query_llm(model_choice, prompt):
     """Send *prompt* to the chat-completions API configured for *model_choice*.

     Args:
         model_choice: Key into ``MODELS`` selecting endpoint, model name,
             API key and optional extras.
         prompt: Text sent as the single user message.

     Returns:
         The ``choices[0].message.content`` string of the response, or
         ``None`` when the request fails, the status code is not 200, or the
         body does not have the expected shape.  Failures are reported with
         ``st.error``.  On success the content and the raw response text are
         stored in ``st.session_state``.
     """
     config = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
     if "extra_headers" in config:
         headers.update(config["extra_headers"])
     payload = {
         "model": config["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
     # .get() so an entry without a "response_format" key behaves like None.
     response_format = config.get("response_format")
     if response_format:
         payload["response_format"] = response_format

     # Keep the try body to the network call so that only transport problems
     # are reported as connection failures.
     try:
         with st.spinner(f"🔍 Analyzing with {model_choice}..."):
             response = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
     except requests.exceptions.RequestException as e:
         st.error(f"🌐 Connection Failed: {str(e)}")
         return None

     if response.status_code != 200:
         st.error(f"🚨 API Error {response.status_code}: {response.text}")
         return None

     try:
         content = response.json()["choices"][0]["message"]["content"]
     except (KeyError, IndexError, TypeError, ValueError) as e:
         # ValueError covers a body that is not JSON at all; the other three
         # cover JSON that lacks the choices/message/content structure.
         # Show response.text rather than calling response.json() again,
         # which would raise a second time for a non-JSON body.
         st.error(
             f"Unexpected response format ({type(e).__name__}: {e})\n"
             f"Full response: {response.text}"
         )
         return None

     st.session_state.last_api_response = content
     st.session_state.last_api_response_raw = response.text
     return content
 def clean_json_response(text):
     """Extract a JSON value from an LLM reply that may wrap it in other text.

     Candidates are tried in order and the first one that parses wins:
     the whole text, the contents of a Markdown code fence, the span from
     the first ``{`` to the last ``}``, and that same span with trailing
     commas removed.  If none parses, a last best-effort reconstruction is
     attempted for replies that mention both ``"invoice_header":`` and
     ``"line_items":``.

     Args:
         text: Raw reply text; ``None`` or an empty string yields ``None``.

     Returns:
         The decoded JSON value, or ``None`` when nothing could be parsed.
     """
     if not text:
         return None
     for candidate in _json_candidates(text):
         try:
             return json.loads(candidate)
         except json.JSONDecodeError:
             continue
     return _reconstruct_invoice_json(text)


 def _json_candidates(text):
     """Yield substrings of *text* that may be JSON, most trustworthy first."""
     yield text

     # Fenced block; whitespace around the braces is optional so a fence
     # without the exact newline layout is still recognised.
     fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
     if fenced:
         yield fenced.group(1)

     start, end = text.find('{'), text.rfind('}')
     if start != -1 and end > start:
         fragment = text[start:end + 1]
         yield fragment
         # Only reached after strict parsing failed: drop commas that sit
         # directly before a closing brace or bracket.
         yield re.sub(r',\s*([}\]])', r'\1', fragment)


 def _reconstruct_invoice_json(text):
     """Best-effort repair of a header/line-items reply; ``None`` on failure."""
     marker = '"line_items":'
     if '"invoice_header":' not in text or marker not in text:
         return None
     pieces = text.split(marker)
     header_part, line_items_part = pieces[0], pieces[1]
     # Ensure proper closing of JSON
     if not header_part.strip().endswith('{'):
         header_part += '{'
     if not line_items_part.strip().endswith('}}'):
         line_items_part = line_items_part.split('}')[0] + ']}}'
     try:
         return json.loads(header_part + marker + line_items_part)
     except json.JSONDecodeError as e:
         st.warning(f"Could not fully reconstruct JSON: {str(e)}")
         return None
# Prompt for "DeepSeek v3": flat invoice object plus explicit rules.
_PROMPT_DEEPSEEK_V3 = """Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
{
  "invoice_number": "string",
  "invoice_date": "YYYY-MM-DD",
  "po_number": "string or null",
  "invoice_value": "string with currency symbol",
  "line_items": [
    {
      "description": "string",
      "quantity": "number or string",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Include all line items found in the invoice
4. For line items, quantity can be number or string, prices should include currency
5. Do not include any explanations or notes
Invoice Text:
"""

# Prompt for "DeepSeek R1": same flat object, every header field nullable.
_PROMPT_DEEPSEEK_R1 = """Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
{
  "invoice_number": "string or null",
  "invoice_date": "YYYY-MM-DD or null",
  "po_number": "string or null",
  "invoice_value": "string with currency or null",
  "line_items": [
    {
      "description": "string",
      "quantity": "number or string",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Invoice Text:
"""

# Prompt for every other model (Llama 4 Mavericks): nested invoice_header.
_PROMPT_DEFAULT = """Extract complete invoice information and return a VALID JSON object with these fields:
{
  "invoice_header": {
    "invoice_number": "string",
    "invoice_date": "YYYY-MM-DD",
    "po_number": "string or null",
    "invoice_value": "string with currency",
    "supplier_name": "string or null",
    "customer_name": "string or null"
  },
  "line_items": [
    {
      "item_number": "string or null",
      "description": "string",
      "quantity": "number",
      "unit_price": "string with currency",
      "total_price": "string with currency"
    }
  ]
}
Rules:
1. Return ONLY valid JSON (no additional text or markdown)
2. Use null for missing fields
3. Date format must be YYYY-MM-DD
4. All currency values must include currency symbol or code
5. Include all line items found in the invoice
6. For line items, quantity should be a number, prices as strings with currency
7. Do not include any explanations or notes
Invoice Text:
"""

_PROMPTS_BY_MODEL = {
    "DeepSeek v3": _PROMPT_DEEPSEEK_V3,
    "DeepSeek R1": _PROMPT_DEEPSEEK_R1,
}


def get_extraction_prompt(model_choice, text):
    """Build the extraction prompt for *model_choice* with *text* appended.

    The two DeepSeek models each have a dedicated template; any other model
    name receives the nested ``invoice_header`` template.
    """
    template = _PROMPTS_BY_MODEL.get(model_choice, _PROMPT_DEFAULT)
    return template + text
def format_currency(value):
    """Format a currency value for display.

    Numbers, including zero, are rendered as ``$1,234.50``.  ``None``, empty
    strings and other falsy non-numeric values become ``"N/A"``.  Anything
    else (typically a string that already carries its currency) is returned
    unchanged.
    """
    # Check for numbers before the falsy test so that a legitimate amount
    # of 0 is shown as "$0.00" rather than "N/A".  bool is excluded because
    # it is a subclass of int.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"${value:,.2f}"
    if not value:
        return "N/A"
    return value
def display_line_items(line_items, model_choice="DeepSeek v3"):
    """Render the invoice line items.

    For "Llama 4 Mavericks" the items are shown with ``st.table``; for any
    other model they are written row by row into four columns.  When
    *line_items* is empty, an informational note is shown instead.
    """
    if not line_items:
        st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
        return

    st.subheader("📋 Line Items")

    if model_choice == "Llama 4 Mavericks":
        # One table row per item, numbered from 1.
        table_rows = [
            {
                "#": position,
                "Description": entry.get("description", "N/A"),
                "Quantity": entry.get("quantity", 0),
                "Unit Price": entry.get("unit_price", "N/A"),
                "Total Price": entry.get("total_price", "N/A"),
            }
            for position, entry in enumerate(line_items, 1)
        ]
        st.table(table_rows)
        return

    # Column layout for the DeepSeek models: a heading row, then one row of
    # columns per item.
    widths = (4, 2, 2, 2)
    headings = ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
    heading_cells = st.columns(list(widths))
    with st.container():
        for cell, heading in zip(heading_cells, headings):
            cell.write(heading)
        for entry in line_items:
            row_cells = st.columns(list(widths))
            row_values = (
                entry.get("description", "N/A"),
                entry.get("quantity", "N/A"),
                format_currency(entry.get("unit_price", "N/A")),
                format_currency(entry.get("total_price", "N/A")),
            )
            for cell, shown in zip(row_cells, row_values):
                cell.write(shown)
        st.divider()
def display_invoice_data(model_choice, invoice_data):
    """Render extracted invoice data as metrics followed by the line items.

    Args:
        model_choice: Model name; "Llama 4 Mavericks" selects the layout that
            reads the nested ``invoice_header``, any other value selects the
            flat layout.
        invoice_data: Parsed invoice dict; nothing is rendered when falsy.
    """
    if not invoice_data:
        return

    # "or" rather than a .get() default: a key that is present with the
    # value None would otherwise bypass the default.
    line_items = invoice_data.get("line_items") or []

    if model_choice == "Llama 4 Mavericks":
        # Display header information
        st.subheader("Invoice Summary")
        header = invoice_data.get("invoice_header") or {}
        col1, col2, col3 = st.columns(3)
        with col1:
            st.metric("Invoice Number", header.get("invoice_number") or "Not found")
            st.metric("Supplier", header.get("supplier_name") or "Not found")
        with col2:
            st.metric("Invoice Date", header.get("invoice_date") or "Not found")
            st.metric("Customer", header.get("customer_name") or "Not found")
        with col3:
            st.metric("PO Number", header.get("po_number") or "Not found")
            st.metric("Total Value", header.get("invoice_value") or "Not found")
        # Display line items
        display_line_items(line_items, model_choice)
        # Calculate and display a total when the header does not provide one
        if not header.get("invoice_value"):
            total = _sum_line_totals(line_items)
            if total is not None:
                st.metric("Calculated Total", f"${total:,.2f}")
    else:
        # Display for DeepSeek models
        st.success("Information extracted successfully!")
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Invoice Number", invoice_data.get("invoice_number") or "Not found")
            st.metric("PO Number", invoice_data.get("po_number") or "Not found")
        with col2:
            st.metric("Invoice Date", invoice_data.get("invoice_date") or "Not found")
            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))
        # Display line items for both DeepSeek models
        display_line_items(line_items, model_choice)


def _sum_line_totals(line_items):
    """Return the sum of the items' ``total_price`` values, or ``None`` if any cannot be parsed."""
    total = 0.0
    for item in line_items:
        raw = item.get("total_price")
        if not raw:
            continue
        if isinstance(raw, (int, float)):
            # Already numeric; re.sub would raise TypeError on a non-string.
            total += raw
            continue
        try:
            # Keep digits and the decimal point only, e.g. "$1,200.50" -> "1200.50".
            total += float(re.sub(r'[^\d.]', '', str(raw)))
        except ValueError:
            # A partial sum would be misleading, so report no total at all.
            return None
    return total
 def extract_invoice_info(model_choice, text):
     """Query the selected model for invoice data and normalize the result.

     Args:
         model_choice: Model name, a key of ``MODELS``.
         text: Invoice text to send to the model.

     Returns:
         A dict in which every expected field is present, or ``None`` when
         the query fails or the reply is not a non-empty JSON object.  For
         "Llama 4 Mavericks" the dict has ``invoice_header`` and
         ``line_items`` and each quantity is numeric; for the other models
         the header fields sit at the top level next to ``line_items``.
     """
     prompt = get_extraction_prompt(model_choice, text)
     result = query_llm(model_choice, prompt)
     if not result:
         return None

     parsed_data = clean_json_response(result)
     # A JSON array or scalar parses successfully but is not an invoice.
     if not parsed_data or not isinstance(parsed_data, dict):
         st.error("Failed to parse JSON. Raw response:")
         st.code(result)
         return None

     # Normalize data structure based on model
     if model_choice == "Llama 4 Mavericks":
         header = parsed_data.get("invoice_header")
         if not isinstance(header, dict):
             # Missing or null header: start from an empty one.
             header = {}
             parsed_data["invoice_header"] = header
         for field in ("invoice_number", "invoice_date", "po_number",
                       "invoice_value", "supplier_name", "customer_name"):
             header.setdefault(field, None)
         parsed_data["line_items"] = _normalize_line_items(
             parsed_data.get("line_items"),
             ("item_number", "description", "quantity", "unit_price", "total_price"),
             numeric_quantity=True,
         )
     else:  # DeepSeek models
         for field in ("invoice_number", "invoice_date", "po_number", "invoice_value"):
             parsed_data.setdefault(field, None)
         parsed_data["line_items"] = _normalize_line_items(
             parsed_data.get("line_items"),
             ("description", "quantity", "unit_price", "total_price"),
             numeric_quantity=False,
         )
     return parsed_data


 def _normalize_line_items(items, fields, numeric_quantity):
     """Return the dict entries of *items* with every name in *fields* present."""
     if not isinstance(items, list):
         return []
     normalized = []
     for item in items:
         if not isinstance(item, dict):
             # A scalar or list cannot carry named fields; drop it.
             continue
         for field in fields:
             item.setdefault(field, 0 if field == "quantity" else None)
         if numeric_quantity:
             # Runs for every item, not only those that lacked the key, so a
             # quantity the model returned as a string is coerced as well.
             item["quantity"] = _quantity_as_number(item["quantity"])
         normalized.append(item)
     return normalized


 def _quantity_as_number(value):
     """Return *value* as a number, or 0 when it cannot be converted."""
     if isinstance(value, (int, float)):
         return value
     try:
         return float(value)
     except (ValueError, TypeError):
         return 0
# Create tabs for different functionalities


def _render_summarizer_tab():
    """Draw the PDF Summarizer tab and, on submit, show the bullet summary."""
    st.title("PDF to Bullet Point Summarizer 🗟 🔏")
    pdf_upload = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")
    percent = st.slider(
        "Select the extent of summarization (%)",
        min_value=1,
        max_value=100,
        value=20,
        key="summary_scale",
    )
    clicked = st.button("Generate Summary", key="summary_button")
    if not clicked or pdf_upload is None:
        return

    with st.spinner('Processing...'):
        document_text = read_pdf(io.BytesIO(pdf_upload.getvalue()))
        phrases = extract_key_phrases(document_text)
        scored = score_sentences(document_text, phrases)
        # Number of bullet points: the chosen percentage of the scored
        # sentences, but never fewer than one.
        sentence_count = len(list(scored.keys()))
        bullet_count = max(1, sentence_count * percent // 100)
        bullets = summarize_text(scored, num_points=bullet_count)
        st.subheader("Here's the summary: ")
        st.markdown(bullets)


def _render_extractor_tab():
    """Draw the Invoice Extractor tab and, on submit, run and show the extraction."""
    st.title("📋 Invoice Extractor from PDF")
    st.write("Upload an invoice PDF to extract key details")
    chosen_model = st.selectbox(
        "Select AI Model",
        list(MODELS.keys()),
        index=0,
        help="Choose which AI model to use for extraction",
        key="model_choice",
    )
    pdf_upload = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")
    clicked = st.button("Extract Invoice Information", key="invoice_button")
    if not clicked or pdf_upload is None:
        return

    with st.spinner('Reading PDF...'):
        invoice_text = read_pdf(io.BytesIO(pdf_upload.getvalue()))

    # Process in status container
    with st.status("Processing...", expanded=True) as status:
        st.write(f"🤖 Querying {chosen_model} API...")
        extracted = extract_invoice_info(chosen_model, invoice_text)
        if extracted:
            status.update(label="✅ Extraction Complete!", state="complete")
            display_invoice_data(chosen_model, extracted)
        else:
            status.update(label="❌ Extraction Failed", state="error")
            st.error("Failed to extract information. Try simplifying the text.")

    # Debug information outside the status container, only after a
    # successful extraction.
    if not extracted or "last_api_response" not in st.session_state:
        return
    with st.expander("Debug Information"):
        st.write("API Response:")
        st.json(st.session_state.last_api_response)
        st.write("Raw API Response:")
        st.code(st.session_state.get("last_api_response_raw", "No response"))


tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])

# PDF Summarizer Tab
with tab1:
    _render_summarizer_tab()

# Invoice Extractor Tab
with tab2:
    _render_extractor_tab()