PDF_Upload

Sleeping

App Files Community

Seth0330 committed on May 21, 2025

Commit

0ce55d2

verified ·

1 Parent(s): b23347b

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -392

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import streamlit as st
-from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 import io
 import requests
 import json
@@ -7,6 +6,8 @@ import re
 import os
 from datetime import datetime
 # Configure Streamlit
 st.set_page_config(
     page_title="PDF Tools - Summarizer & Invoice Extractor",
@@ -50,442 +51,154 @@ MODELS = {
 }
 def get_api_key(model_choice):
-    """Get the appropriate API key based on model choice"""
-    api_key_env = MODELS[model_choice]["api_key_env"]
-    api_key = os.environ.get(api_key_env)
     if not api_key:
-        st.error(f"❌ `{api_key_env}` environment variable not set!")
         st.stop()
     return api_key
 def query_llm(model_choice, prompt):
-    """Call the appropriate API based on model choice"""
     config = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
-    if "extra_headers" in config:
         headers.update(config["extra_headers"])
     payload = {
         "model": config["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
-    if config["response_format"]:
         payload["response_format"] = config["response_format"]
     try:
         with st.spinner(f"🔍 Analyzing with {model_choice}..."):
-            response = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
-            if response.status_code != 200:
-                st.error(f"🚨 API Error {response.status_code}: {response.text}")
-                return None
-            try:
-                content = response.json()["choices"][0]["message"]["content"]
-                st.session_state.last_api_response = content
-                st.session_state.last_api_response_raw = response.text
-                return content
-            except KeyError as e:
-                st.error(f"KeyError in response: {e}\nFull response: {response.json()}")
                 return None
     except requests.exceptions.RequestException as e:
-        st.error(f"🌐 Connection Failed: {str(e)}")
         return None
-def find_json_end(text):
-    """Find the end of a potentially incomplete JSON object"""
-    stack = []
-    for i, c in enumerate(text):
-        if c == '{':
-            stack.append(i)
-        elif c == '}':
-            if stack:
-                stack.pop()
-                if not stack:
-                    return i+1
-    return -1
-def clean_json_response(text, model_choice):
-    """Robust JSON extraction with advanced error handling"""
     if not text:
         return None
-    original_text = text  # Save for error reporting
-    # Model-specific preprocessing
-    if model_choice == "Mistral Small":
-        # Remove all markdown formatting
-        text = re.sub(r'^```json|```$', '', text, flags=re.MULTILINE).strip()
-    # Common JSON repair patterns
-    repair_attempts = [
-        # Try extracting JSON from markdown
-        lambda t: re.search(r'```(?:json)?\n({.*?})\n```', t, re.DOTALL),
-        # Try finding the outermost JSON object
-        lambda t: {'start': t.find('{'), 'end': t.rfind('}')+1},
-        # Try last valid JSON fragment
-        lambda t: {'start': 0, 'end': find_json_end(t)}
-    ]
-    for attempt in repair_attempts:
-        try:
-            result = attempt(text)
-            if not result:
-                continue
-            if isinstance(result, re.Match):
-                json_str = result.group(1)
-            else:
-                start, end = result['start'], result['end']
-                if start >= 0 and end > start:
-                    json_str = text[start:end]
-                else:
-                    continue
-            data = json.loads(json_str)
-            # Ensure required structure exists
-            if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-                if "invoice_header" not in data:
-                    data["invoice_header"] = {}
-                if "line_items" not in data:
-                    data["line_items"] = []
-            return data
-        except (json.JSONDecodeError, AttributeError, KeyError) as e:
-            continue
-    # Final fallback - manual reconstruction for Llama
-    if model_choice == "Llama 4 Mavericks":
-        try:
-            if '"invoice_header":' in text:
-                header_part = text.split('"line_items":')[0] if '"line_items":' in text else text
-                if not header_part.strip().endswith('}'):
-                    header_part += '}'
-                data = json.loads(header_part + ('"line_items": []}' if '"line_items":' not in text else ''))
-                data["line_items"] = data.get("line_items", [])
-                return data
-        except:
-            pass
-    st.error(f"Failed to parse JSON after multiple attempts for {model_choice}")
-    st.code(f"Original response:\n{original_text}")
-    return None
 def get_extraction_prompt(model_choice, text):
-    """Return the appropriate prompt based on model choice"""
     if model_choice == "DeepSeek v3":
-        return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
-{{
-  "invoice_number": "string",
-  "invoice_date": "YYYY-MM-DD",
-  "po_number": "string or null",
-  "invoice_value": "string with currency symbol",
-  "line_items": [
-    {{
-      "description": "string",
-      "quantity": "number or string",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }}
-  ]
-}}
-Rules:
-1. Return ONLY valid JSON (no additional text or markdown)
-2. Use null for missing fields
-3. Include all line items found in the invoice
-4. For line items, quantity can be number or string, prices should include currency
-5. Do not include any explanations or notes
-Invoice Text:
-""" + text
     elif model_choice == "DeepSeek R1":
-        return f"""Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
-{{
-  "invoice_number": "string or null",
-  "invoice_date": "YYYY-MM-DD or null",
-  "po_number": "string or null",
-  "invoice_value": "string with currency or null",
-  "line_items": [
-    {{
-      "description": "string",
-      "quantity": "number or string",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }}
-  ]
-}}
-Invoice Text:
-""" + text
-    else:  # For Llama 4 and Mistral
-        return f"""Extract complete invoice information and return a VALID JSON object with these fields:
-{{
-  "invoice_header": {{
-    "invoice_number": "string",
-    "invoice_date": "YYYY-MM-DD",
-    "po_number": "string or null",
-    "invoice_value": "string with currency",
-    "supplier_name": "string or null",
-    "customer_name": "string or null"
-  }},
-  "line_items": [
-    {{
-      "item_number": "string or null",
-      "description": "string",
-      "quantity": "number",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }}
-  ]
-}}
-Rules:
-1. Return ONLY valid JSON (no additional text or markdown)
-2. Use null for missing fields
-3. Date format must be YYYY-MM-DD
-4. All currency values must include currency symbol or code
-5. Include all line items found in the invoice
-6. For line items, quantity should be a number, prices as strings with currency
-7. Do not include any explanations or notes
-Invoice Text:
-""" + text
-def format_currency(value):
-    """Helper function to format currency values consistently"""
-    if not value:
-        return "N/A"
-    if isinstance(value, (int, float)):
-        return f"${value:,.2f}"
-    return value
-def display_line_items(line_items, model_choice="DeepSeek v3"):
-    """Display line items in a formatted table"""
-    if not line_items:
-        st.info("No line items found in this invoice.")
-        return
-    st.subheader("📋 Line Items")
-    if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-        # Display as a table for Llama/Mistral
-        items_display = []
-        for idx, item in enumerate(line_items, 1):
-            items_display.append({
-                "#": idx,
-                "Description": item.get("description", "N/A"),
-                "Quantity": item.get("quantity", 0),
-                "Unit Price": item.get("unit_price", "N/A"),
-                "Total Price": item.get("total_price", "N/A")
-            })
-        st.table(items_display)
     else:
-        # Display in columns for DeepSeek models
-        cols = st.columns([4, 2, 2, 2])
-        with st.container():
-            cols[0].write("**Description**")
-            cols[1].write("**Qty**")
-            cols[2].write("**Unit Price**")
-            cols[3].write("**Total**")
-            for item in line_items:
-                cols = st.columns([4, 2, 2, 2])
-                cols[0].write(item.get("description", "N/A"))
-                cols[1].write(item.get("quantity", "N/A"))
-                cols[2].write(format_currency(item.get("unit_price", "N/A")))
-                cols[3].write(format_currency(item.get("total_price", "N/A")))
-            st.divider()
-def display_invoice_data(model_choice, invoice_data):
-    if not invoice_data:
-        return
-    if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-        # Display header information
-        st.subheader("Invoice Summary")
-        header = invoice_data.get("invoice_header", {})
-        col1, col2, col3 = st.columns(3)
-        with col1:
-            st.metric("Invoice Number", header.get("invoice_number", "Not found"))
-            st.metric("Supplier", header.get("supplier_name", "Not found"))
-        with col2:
-            st.metric("Invoice Date", header.get("invoice_date", "Not found"))
-            st.metric("Customer", header.get("customer_name", "Not found"))
-        with col3:
-            st.metric("PO Number", header.get("po_number", "Not found"))
-            st.metric("Total Value", header.get("invoice_value", "Not found"))
-        # Display line items
-        display_line_items(invoice_data.get("line_items", []), model_choice)
-        # Calculate and display subtotal if not provided in header
-        if not header.get("invoice_value"):
-            try:
-                total = sum(float(re.sub(r'[^\d.]', '', item.get("total_price", "0")))
-                          for item in invoice_data.get("line_items", []) if item.get("total_price"))
-                st.metric("Calculated Total", f"${total:,.2f}")
-            except:
-                pass
-    else:
-        # Display for DeepSeek models
-        st.success("Information extracted successfully!")
-        col1, col2 = st.columns(2)
-        with col1:
-            st.metric("Invoice Number", invoice_data.get("invoice_number", "Not found"))
-            st.metric("PO Number", invoice_data.get("po_number", "Not found"))
-        with col2:
-            st.metric("Invoice Date", invoice_data.get("invoice_date", "Not found"))
-            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))
-        # Display line items for both DeepSeek models
-        display_line_items(invoice_data.get("line_items", []), model_choice)
 def extract_invoice_info(model_choice, text):
-    """Extract structured data from pasted text"""
     prompt = get_extraction_prompt(model_choice, text)
     result = query_llm(model_choice, prompt)
     if not result:
         return None
-    parsed_data = clean_json_response(result, model_choice)
-    if not parsed_data:
-        st.error("Failed to parse JSON. Raw response:")
-        st.code(result)
         return None
-    # Normalize data structure based on model
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-        if "invoice_header" not in parsed_data:
-            parsed_data["invoice_header"] = {}
-        if "line_items" not in parsed_data:
-            parsed_data["line_items"] = []
-        # Set default values for header if missing
-        header_fields = ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]
-        for field in header_fields:
-            if field not in parsed_data["invoice_header"]:
-                parsed_data["invoice_header"][field] = None
-        # Validate line items structure
-        for item in parsed_data["line_items"]:
-            item_fields = ["item_number", "description", "quantity", "unit_price", "total_price"]
-            for field in item_fields:
-                if field not in item:
-                    item[field] = None if field != "quantity" else 0
-                    if field == "quantity" and not isinstance(item[field], (int, float)):
-                        try:
-                            item[field] = float(item[field])
-                        except (ValueError, TypeError):
-                            item[field] = 0
-    else:  # DeepSeek models
-        # Ensure all required fields exist
-        for field in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
-            if field not in parsed_data:
-                parsed_data[field] = None
-        # Ensure line_items exists and has proper structure
-        if "line_items" not in parsed_data:
-            parsed_data["line_items"] = []
-        else:
-            for item in parsed_data["line_items"]:
-                item_fields = ["description", "quantity", "unit_price", "total_price"]
-                for field in item_fields:
-                    if field not in item:
-                        item[field] = None if field != "quantity" else 0
-    return parsed_data
-# Create tabs for different functionalities
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
-# PDF Summarizer Tab
 with tab1:
-    st.title("PDF to Bullet Point Summarizer 🗟 🔏")
-    # File uploader for the PDF
-    uploaded_file = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")
-    # Slider for users to select the summarization extent
-    summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20, key="summary_scale")
-    # Submit button
-    submit_button = st.button("Generate Summary", key="summary_button")
-    # Check if the submit button is pressed
-    if submit_button and uploaded_file is not None:
-        with st.spinner('Processing...'):
-            # Read the PDF content
-            text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
-            # Extract key phrases from the text
-            key_phrases = extract_key_phrases(text)
-            # Score sentences based on the key phrases
-            sentence_scores = score_sentences(text, key_phrases)
-            # Determine the number of bullet points based on the selected summarization scale
-            total_sentences = len(list(sentence_scores.keys()))
-            num_points = max(1, total_sentences * summary_scale // 100)
-            # Generate the bullet-point summary
-            summary = summarize_text(sentence_scores, num_points=num_points)
-            # Display the summary as bullet points
-            st.subheader("Here's the summary: ")
-            st.markdown(summary)
-# Invoice Extractor Tab
 with tab2:
     st.title("📋 Invoice Extractor from PDF")
-    st.write("Upload an invoice PDF to extract key details")
-    # Model selection
-    model_choice = st.selectbox(
-        "Select AI Model",
-        list(MODELS.keys()),
-        index=0,
-        help="Choose which AI model to use for extraction",
-        key="model_choice"
-    )
-    # File uploader for the invoice PDF
-    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")
-    if st.button("Extract Invoice Information", key="invoice_button") and invoice_pdf is not None:
-        with st.spinner('Reading PDF...'):
-            # Read the PDF content
-            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))
-        # Process in status container
-        with st.status("Processing...", expanded=True) as status:
-            st.write(f"🤖 Querying {model_choice} API...")
-            invoice_data = extract_invoice_info(model_choice, invoice_text)
-            if invoice_data:
-                status.update(label="✅ Extraction Complete!", state="complete")
-                display_invoice_data(model_choice, invoice_data)
             else:
-                status.update(label="❌ Extraction Failed", state="error")
-                st.error("Failed to extract information. Try simplifying the text.")
-        # Debug information outside the status container
-        if invoice_data and "last_api_response" in st.session_state:
-            with st.expander("Debug Information"):
-                st.write("API Response:")
-                st.json(st.session_state.last_api_response)
-                st.write("Raw API Response:")
-                st.code(st.session_state.get("last_api_response_raw", "No response"))

 import streamlit as st
 import io
 import requests
 import json
 import os
 from datetime import datetime
+from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 # Configure Streamlit
 st.set_page_config(
     page_title="PDF Tools - Summarizer & Invoice Extractor",
 }
 def get_api_key(model_choice):
+    api_key = os.environ.get(MODELS[model_choice]["api_key_env"])
     if not api_key:
+        st.error(f"❌ {MODELS[model_choice]['api_key_env']} environment variable not set!")
         st.stop()
     return api_key
 def query_llm(model_choice, prompt):
     config = MODELS[model_choice]
     headers = {
         "Authorization": f"Bearer {get_api_key(model_choice)}",
         "Content-Type": "application/json",
     }
+    if config.get("extra_headers"):
         headers.update(config["extra_headers"])
     payload = {
         "model": config["model_name"],
         "messages": [{"role": "user", "content": prompt}],
         "temperature": 0.1,
         "max_tokens": 2000,
     }
+    if config.get("response_format"):
         payload["response_format"] = config["response_format"]
     try:
         with st.spinner(f"🔍 Analyzing with {model_choice}..."):
+            resp = requests.post(config["api_url"], headers=headers, json=payload, timeout=90)
+            if resp.status_code != 200:
+                st.error(f"🚨 API Error {resp.status_code}: {resp.text}")
                 return None
+            content = resp.json()["choices"][0]["message"]["content"]
+            st.session_state.last_api_response = content
+            st.session_state.last_api_response_raw = resp.text
+            return content
     except requests.exceptions.RequestException as e:
+        st.error(f"🌐 Connection Failed: {e}")
         return None
+def clean_json_response(text):
+    """Strip code fences and extract a valid JSON segment."""
     if not text:
         return None
+    original = text
+    # Remove any ``` or ```json fences
+    text = re.sub(r'```(?:json)?', '', text)
+    text = text.strip()
+    # Find the JSON object boundaries
+    start = text.find('{')
+    end = text.rfind('}') + 1
+    if start == -1 or end == 0:
+        st.error("Failed to locate JSON in the response.")
+        st.code(original)
+        return None
+    json_str = text[start:end]
+    try:
+        return json.loads(json_str)
+    except json.JSONDecodeError as e:
+        st.error(f"JSON decode error: {e}")
+        st.code(json_str)
+        return None
 def get_extraction_prompt(model_choice, text):
+    # (Prompts abbreviated here for readability—use your existing prompt definitions)
     if model_choice == "DeepSeek v3":
+        return "..."  # your DeepSeek v3 prompt
     elif model_choice == "DeepSeek R1":
+        return "..."  # your DeepSeek R1 prompt
     else:
+        return "..."  # generic Llama/Mistral prompt
 def extract_invoice_info(model_choice, text):
     prompt = get_extraction_prompt(model_choice, text)
     result = query_llm(model_choice, prompt)
     if not result:
         return None
+    data = clean_json_response(result)
+    if not data:
         return None
+    # Normalize structure
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
+        header = data.setdefault("invoice_header", {})
+        for key in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
+            header.setdefault(key, None)
+        items = data.setdefault("line_items", [])
+        for item in items:
+            for key in ["item_number", "description", "quantity", "unit_price", "total_price"]:
+                item.setdefault(key, None)
+    else:
+        for key in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
+            data.setdefault(key, None)
+        items = data.setdefault("line_items", [])
+        for item in items:
+            for key in ["description", "quantity", "unit_price", "total_price"]:
+                item.setdefault(key, None)
+    return data
+# ---- UI Layout ----
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
+    st.title("PDF to Bullet Point Summarizer 🗟")
+    pdf_file = st.file_uploader("Upload PDF", type="pdf")
+    scale = st.slider("Summarization extent (%)", 1, 100, 20)
+    if st.button("Generate Summary") and pdf_file:
+        text = read_pdf(io.BytesIO(pdf_file.getvalue()))
+        phrases = extract_key_phrases(text)
+        scores = score_sentences(text, phrases)
+        count = max(1, len(scores) * scale // 100)
+        summary = summarize_text(scores, num_points=count)
+        st.subheader("Summary:")
+        st.markdown(summary)
 with tab2:
     st.title("📋 Invoice Extractor from PDF")
+    model_choice = st.selectbox("Select AI Model", list(MODELS.keys()))
+    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf")
+    if st.button("Extract Invoice Information") and invoice_pdf:
+        invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))
+        invoice_data = extract_invoice_info(model_choice, invoice_text)
+        if invoice_data:
+            st.success("Extraction Complete!")
+            if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
+                hdr = invoice_data["invoice_header"]
+                c1, c2, c3 = st.columns(3)
+                c1.metric("Invoice #", hdr.get("invoice_number"))
+                c1.metric("Supplier", hdr.get("supplier_name"))
+                c2.metric("Date", hdr.get("invoice_date"))
+                c2.metric("Customer", hdr.get("customer_name"))
+                c3.metric("PO #", hdr.get("po_number"))
+                c3.metric("Total", hdr.get("invoice_value"))
+                st.subheader("Line Items")
+                st.table(invoice_data["line_items"])
             else:
+                c1, c2 = st.columns(2)
+                c1.metric("Invoice #", invoice_data.get("invoice_number"))
+                c1.metric("PO #", invoice_data.get("po_number"))
+                c2.metric("Date", invoice_data.get("invoice_date"))
+                c2.metric("Value", invoice_data.get("invoice_value"))
+                st.subheader("Line Items")
+                st.table(invoice_data["line_items"])
+    if "last_api_response" in st.session_state:
+        with st.expander("Debug Information"):
+            st.write("Extracted content (raw string):")
+            st.code(st.session_state.last_api_response)
+            st.write("Full HTTP response text:")
+            st.code(st.session_state.get("last_api_response_raw", "No response"))