AP_AGENT

Sleeping

App Files Files Community

Seth0330 committed on May 20, 2025

Commit

cdae312

verified ·

1 Parent(s): 0c39d40

Update app.py

Browse files

Files changed (1) hide show

app.py +428 -25

app.py CHANGED Viewed

@@ -1,38 +1,441 @@
 import streamlit as st
 from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 import io
-# Initialize your Streamlit app
-st.title("PDF to Bullet Point Summarizer 🗟 🔏")
-# File uploader for the PDF
-uploaded_file = st.file_uploader("Upload your PDF document", type="pdf")
-# Slider for users to select the summarization extent
-summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20)
-# Submit button
-submit_button = st.button("Generate Summary")
-# Check if the submit button is pressed
-if submit_button and uploaded_file is not None:
-    with st.spinner('Processing...'):
-        # Read the PDF content
-        text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
-        # Extract key phrases from the text
-        key_phrases = extract_key_phrases(text)
-        # Score sentences based on the key phrases
-        sentence_scores = score_sentences(text, key_phrases)
-        # Determine the number of bullet points based on the selected summarization scale
-        total_sentences = len(list(sentence_scores.keys()))
-        num_points = max(1, total_sentences * summary_scale // 100)
-        # Generate the bullet-point summary
-        summary = summarize_text(sentence_scores, num_points=num_points)
-        # Display the summary as bullet points
-        st.subheader("Here's the summary: ")
-        st.markdown(summary)

 import streamlit as st
 from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 import io
+import requests
+import json
+import re
+import os
+from datetime import datetime
+# Page-level Streamlit settings: browser title and full-width layout
+st.set_page_config(layout="wide", page_title="PDF Tools - Summarizer & Invoice Extractor")
+# Registry of the LLM backends offered by the Invoice Extractor.
+# Per entry: chat-completions endpoint (`api_url`), provider model id
+# (`model_name`), name of the environment variable holding the API key
+# (`api_key_env`), optional `response_format` request field (None = omit),
+# and optional `extra_headers` merged into the HTTP request headers.
+_DEEPSEEK_ENDPOINT = "https://api.deepseek.com/v1/chat/completions"
+_OPENROUTER_ENDPOINT = "https://openrouter.ai/api/v1/chat/completions"
+MODELS = {
+    "DeepSeek v3": dict(
+        api_url=_DEEPSEEK_ENDPOINT,
+        model_name="deepseek-chat",
+        api_key_env="DEEPSEEK_API_KEY",
+        response_format={"type": "json_object"},
+    ),
+    "DeepSeek R1": dict(
+        api_url=_DEEPSEEK_ENDPOINT,
+        model_name="deepseek-reasoner",
+        api_key_env="DEEPSEEK_API_KEY",
+        response_format=None,
+    ),
+    "Llama 4 Mavericks": dict(
+        api_url=_OPENROUTER_ENDPOINT,
+        model_name="meta-llama/llama-4-maverick:free",
+        api_key_env="OPENROUTER_API_KEY",
+        response_format={"type": "json_object"},
+        extra_headers={
+            "HTTP-Referer": "https://huggingface.co",
+            "X-Title": "Invoice Extractor",
+        },
+    ),
+}
+# Create tabs for different functionalities
+tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
+# PDF Summarizer Tab
+with tab1:
+    st.title("PDF to Bullet Point Summarizer 🗟 🔏")
+    # File uploader for the PDF
+    uploaded_file = st.file_uploader("Upload your PDF document", type="pdf", key="pdf_uploader")
+    # Slider for users to select the summarization extent (percentage of
+    # scored sentences that end up as bullet points)
+    summary_scale = st.slider("Select the extent of summarization (%)", min_value=1, max_value=100, value=20, key="summary_scale")
+    # Submit button
+    submit_button = st.button("Generate Summary", key="summary_button")
+    if submit_button and uploaded_file is None:
+        # A click without an upload used to be ignored silently
+        st.warning("Please upload a PDF document first.")
+    elif submit_button:
+        with st.spinner('Processing...'):
+            # Read the PDF content
+            text = read_pdf(io.BytesIO(uploaded_file.getvalue()))
+            # Extract key phrases from the text
+            key_phrases = extract_key_phrases(text)
+            # Score sentences based on the key phrases
+            sentence_scores = score_sentences(text, key_phrases)
+            # Number of bullet points = selected percentage of the scored
+            # sentences, rounded down, but never fewer than one
+            total_sentences = len(sentence_scores)
+            num_points = max(1, total_sentences * summary_scale // 100)
+            # Generate the bullet-point summary
+            summary = summarize_text(sentence_scores, num_points=num_points)
+            # Display the summary as bullet points
+            st.subheader("Here's the summary: ")
+            st.markdown(summary)
+# Invoice Extractor Functions
+# The helpers are defined BEFORE the "Invoice Extractor" tab code that calls
+# them: the script body runs top to bottom, so a call placed above these
+# definitions fails with NameError.
+def get_api_key(model_choice):
+    """Return the API key configured for ``model_choice``.
+    Args:
+        model_choice: A key of ``MODELS``.
+    Returns:
+        The value of the environment variable named by the model's
+        ``api_key_env`` entry. When it is unset or empty an error is shown
+        and ``st.stop()`` is called.
+    """
+    api_key_env = MODELS[model_choice]["api_key_env"]
+    api_key = os.environ.get(api_key_env)
+    if not api_key:
+        st.error(f"❌ `{api_key_env}` environment variable not set!")
+        st.stop()
+    return api_key
+def _loads_dict(candidate):
+    """Parse ``candidate`` as JSON; return it only if it is an object (dict)."""
+    try:
+        parsed = json.loads(candidate)
+    except (json.JSONDecodeError, TypeError):
+        return None
+    return parsed if isinstance(parsed, dict) else None
+def clean_json_response(text):
+    """Extract a JSON object from a model reply.
+    Strategies, tried in order:
+      1. the whole text is JSON;
+      2. a JSON object inside a Markdown code fence;
+      3. the span from the first ``{`` to the last ``}``;
+      4. line-based ``key: value`` parsing as a last resort.
+    Args:
+        text: Raw model output.
+    Returns:
+        A dict, or None when nothing usable is found. JSON that is not an
+        object (list, string, number) is rejected because callers look the
+        result up by field name.
+    """
+    if not isinstance(text, str) or not text.strip():
+        return None
+    candidates = [text]
+    # Tolerate any whitespace around the fenced payload and an upper-case tag
+    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL | re.IGNORECASE)
+    if fenced:
+        candidates.append(fenced.group(1))
+    braced = re.search(r'\{.*\}', text, re.DOTALL)
+    if braced:
+        candidates.append(braced.group(0))
+    for candidate in candidates:
+        parsed = _loads_dict(candidate)
+        if parsed is not None:
+            return parsed
+    # Fallback to simple key-value parsing
+    data = {}
+    for line in text.split('\n'):
+        key, sep, val = line.partition(':')
+        if not sep:
+            continue
+        key = key.strip().strip('"').lower().replace(' ', '_')
+        if key:
+            # Drop the trailing comma that JSON-like lines carry
+            data[key] = val.strip().rstrip(',').strip().strip('"')
+    return data if data else None
+def query_llm(model_choice, prompt):
+    """Send ``prompt`` to the chat-completions endpoint of ``model_choice``.
+    Args:
+        model_choice: A key of ``MODELS``.
+        prompt: Text sent as the single user message.
+    Returns:
+        The content of the first choice's message, or None after an error
+        has been reported with ``st.error`` (connection failure, non-200
+        status, or a body that lacks the expected structure).
+    Side effects:
+        On success stores the content and the raw body in
+        ``st.session_state.last_api_response`` / ``last_api_response_raw``.
+    """
+    config = MODELS[model_choice]
+    headers = {
+        "Authorization": f"Bearer {get_api_key(model_choice)}",
+        "Content-Type": "application/json",
+    }
+    # Add extra headers if they exist (for OpenRouter)
+    headers.update(config.get("extra_headers", {}))
+    payload = {
+        "model": config["model_name"],
+        "messages": [{"role": "user", "content": prompt}],
+        "temperature": 0.1,
+        "max_tokens": 2000,
+    }
+    # Add response format if specified
+    if config.get("response_format"):
+        payload["response_format"] = config["response_format"]
+    try:
+        with st.spinner(f"🔍 Analyzing with {model_choice}..."):
+            response = requests.post(config["api_url"], headers=headers, json=payload, timeout=60)
+    except requests.exceptions.RequestException as e:
+        st.error(f"🌐 Connection Failed: {str(e)}")
+        return None
+    if response.status_code != 200:
+        st.error(f"🚨 API Error {response.status_code}: {response.text}")
+        return None
+    try:
+        content = response.json()["choices"][0]["message"]["content"]
+    except (KeyError, IndexError, TypeError, ValueError) as e:
+        # ValueError: body is not JSON; the others: JSON of unexpected shape
+        # (e.g. an empty "choices" list). Report the raw text instead of
+        # re-parsing the body, which could raise again.
+        st.error(f"Unexpected API response ({type(e).__name__}: {e})\nFull response: {response.text}")
+        return None
+    st.session_state.last_api_response = content
+    st.session_state.last_api_response_raw = response.text
+    return content
+def get_extraction_prompt(model_choice, text):
+    """Build the extraction prompt for ``model_choice`` with ``text`` appended.
+    "DeepSeek v3" and "DeepSeek R1" ask for a flat object (invoice fields
+    plus ``line_items``); any other model gets the Llama prompt, which nests
+    the invoice fields under ``invoice_header``.
+    Args:
+        model_choice: Display name of the selected model.
+        text: Invoice text to append after the instructions.
+    Returns:
+        The complete prompt string.
+    """
+    if model_choice == "DeepSeek v3":
+        return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
+{{
+  "invoice_number": "string",
+  "invoice_date": "YYYY-MM-DD",
+  "po_number": "string or null",
+  "invoice_value": "string with currency symbol",
+  "line_items": [
+    {{
+      "description": "string",
+      "quantity": "number or string",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }}
+  ]
+}}
+Rules:
+1. Return ONLY valid JSON (no additional text or markdown)
+2. Use null for missing fields
+3. Include all line items found in the invoice
+4. For line items, quantity can be number or string, prices should include currency
+5. Do not include any explanations or notes
+Invoice Text:
+""" + text
+    elif model_choice == "DeepSeek R1":
+        return f"""Please extract the following information from the invoice text below and return ONLY the raw JSON without any markdown formatting or additional text:
+{{
+  "invoice_number": "string or null",
+  "invoice_date": "YYYY-MM-DD or null",
+  "po_number": "string or null",
+  "invoice_value": "string with currency or null",
+  "line_items": [
+    {{
+      "description": "string",
+      "quantity": "number or string",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }}
+  ]
+}}
+Invoice Text:
+""" + text
+    else:  # Llama 4 Mavericks
+        return f"""Extract complete invoice information and return a VALID JSON object with these fields:
+{{
+  "invoice_header": {{
+    "invoice_number": "string",
+    "invoice_date": "YYYY-MM-DD",
+    "po_number": "string or null",
+    "invoice_value": "string with currency",
+    "supplier_name": "string or null",
+    "customer_name": "string or null"
+  }},
+  "line_items": [
+    {{
+      "item_number": "string or null",
+      "description": "string",
+      "quantity": "number",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }}
+  ]
+}}
+Rules:
+1. Return ONLY valid JSON (no additional text or markdown)
+2. Use null for missing fields
+3. Date format must be YYYY-MM-DD
+4. All currency values must include currency symbol or code
+5. Include all line items found in the invoice
+6. For line items, quantity should be a number, prices as strings with currency
+7. Do not include any explanations or notes
+Invoice Text:
+""" + text
+def format_currency(value):
+    """Format a price for display.
+    Numbers become ``$1,234.50``-style strings, other non-empty values are
+    returned unchanged, and missing values become ``"N/A"``.
+    """
+    # Test for numbers first so that a price of 0 is shown as "$0.00"
+    # instead of being mistaken for a missing value.
+    if isinstance(value, (int, float)):
+        return f"${value:,.2f}"
+    return value if value else "N/A"
+def _display_value(value):
+    """Return ``value`` for a metric, or "Not found" when it is None or empty."""
+    return "Not found" if value is None or value == "" else value
+def _parse_amount(value):
+    """Return the numeric part of a price (``"$1,234.50"`` -> 1234.5) or None."""
+    if isinstance(value, (int, float)):
+        return float(value)
+    try:
+        return float(re.sub(r'[^\d.]', '', str(value)))
+    except ValueError:
+        return None
+def display_line_items(line_items, model_choice="DeepSeek v3"):
+    """Render invoice line items.
+    Args:
+        line_items: List of dicts with ``description``, ``quantity``,
+            ``unit_price`` and ``total_price``; entries that are not dicts
+            are ignored.
+        model_choice: "Llama 4 Mavericks" renders one table; any other
+            value renders a four-column grid.
+    """
+    rows = [item for item in line_items if isinstance(item, dict)] if isinstance(line_items, list) else []
+    if not rows:
+        st.info("No line items found in this invoice.")
+        return
+    st.subheader("📋 Line Items")
+    if model_choice == "Llama 4 Mavericks":
+        # Display as a table for Llama
+        st.table([
+            {
+                "#": idx,
+                "Description": item.get("description") or "N/A",
+                "Quantity": item.get("quantity", 0),
+                "Unit Price": format_currency(item.get("unit_price")),
+                "Total Price": format_currency(item.get("total_price")),
+            }
+            for idx, item in enumerate(rows, 1)
+        ])
+        return
+    # Display in columns for DeepSeek models
+    widths = [4, 2, 2, 2]
+    with st.container():
+        labels = ("**Description**", "**Qty**", "**Unit Price**", "**Total**")
+        for col, label in zip(st.columns(widths), labels):
+            col.write(label)
+        for item in rows:
+            quantity = item.get("quantity")
+            cols = st.columns(widths)
+            cols[0].write(item.get("description") or "N/A")
+            cols[1].write("N/A" if quantity is None else quantity)
+            cols[2].write(format_currency(item.get("unit_price")))
+            cols[3].write(format_currency(item.get("total_price")))
+        st.divider()
+def display_invoice_data(model_choice, invoice_data):
+    """Render extracted invoice data as metrics followed by the line items.
+    Args:
+        model_choice: "Llama 4 Mavericks" reads the fields from
+            ``invoice_data["invoice_header"]``; other models read them from
+            the top level of ``invoice_data``.
+        invoice_data: Dict as returned by ``extract_invoice_info``; nothing
+            is rendered when it is empty or None.
+    """
+    if not invoice_data:
+        return
+    line_items = invoice_data.get("line_items") or []
+    if model_choice == "Llama 4 Mavericks":
+        # Display header information
+        st.subheader("Invoice Summary")
+        header = invoice_data.get("invoice_header")
+        if not isinstance(header, dict):
+            header = {}
+        # Missing fields are stored as explicit None, so a .get() default
+        # would never apply; _display_value substitutes "Not found" instead.
+        col1, col2, col3 = st.columns(3)
+        with col1:
+            st.metric("Invoice Number", _display_value(header.get("invoice_number")))
+            st.metric("Supplier", _display_value(header.get("supplier_name")))
+        with col2:
+            st.metric("Invoice Date", _display_value(header.get("invoice_date")))
+            st.metric("Customer", _display_value(header.get("customer_name")))
+        with col3:
+            st.metric("PO Number", _display_value(header.get("po_number")))
+            st.metric("Total Value", _display_value(header.get("invoice_value")))
+        # Display line items
+        display_line_items(line_items, model_choice)
+        # Calculate and display subtotal if not provided in header
+        if not header.get("invoice_value"):
+            amounts = [
+                _parse_amount(item.get("total_price"))
+                for item in line_items
+                if isinstance(item, dict) and item.get("total_price")
+            ]
+            # Show a total only when every priced item parsed; a partial
+            # sum would be misleading.
+            if None not in amounts:
+                st.metric("Calculated Total", f"${sum(amounts):,.2f}")
+    else:
+        # Display for DeepSeek models
+        st.success("Information extracted successfully!")
+        col1, col2 = st.columns(2)
+        with col1:
+            st.metric("Invoice Number", _display_value(invoice_data.get("invoice_number")))
+            st.metric("PO Number", _display_value(invoice_data.get("po_number")))
+        with col2:
+            st.metric("Invoice Date", _display_value(invoice_data.get("invoice_date")))
+            st.metric("Invoice Value", format_currency(invoice_data.get("invoice_value")))
+        # Display line items for both DeepSeek models
+        display_line_items(line_items, model_choice)
+def _coerce_quantity(value):
+    """Return ``value`` as a number; unparseable values become 0."""
+    if isinstance(value, (int, float)):
+        return value
+    try:
+        return float(str(value).replace(",", ""))
+    except ValueError:
+        return 0
+def _normalize_line_items(raw_items, fields, numeric_quantity=False):
+    """Return the dict entries of ``raw_items`` with every field present.
+    Missing fields default to None (``quantity`` to 0). With
+    ``numeric_quantity`` the quantity is additionally coerced to a number.
+    Anything that is not a list yields an empty list.
+    """
+    if not isinstance(raw_items, list):
+        return []
+    items = [item for item in raw_items if isinstance(item, dict)]
+    for item in items:
+        for field in fields:
+            if field not in item:
+                item[field] = 0 if field == "quantity" else None
+        if numeric_quantity and "quantity" in item:
+            item["quantity"] = _coerce_quantity(item["quantity"])
+    return items
+def extract_invoice_info(model_choice, text):
+    """Extract structured invoice data from ``text`` using ``model_choice``.
+    Builds the prompt, queries the model, parses the reply and fills in
+    missing fields so the display code can rely on their presence.
+    Args:
+        model_choice: A key of ``MODELS``.
+        text: Invoice text (the caller passes the text read from the PDF).
+    Returns:
+        The normalized dict, or None when the query or the parsing failed
+        (the failure is reported in the UI).
+    """
+    prompt = get_extraction_prompt(model_choice, text)
+    result = query_llm(model_choice, prompt)
+    if not result:
+        return None
+    parsed_data = clean_json_response(result)
+    if not parsed_data or not isinstance(parsed_data, dict):
+        st.error("Failed to parse JSON. Raw response:")
+        st.code(result)
+        return None
+    # Normalize data structure based on model
+    if model_choice == "Llama 4 Mavericks":
+        header = parsed_data.get("invoice_header")
+        if not isinstance(header, dict):
+            # Covers a missing header as well as an explicit null
+            header = {}
+        # Set default values for header if missing
+        for field in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
+            header.setdefault(field, None)
+        parsed_data["invoice_header"] = header
+        # Validate line items structure; quantities are coerced to numbers
+        # for items that already carry one, not only for the defaulted ones
+        parsed_data["line_items"] = _normalize_line_items(
+            parsed_data.get("line_items"),
+            ["item_number", "description", "quantity", "unit_price", "total_price"],
+            numeric_quantity=True,
+        )
+    else:  # DeepSeek models
+        # Ensure all required fields exist
+        for field in ["invoice_number", "invoice_date", "po_number", "invoice_value"]:
+            parsed_data.setdefault(field, None)
+        # Ensure line_items exists and has proper structure
+        parsed_data["line_items"] = _normalize_line_items(
+            parsed_data.get("line_items"),
+            ["description", "quantity", "unit_price", "total_price"],
+        )
+    return parsed_data
+# Invoice Extractor Tab
+with tab2:
+    st.title("📋 Invoice Extractor from PDF")
+    st.write("Upload an invoice PDF to extract key details")
+    # Model selection
+    model_choice = st.selectbox(
+        "Select AI Model",
+        list(MODELS.keys()),
+        index=0,
+        help="Choose which AI model to use for extraction",
+        key="model_choice"
+    )
+    # File uploader for the invoice PDF
+    invoice_pdf = st.file_uploader("Upload Invoice PDF", type="pdf", key="invoice_pdf_uploader")
+    extract_clicked = st.button("Extract Invoice Information", key="invoice_button")
+    if extract_clicked and invoice_pdf is None:
+        # A click without an upload used to be ignored silently
+        st.warning("Please upload an invoice PDF first.")
+    elif extract_clicked:
+        with st.spinner('Reading PDF...'):
+            # Read the PDF content
+            invoice_text = read_pdf(io.BytesIO(invoice_pdf.getvalue()))
+        # Process in status container
+        with st.status("Processing...", expanded=True) as status:
+            st.write(f"🤖 Querying {model_choice} API...")
+            invoice_data = extract_invoice_info(model_choice, invoice_text)
+            if invoice_data:
+                status.update(label="✅ Extraction Complete!", state="complete")
+                display_invoice_data(model_choice, invoice_data)
+            else:
+                status.update(label="❌ Extraction Failed", state="error")
+                st.error("Failed to extract information. Try simplifying the text.")
+        # Debug information outside the status container
+        if invoice_data and "last_api_response" in st.session_state:
+            with st.expander("Debug Information"):
+                st.write("API Response:")
+                st.json(st.session_state.last_api_response)
+                st.write("Raw API Response:")
+                st.code(st.session_state.get("last_api_response_raw", "No response"))