PDF_Upload_Vision

Sleeping

App Files Community

Seth0330 committed on May 21, 2025

Commit

835898b

verified ·

1 Parent(s): 7e5da41

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -21

app.py CHANGED Viewed

@@ -10,11 +10,11 @@ from main import read_pdf, extract_key_phrases, score_sentences, summarize_text
 # Configure Streamlit
 st.set_page_config(
-    page_title="PDF Tools - Summarizer & Invoice Extractor",
     layout="wide",
 )
-# Model Configuration for Invoice Extractor
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
@@ -93,9 +93,9 @@ def clean_json_response(text):
     if not text:
         return None
     original = text
-    # remove any ``` fences
     text = re.sub(r'```(?:json)?', '', text).strip()
-    # find outer braces
     start = text.find('{')
     end = text.rfind('}') + 1
     if start < 0 or end < 1:
@@ -111,7 +111,6 @@ def clean_json_response(text):
         return None
 def get_extraction_prompt(model_choice, text):
-    # NOTE: every prompt below includes the word "json" in lowercase
     if model_choice == "DeepSeek v3":
         return (
             "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
@@ -121,12 +120,13 @@ def get_extraction_prompt(model_choice, text):
             '  "po_number": "string or null",\n'
             '  "invoice_value": "string with currency symbol",\n'
             '  "line_items": [\n'
-            "    {...}\n"
             "  ]\n"
             "}\n"
             "Rules:\n"
             "1. Use null for missing fields\n"
-            "2. Do not include any additional text\n\n"
             "Invoice Text:\n"
             + text
         )
@@ -134,23 +134,43 @@ def get_extraction_prompt(model_choice, text):
     elif model_choice == "DeepSeek R1":
         return (
             "Please extract invoice info from the text below and return only raw json:\n"
-            "{...}\n"
             "Invoice Text:\n"
             + text
         )
-    else:  # Llama / Mistral
         return (
-            "Extract complete invoice information and return a valid json object with these fields:\n"
             "{\n"
-            '  "invoice_header": {...},\n'
-            '  "line_items": [...]\n'
             "}\n"
             "Rules:\n"
-            "1. Return ONLY json\n"
-            "2. Date format YYYY-MM-DD\n"
-            "3. Currency values with symbol\n"
-            "4. Do not include any explanations\n\n"
             "Invoice Text:\n"
             + text
         )
@@ -169,7 +189,7 @@ def extract_invoice_info(model_choice, text):
     if not data:
         return None
-    # normalize
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
         hdr = data.setdefault("invoice_header", {})
         for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
@@ -192,7 +212,7 @@ def extract_invoice_info(model_choice, text):
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
-    st.title("PDF to Bullet Point Summarizer")
     pdf = st.file_uploader("Upload PDF", type="pdf")
     pct = st.slider("Summarization (%)", 1, 100, 20)
     if st.button("Summarize") and pdf:
@@ -200,9 +220,9 @@ with tab1:
         keys = extract_key_phrases(txt)
         scores = score_sentences(txt, keys)
         n = max(1, len(scores) * pct // 100)
-        bullet = summarize_text(scores, num_points=n)
         st.subheader("Summary")
-        st.markdown(bullet)
 with tab2:
     st.title("Invoice Extractor")
@@ -212,7 +232,7 @@ with tab2:
         txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
         info = extract_invoice_info(mdl, txt)
         if info:
-            st.success("Done")
             if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
                 h = info["invoice_header"]
                 c1, c2, c3 = st.columns(3)

 # Configure Streamlit
 st.set_page_config(
+    page_title="PDF Tools – Summarizer & Invoice Extractor",
     layout="wide",
 )
+# Model configurations
 MODELS = {
     "DeepSeek v3": {
         "api_url": "https://api.deepseek.com/v1/chat/completions",
     if not text:
         return None
     original = text
+    # strip any ``` fences
     text = re.sub(r'```(?:json)?', '', text).strip()
+    # locate outermost JSON braces
     start = text.find('{')
     end = text.rfind('}') + 1
     if start < 0 or end < 1:
         return None
 def get_extraction_prompt(model_choice, text):
     if model_choice == "DeepSeek v3":
         return (
             "Extract complete invoice information and return ONLY a valid json object with these fields:\n"
             '  "po_number": "string or null",\n'
             '  "invoice_value": "string with currency symbol",\n'
             '  "line_items": [\n'
+            "    { \"description\": \"string\", \"quantity\": \"number or string\", "
+            "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }\n"
             "  ]\n"
             "}\n"
             "Rules:\n"
             "1. Use null for missing fields\n"
+            "2. Do not include any extra text\n\n"
             "Invoice Text:\n"
             + text
         )
     elif model_choice == "DeepSeek R1":
         return (
             "Please extract invoice info from the text below and return only raw json:\n"
+            "{ \"invoice_number\": \"string or null\", \"invoice_date\": \"YYYY-MM-DD or null\", "
+            "\"po_number\": \"string or null\", \"invoice_value\": \"string with currency or null\", "
+            "\"line_items\": [{ \"description\": \"string\", \"quantity\": \"number or string\", "
+            "\"unit_price\": \"string with currency\", \"total_price\": \"string with currency\" }] }\n"
             "Invoice Text:\n"
             + text
         )
+    else:  # Llama & Mistral
         return (
+            "You are given the text of an invoice. Extract the invoice information and return ONLY a valid json object "
+            "formatted exactly as below (nothing else):\n"
             "{\n"
+            '  "invoice_header": {\n'
+            '    "invoice_number": "string",\n'
+            '    "invoice_date": "YYYY-MM-DD",\n'
+            '    "po_number": "string or null",\n'
+            '    "invoice_value": "string with currency symbol",\n'
+            '    "supplier_name": "string or null",\n'
+            '    "customer_name": "string or null"\n'
+            '  },\n'
+            '  "line_items": [\n'
+            '    {\n'
+            '      "item_number": "string or null",\n'
+            '      "description": "string",\n'
+            '      "quantity": number,\n'
+            '      "unit_price": "string with currency symbol",\n'
+            '      "total_price": "string with currency symbol"\n'
+            '    }\n'
+            '  ]\n'
             "}\n"
             "Rules:\n"
+            "1. Date: YYYY-MM-DD\n"
+            "2. Use null for missing values\n"
+            "3. Currency values must include a symbol or code\n"
+            "4. No extra keys or explanatory text\n"
+            "5. Output must start with '{' and end with '}'\n\n"
             "Invoice Text:\n"
             + text
         )
     if not data:
         return None
+    # normalize fields
     if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
         hdr = data.setdefault("invoice_header", {})
         for k in ["invoice_number", "invoice_date", "po_number", "invoice_value", "supplier_name", "customer_name"]:
 tab1, tab2 = st.tabs(["PDF Summarizer", "Invoice Extractor"])
 with tab1:
+    st.title("PDF to Bullet-Point Summarizer")
     pdf = st.file_uploader("Upload PDF", type="pdf")
     pct = st.slider("Summarization (%)", 1, 100, 20)
     if st.button("Summarize") and pdf:
         keys = extract_key_phrases(txt)
         scores = score_sentences(txt, keys)
         n = max(1, len(scores) * pct // 100)
+        summary = summarize_text(scores, num_points=n)
         st.subheader("Summary")
+        st.markdown(summary)
 with tab2:
     st.title("Invoice Extractor")
         txt = read_pdf(io.BytesIO(inv_pdf.getvalue()))
         info = extract_invoice_info(mdl, txt)
         if info:
+            st.success("Extraction Complete")
             if mdl in ["Llama 4 Mavericks", "Mistral Small"]:
                 h = info["invoice_header"]
                 c1, c2, c3 = st.columns(3)