PDF_Upload_Vision

Sleeping

App Files Files Community

Seth0330 committed on May 21, 2025

Commit

dd68ed4

verified ·

1 Parent(s): e31081a

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -76

app.py CHANGED Viewed

@@ -100,89 +100,40 @@ def query_llm(model_choice, prompt):
         st.error(f"🌐 Connection Failed: {str(e)}")
         return None
-def clean_json_response(text):
-    """Improved JSON extraction with comprehensive error handling"""
     if not text:
         return None
-    # First attempt to parse directly
-    try:
-        data = json.loads(text)
-        return data
-    except json.JSONDecodeError:
-        pass
-    # Try to extract JSON from potential markdown
-    json_match = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
-    if json_match:
-        try:
-            return json.loads(json_match.group(1))
-        except json.JSONDecodeError:
-            pass
-    # Try to find any JSON-like structure
-    try:
-        start_idx = text.find('{')
-        end_idx = text.rfind('}') + 1
-        if start_idx != -1 and end_idx != 0:
-            return json.loads(text[start_idx:end_idx])
-    except:
-        pass
-    # Final fallback - manual reconstruction
     try:
-        if '"invoice_header":' in text and '"line_items":' in text:
-            header_part = text.split('"line_items":')[0]
-            line_items_part = text.split('"line_items":')[1]
-            # Ensure proper closing of JSON
-            if not header_part.strip().endswith('{'):
-                header_part += '{'
-            if not line_items_part.strip().endswith('}}'):
-                line_items_part = line_items_part.split('}')[0] + ']}}'
-            reconstructed = header_part + '"line_items":' + line_items_part
-            return json.loads(reconstructed)
-    except Exception as e:
-        st.warning(f"Could not fully reconstruct JSON: {str(e)}")
         return None
-    return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
-    base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
-{
-  "invoice_header": {
-    "invoice_number": "string",
-    "invoice_date": "YYYY-MM-DD",
-    "po_number": "string or null",
-    "invoice_value": "string with currency",
-    "supplier_name": "string or null",
-    "customer_name": "string or null"
-  },
-  "line_items": [
-    {
-      "item_number": "string or null",
-      "description": "string",
-      "quantity": "number",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }
-  ]
-}
-Rules:
-1. Return ONLY valid JSON (no additional text or markdown)
-2. Use null for missing fields
-3. Date format must be YYYY-MM-DD
-4. All currency values must include currency symbol or code
-5. Include all line items found in the invoice
-6. For line items, quantity should be a number, prices as strings with currency
-7. Do not include any explanations or notes
-Invoice Text:
-""" + text
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
@@ -228,7 +179,36 @@ Invoice Text:
 """ + text
     else:  # For Llama 4 and Mistral
-        return base_prompt
 def format_currency(value):
     """Helper function to format currency values consistently"""
@@ -241,7 +221,7 @@ def format_currency(value):
 def display_line_items(line_items, model_choice="DeepSeek v3"):
     """Display line items in a formatted table"""
     if not line_items:
-        st.info("No line items found in this invoice. This may be due to incomplete data from the API.")
         return
     st.subheader("📋 Line Items")
@@ -331,7 +311,7 @@ def extract_invoice_info(model_choice, text):
     if not result:
         return None
-    parsed_data = clean_json_response(result)
     if not parsed_data:
         st.error("Failed to parse JSON. Raw response:")
         st.code(result)

         st.error(f"🌐 Connection Failed: {str(e)}")
         return None
def clean_json_response(text, model_choice):
    """Parse an LLM reply into JSON, repairing common formatting problems.

    The reply is parsed as-is first; repairs are attempted only when that
    fails, so a well-formed reply is never altered. Repairs, in order:

    1. the contents of a Markdown code fence (three backticks, optional
       ``json`` tag),
    2. the span from the first ``{`` to the last ``}`` (strips chatter
       around the object),
    3. for "Llama 4 Mavericks" only: a reply cut off inside ``line_items``
       is closed with an empty list. The partial line items are discarded.

    Args:
        text: Raw model output; may be empty or None.
        model_choice: Display name of the model that produced ``text``.

    Returns:
        The decoded JSON value, or None when ``text`` is empty or no
        candidate parses. For "Llama 4 Mavericks" and "Mistral Small" a
        decoded dict is guaranteed to have ``invoice_header`` and
        ``line_items`` keys.
    """
    if not text:
        return None

    # Candidate payloads, ordered from least to most invasive.
    candidates = [text]

    # \s* (rather than a literal \n) tolerates CRLF and padded fences.
    fenced = re.search(r'```(?:json)?\s*({.*?})\s*```', text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))

    first_brace = text.find('{')
    last_brace = text.rfind('}')
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    if model_choice == "Llama 4 Mavericks" and first_brace != -1:
        head, marker, _ = text[first_brace:].partition('"line_items":')
        if marker:
            # Exactly one closing brace: only the top-level object is open
            # at the point where the "line_items" key starts.
            candidates.append(head + '"line_items": []}')

    first_error = None
    for candidate in candidates:
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError as e:
            # Report the error for the untouched reply, not for a repair.
            if first_error is None:
                first_error = e
            continue
        # Only dicts can take default keys; other JSON values pass through.
        if isinstance(data, dict) and model_choice in (
            "Llama 4 Mavericks",
            "Mistral Small",
        ):
            data.setdefault("invoice_header", {})
            data.setdefault("line_items", [])
        return data

    st.warning(f"JSON parsing failed: {str(first_error)}")
    return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
 """ + text
     else:  # For Llama 4 and Mistral
+        return f"""Extract complete invoice information and return a VALID JSON object with these fields:
+{{
+  "invoice_header": {{
+    "invoice_number": "string",
+    "invoice_date": "YYYY-MM-DD",
+    "po_number": "string or null",
+    "invoice_value": "string with currency",
+    "supplier_name": "string or null",
+    "customer_name": "string or null"
+  }},
+  "line_items": [
+    {{
+      "item_number": "string or null",
+      "description": "string",
+      "quantity": "number",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }}
+  ]
+}}
+Rules:
+1. Return ONLY valid JSON (no additional text or markdown)
+2. Use null for missing fields
+3. Date format must be YYYY-MM-DD
+4. All currency values must include currency symbol or code
+5. Include all line items found in the invoice
+6. For line items, quantity should be a number, prices as strings with currency
+7. Do not include any explanations or notes
+Invoice Text:
+""" + text
 def format_currency(value):
     """Helper function to format currency values consistently"""
 def display_line_items(line_items, model_choice="DeepSeek v3"):
     """Display line items in a formatted table"""
     if not line_items:
+        st.info("No line items found in this invoice.")
         return
     st.subheader("📋 Line Items")
     if not result:
         return None
+    parsed_data = clean_json_response(result, model_choice)
     if not parsed_data:
         st.error("Failed to parse JSON. Raw response:")
         st.code(result)