PDF_Upload_Vision

Sleeping

App Files Community

Seth0330 committed on May 21, 2025

Commit

88f23c6

verified ·

1 Parent(s): 73e2df7

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -47

app.py CHANGED Viewed

@@ -107,17 +107,32 @@ def clean_json_response(text, model_choice):
     # Handle Mistral's markdown response
     if model_choice == "Mistral Small":
-        # Remove all markdown formatting
-        text = text.replace('```json', '').replace('```', '').strip()
-    # Handle Llama's truncated response
     if model_choice == "Llama 4 Mavericks":
-        if '"line_items":' in text and not text.strip().endswith('}}'):
-            # Check if we have at least a complete header
-            if '"invoice_header":' in text:
-                # Return with empty line items
-                text = text.split('"line_items":')[0] + '"line_items": []}'
     # Try parsing the cleaned JSON
     try:
         data = json.loads(text)
@@ -131,21 +146,68 @@ def clean_json_response(text, model_choice):
         return data
     except json.JSONDecodeError as e:
-        # Try one more time with strict=False for Llama
-        if model_choice == "Llama 4 Mavericks":
-            try:
                 # Find the last complete JSON object
                 end_pos = text.rfind('}')
                 if end_pos != -1:
-                    return json.loads(text[:end_pos+1])
-            except:
-                pass
-        st.warning(f"JSON parsing failed: {str(e)}")
         return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
@@ -191,36 +253,7 @@ Invoice Text:
 """ + text
     else:  # For Llama 4 and Mistral
-        return f"""Extract complete invoice information and return a VALID JSON object with these fields:
-{{
-  "invoice_header": {{
-    "invoice_number": "string",
-    "invoice_date": "YYYY-MM-DD",
-    "po_number": "string or null",
-    "invoice_value": "string with currency",
-    "supplier_name": "string or null",
-    "customer_name": "string or null"
-  }},
-  "line_items": [
-    {{
-      "item_number": "string or null",
-      "description": "string",
-      "quantity": "number",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }}
-  ]
-}}
-Rules:
-1. Return ONLY valid JSON (no additional text or markdown)
-2. Use null for missing fields
-3. Date format must be YYYY-MM-DD
-4. All currency values must include currency symbol or code
-5. Include all line items found in the invoice
-6. For line items, quantity should be a number, prices as strings with currency
-7. Do not include any explanations or notes
-Invoice Text:
-""" + text
 def format_currency(value):
     """Helper function to format currency values consistently"""

     # Handle Mistral's markdown response
     if model_choice == "Mistral Small":
+        try:
+            # Extract JSON from between ```json and ```
+            json_start = text.find('{')
+            json_end = text.rfind('}') + 1
+            if json_start != -1 and json_end != 0:
+                text = text[json_start:json_end]
+        except Exception as e:
+            st.warning(f"Error processing Mistral response: {str(e)}")
+            return None
+    # Handle Llama's response
     if model_choice == "Llama 4 Mavericks":
+        # Check if response is complete
+        if not text.strip().endswith('}}'):
+            # Try to complete the JSON structure
+            if '"line_items":' in text:
+                # Case 1: Line items started but not finished
+                if '"line_items": [' in text:
+                    text = text.split('"line_items": [')[0] + '"line_items": []}'
+                # Case 2: Just the line_items key exists
+                else:
+                    text = text.split('"line_items":')[0] + '"line_items": []}'
+            # Ensure proper closing
+            if not text.endswith('}'):
+                text += '}'
     # Try parsing the cleaned JSON
     try:
         data = json.loads(text)
         return data
     except json.JSONDecodeError as e:
+        st.warning(f"JSON parsing failed: {str(e)}")
+        st.warning(f"Attempting to repair JSON for {model_choice}...")
+        # Final attempt to repair
+        try:
+            if model_choice == "Llama 4 Mavericks":
                 # Find the last complete JSON object
                 end_pos = text.rfind('}')
                 if end_pos != -1:
+                    repaired = text[:end_pos+1]
+                    data = json.loads(repaired)
+                    if "line_items" not in data:
+                        data["line_items"] = []
+                    return data
+            elif model_choice == "Mistral Small":
+                # Remove all non-JSON content
+                json_start = text.find('{')
+                json_end = text.rfind('}') + 1
+                if json_start != -1 and json_end != 0:
+                    repaired = text[json_start:json_end]
+                    return json.loads(repaired)
+        except Exception as e:
+            st.error(f"Failed to repair JSON: {str(e)}")
+            return None
         return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
+    base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
+{
+  "invoice_header": {
+    "invoice_number": "string",
+    "invoice_date": "YYYY-MM-DD",
+    "po_number": "string or null",
+    "invoice_value": "string with currency",
+    "supplier_name": "string or null",
+    "customer_name": "string or null"
+  },
+  "line_items": [
+    {
+      "item_number": "string or null",
+      "description": "string",
+      "quantity": "number",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }
+  ]
+}
+Rules:
+1. Return ONLY valid JSON (no additional text or markdown)
+2. Use null for missing fields
+3. Date format must be YYYY-MM-DD
+4. All currency values must include currency symbol or code
+5. Include all line items found in the invoice
+6. For line items, quantity should be a number, prices as strings with currency
+7. Do not include any explanations or notes
+Invoice Text:
+""" + text
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
 """ + text
     else:  # For Llama 4 and Mistral
+        return base_prompt
 def format_currency(value):
     """Helper function to format currency values consistently"""