PDF_Upload

Sleeping

App Files Files Community

Seth0330 committed on May 21, 2025

Commit

b23347b

verified ·

1 Parent(s): 88f23c6

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -97

app.py CHANGED Viewed

@@ -100,114 +100,89 @@ def query_llm(model_choice, prompt):
         st.error(f"🌐 Connection Failed: {str(e)}")
         return None
 def clean_json_response(text, model_choice):
-    """Improved JSON extraction with model-specific handling"""
     if not text:
         return None
-    # Handle Mistral's markdown response
     if model_choice == "Mistral Small":
-        try:
-            # Extract JSON from between ```json and ```
-            json_start = text.find('{')
-            json_end = text.rfind('}') + 1
-            if json_start != -1 and json_end != 0:
-                text = text[json_start:json_end]
-        except Exception as e:
-            st.warning(f"Error processing Mistral response: {str(e)}")
-            return None
-    # Handle Llama's response
-    if model_choice == "Llama 4 Mavericks":
-        # Check if response is complete
-        if not text.strip().endswith('}}'):
-            # Try to complete the JSON structure
-            if '"line_items":' in text:
-                # Case 1: Line items started but not finished
-                if '"line_items": [' in text:
-                    text = text.split('"line_items": [')[0] + '"line_items": []}'
-                # Case 2: Just the line_items key exists
                 else:
-                    text = text.split('"line_items":')[0] + '"line_items": []}'
-            # Ensure proper closing
-            if not text.endswith('}'):
-                text += '}'
-    # Try parsing the cleaned JSON
-    try:
-        data = json.loads(text)
-        # Ensure proper structure exists
-        if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
-            if "invoice_header" not in data:
-                data["invoice_header"] = {}
-            if "line_items" not in data:
-                data["line_items"] = []
-        return data
-    except json.JSONDecodeError as e:
-        st.warning(f"JSON parsing failed: {str(e)}")
-        st.warning(f"Attempting to repair JSON for {model_choice}...")
-        # Final attempt to repair
         try:
-            if model_choice == "Llama 4 Mavericks":
-                # Find the last complete JSON object
-                end_pos = text.rfind('}')
-                if end_pos != -1:
-                    repaired = text[:end_pos+1]
-                    data = json.loads(repaired)
-                    if "line_items" not in data:
-                        data["line_items"] = []
-                    return data
-            elif model_choice == "Mistral Small":
-                # Remove all non-JSON content
-                json_start = text.find('{')
-                json_end = text.rfind('}') + 1
-                if json_start != -1 and json_end != 0:
-                    repaired = text[json_start:json_end]
-                    return json.loads(repaired)
-        except Exception as e:
-            st.error(f"Failed to repair JSON: {str(e)}")
-            return None
-        return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
-    base_prompt = """Extract complete invoice information and return a VALID JSON object with these fields:
-{
-  "invoice_header": {
-    "invoice_number": "string",
-    "invoice_date": "YYYY-MM-DD",
-    "po_number": "string or null",
-    "invoice_value": "string with currency",
-    "supplier_name": "string or null",
-    "customer_name": "string or null"
-  },
-  "line_items": [
-    {
-      "item_number": "string or null",
-      "description": "string",
-      "quantity": "number",
-      "unit_price": "string with currency",
-      "total_price": "string with currency"
-    }
-  ]
-}
-Rules:
-1. Return ONLY valid JSON (no additional text or markdown)
-2. Use null for missing fields
-3. Date format must be YYYY-MM-DD
-4. All currency values must include currency symbol or code
-5. Include all line items found in the invoice
-6. For line items, quantity should be a number, prices as strings with currency
-7. Do not include any explanations or notes
-Invoice Text:
-""" + text
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
@@ -253,7 +228,36 @@ Invoice Text:
 """ + text
     else:  # For Llama 4 and Mistral
-        return base_prompt
 def format_currency(value):
     """Helper function to format currency values consistently"""

         st.error(f"🌐 Connection Failed: {str(e)}")
         return None
def _scan_braces(text):
    """Yield ``(index, depth)`` after each structural brace in *text*.

    Braces inside double-quoted strings are skipped (backslash escapes are
    honoured). String tracking only starts once an object is open, so stray
    quotes in prose before the JSON do not confuse the scan. A ``}`` seen
    while no object is open is ignored.
    """
    depth = 0
    in_string = False
    escaped = False
    for i, c in enumerate(text):
        if in_string:
            if escaped:
                escaped = False
            elif c == '\\':
                escaped = True
            elif c == '"':
                in_string = False
        elif c == '"':
            in_string = depth > 0
        elif c == '{':
            depth += 1
            yield i, depth
        elif c == '}' and depth:
            depth -= 1
            yield i, depth


def find_json_end(text):
    """Return the index just past the first balanced ``{...}`` in *text*.

    Returns -1 when no top-level object is ever closed (for example when
    the text is truncated or contains no ``{`` at all).
    """
    for i, depth in _scan_braces(text):
        if depth == 0:
            return i + 1
    return -1


def _json_candidates(text):
    """Yield substrings of *text* that may hold a complete JSON object."""
    # 1. Object inside a markdown code fence.
    fenced = re.search(r'```(?:json)?\n({.*?})\n```', text, re.DOTALL)
    if fenced:
        yield fenced.group(1)

    start = text.find('{')
    if start < 0:
        return

    # 2. Everything from the first '{' to the last '}'.
    outer_end = text.rfind('}') + 1
    if outer_end > start:
        yield text[start:outer_end]

    # 3. The first balanced object, which drops trailing junk that may
    #    itself contain braces.
    balanced_end = find_json_end(text)
    if balanced_end > start:
        yield text[start:balanced_end]


def _recover_header_only(text):
    """Rebuild a dict from a reply that was cut off at or inside ``line_items``.

    Keeps everything before the ``"line_items":`` key, closes the braces
    still open at that point, and sets ``line_items`` to an empty list.
    Returns ``None`` when that still does not produce valid JSON.
    """
    start = text.find('{')
    if start < 0 or '"invoice_header":' not in text:
        return None
    # Remove the dangling ", " left in front of the discarded key.
    head = text[start:].split('"line_items":')[0].rstrip().rstrip(',').rstrip()
    depth = 0
    for _, depth in _scan_braces(head):
        pass
    try:
        data = json.loads(head + '}' * depth)
    except json.JSONDecodeError:
        return None
    data.setdefault("line_items", [])
    return data


def clean_json_response(text, model_choice):
    """Extract a JSON object from an LLM reply, trying several repairs.

    Args:
        text: Raw model reply. Empty or ``None`` yields ``None``.
        model_choice: Display name of the model that produced ``text``.
            For ``"Llama 4 Mavericks"`` and ``"Mistral Small"`` the result
            always contains ``"invoice_header"`` and ``"line_items"``.

    Returns:
        The decoded dict, or ``None`` if every strategy fails; in that case
        the failure and the original reply are shown via ``st.error`` and
        ``st.code``.
    """
    if not text:
        return None

    original_text = text  # Save for error reporting

    if model_choice == "Mistral Small":
        # Remove markdown fence markers before looking for the object.
        text = re.sub(r'^```json|```$', '', text, flags=re.MULTILINE).strip()

    for candidate in _json_candidates(text):
        try:
            data = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Ensure the required structure exists.
        if model_choice in ["Llama 4 Mavericks", "Mistral Small"]:
            if "invoice_header" not in data:
                data["invoice_header"] = {}
            if "line_items" not in data:
                data["line_items"] = []
        return data

    # Final fallback: Llama replies are sometimes truncated inside
    # line_items, so salvage at least the header.
    if model_choice == "Llama 4 Mavericks":
        data = _recover_header_only(text)
        if data is not None:
            return data

    st.error(f"Failed to parse JSON after multiple attempts for {model_choice}")
    st.code(f"Original response:\n{original_text}")
    return None
 def get_extraction_prompt(model_choice, text):
     """Return the appropriate prompt based on model choice"""
     if model_choice == "DeepSeek v3":
         return f"""Extract complete invoice information from the text below and return ONLY a valid JSON object with these fields:
 {{
 """ + text
     else:  # For Llama 4 and Mistral
+        return f"""Extract complete invoice information and return a VALID JSON object with these fields:
+{{
+  "invoice_header": {{
+    "invoice_number": "string",
+    "invoice_date": "YYYY-MM-DD",
+    "po_number": "string or null",
+    "invoice_value": "string with currency",
+    "supplier_name": "string or null",
+    "customer_name": "string or null"
+  }},
+  "line_items": [
+    {{
+      "item_number": "string or null",
+      "description": "string",
+      "quantity": "number",
+      "unit_price": "string with currency",
+      "total_price": "string with currency"
+    }}
+  ]
+}}
+Rules:
+1. Return ONLY valid JSON (no additional text or markdown)
+2. Use null for missing fields
+3. Date format must be YYYY-MM-DD
+4. All currency values must include currency symbol or code
+5. Include all line items found in the invoice
+6. For line items, quantity should be a number, prices as strings with currency
+7. Do not include any explanations or notes
+Invoice Text:
+""" + text
 def format_currency(value):
     """Helper function to format currency values consistently"""