Spaces:

vachaspathi
/

Agentic

Sleeping

App Files Community

vachaspathi committed on Nov 22, 2025

Commit

86f6949

verified ·

1 Parent(s): 9484c37

Update ai_engine.py

Browse files

Files changed (1) hide show

ai_engine.py +41 -58

ai_engine.py CHANGED Viewed

@@ -17,7 +17,6 @@ except:
     model = None
 def get_metadata(file_obj):
-    """Extracts file clues."""
     try:
         name = os.path.basename(file_obj)
         size = os.path.getsize(file_obj)
@@ -27,68 +26,65 @@ def get_metadata(file_obj):
         return {"filename": "unknown", "extension": "", "size_kb": 0}
 def perform_ocr(file_obj):
-    if file_obj is None: return "", None
     try:
-        # extract metadata before processing
         meta = get_metadata(file_obj)
         if meta["filename"].lower().endswith(".pdf"):
             image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
         else:
             image = Image.open(file_obj).convert("RGB")
         text = pytesseract.image_to_string(image)
         return text, image, meta
     except: return "", None, {}
 def fallback_classifier(text, filename):
-    """
-    Rule-based classifier if AI fails.
-    """
     combined = (text + " " + filename).lower()
-    if "invoice" in combined or "inv-" in combined: return "invoice"
-    if "estimate" in combined or "quote" in combined: return "estimate"
-    if "credit note" in combined: return "credit_note"
-    if "purchase order" in combined or "po-" in combined: return "purchase_order"
-    if "bill" in combined or "payment due" in combined: return "bill"
     if "receipt" in combined: return "expense"
-    return "unknown"
 def extract_intelligent_json(text, metadata):
-    """
-    Combines OCR + Metadata -> AI -> JSON
-    """
     if not model: return {}
-    # Inject Metadata into System Prompt
     prompt = f"""<|im_start|>system
-    You are a Document Classifier. Use the Filename and Text to identify the document type.
-    VALID TYPES: ["invoice", "bill", "estimate", "credit_note", "purchase_order", "expense"]
-    RULES:
-    1. If filename contains 'INV', it is an 'invoice'.
-    2. If text mentions 'Purchase Order', it is a 'purchase_order'.
-    3. Extract the Vendor/Customer Name and Dates carefully.
-    OUTPUT JSON FORMAT:
     {{
         "doc_type": "invoice",
-        "confidence": "high",
         "data": {{
-            "contact_name": "...",
             "date": "YYYY-MM-DD",
-            "reference_number": "...",
             "total": 0.00,
-            "line_items": [ {{"name": "...", "description": "...", "rate": 0, "quantity": 1}} ]
         }}
     }}
     <|im_end|>
     <|im_start|>user
-    METADATA: {json.dumps(metadata)}
-    DOCUMENT TEXT:
     {text[:1500]}
     <|im_end|>
     <|im_start|>assistant
@@ -98,27 +94,14 @@ def extract_intelligent_json(text, metadata):
     inputs = tokenizer(prompt, return_tensors="pt")
     out = model.generate(**inputs, max_new_tokens=400, temperature=0.1)
-    try:
-        # Extract JSON block using Regex (More robust than split)
-        full_response = tokenizer.decode(out[0])
-        json_match = re.search(r"```json\s*(\{.*?\})\s*```", full_response, re.DOTALL)
-        if json_match:
-            data = json.loads(json_match.group(1))
-        else:
-            # Fallback: Try finding the first { and last }
-            start = full_response.find("{")
-            end = full_response.rfind("}") + 1
-            data = json.loads(full_response[start:end])
-        # Double Check Classification
-        if data.get("doc_type") == "unknown":
-            data["doc_type"] = fallback_classifier(text, metadata.get("filename", ""))
-        return data
-    except Exception as e:
-        print(f"AI Parsing Error: {e}")
-        # Hard Fallback
-        guessed_type = fallback_classifier(text, metadata.get("filename", ""))
-        return {"doc_type": guessed_type, "data": {}}

     model = None
 def get_metadata(file_obj):
     try:
         name = os.path.basename(file_obj)
         size = os.path.getsize(file_obj)
         return {"filename": "unknown", "extension": "", "size_kb": 0}
 def perform_ocr(file_obj):
+    if file_obj is None: return "", None, {}
     try:
         meta = get_metadata(file_obj)
         if meta["filename"].lower().endswith(".pdf"):
             image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
         else:
             image = Image.open(file_obj).convert("RGB")
         text = pytesseract.image_to_string(image)
         return text, image, meta
     except: return "", None, {}
+def repair_json(json_str):
+    """CRITICAL FIX: Extracts the largest valid JSON object from messy text."""
+    if not json_str: return {}
+    # Strategy 1: Direct Load
+    try: return json.loads(json_str)
+    except: pass
+    # Strategy 2: Extract between first { and last }
+    try:
+        start = json_str.find('{')
+        end = json_str.rfind('}') + 1
+        if start != -1 and end != 0:
+            clean = json_str[start:end]
+            return json.loads(clean)
+    except: pass
+    return {}
 def fallback_classifier(text, filename):
     combined = (text + " " + filename).lower()
+    if "invoice" in combined: return "invoice"
+    if "estimate" in combined: return "estimate"
+    if "bill" in combined: return "bill"
     if "receipt" in combined: return "expense"
+    return "invoice" # Default to invoice
 def extract_intelligent_json(text, metadata):
     if not model: return {}
     prompt = f"""<|im_start|>system
+    Extract JSON data. Valid doc_types: ["invoice", "bill", "estimate", "expense"].
+    OUTPUT FORMAT:
     {{
         "doc_type": "invoice",
         "data": {{
+            "vendor_name": "Name or 'Unknown'",
             "date": "YYYY-MM-DD",
+            "reference_number": "REF-123",
             "total": 0.00,
+            "line_items": [ {{"name": "Item", "description": "Desc", "rate": 0, "quantity": 1}} ]
         }}
     }}
     <|im_end|>
     <|im_start|>user
+    FILE: {metadata.get('filename')}
+    CONTENT:
     {text[:1500]}
     <|im_end|>
     <|im_start|>assistant
     inputs = tokenizer(prompt, return_tensors="pt")
     out = model.generate(**inputs, max_new_tokens=400, temperature=0.1)
+    raw_output = tokenizer.decode(out[0])
+    # Use the new Repair Function
+    data = repair_json(raw_output)
+    # If repair failed or empty, use heuristics
+    if not data or "doc_type" not in data:
+        doc_type = fallback_classifier(text, metadata.get('filename'))
+        data = {"doc_type": doc_type, "data": {"vendor_name": "Unknown"}}
+    return data