Spaces:

vachaspathi
/

Agentic

Sleeping

App Files Files Community

vachaspathi committed on Nov 22, 2025

Commit

39bc8a4

verified ·

1 Parent(s): 050d9ff

Update ai_engine.py

Browse files

Files changed (1) hide show

ai_engine.py +73 -19

ai_engine.py CHANGED Viewed

@@ -5,6 +5,7 @@ from pdf2image import convert_from_path
 from PIL import Image
 import os
 import json
 import config
 # Load Model
@@ -15,42 +16,78 @@ try:
 except:
     model = None
 def perform_ocr(file_obj):
     if file_obj is None: return "", None
     try:
-        filename = os.path.basename(file_obj)
-        if filename.lower().endswith(".pdf"):
             image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
         else:
             image = Image.open(file_obj).convert("RGB")
-        return pytesseract.image_to_string(image), image
-    except: return "", None
-def extract_intelligent_json(text):
     """
-    Classifies the document and extracts relevant fields.
     """
     if not model: return {}
-    # Robust prompt instructing the AI to classify and format
     prompt = f"""<|im_start|>system
-    Analyze the document text.
-    1. CLASSIFY the type as one of: ["invoice", "estimate", "credit_note", "expense", "contact", "purchase_order"].
-    2. EXTRACT data based on the type.
-    OUTPUT FORMAT (JSON ONLY):
     {{
         "doc_type": "invoice",
         "data": {{
-            "vendor_name": "...",
             "date": "YYYY-MM-DD",
             "reference_number": "...",
             "total": 0.00,
-            "line_items": [ {{"name": "...", "rate": 0, "quantity": 1}} ]
         }}
     }}
     <|im_end|>
     <|im_start|>user
     DOCUMENT TEXT:
     {text[:1500]}
     <|im_end|>
@@ -59,12 +96,29 @@ def extract_intelligent_json(text):
     """
     inputs = tokenizer(prompt, return_tensors="pt")
-    out = model.generate(**inputs, max_new_tokens=350, temperature=0.1)
     try:
-        json_str = tokenizer.decode(out[0]).split("```json")[1].split("```")[0].strip()
-        return json.loads(json_str)
     except Exception as e:
-        print(f"AI Error: {e}")
-        # Fallback default
-        return {"doc_type": "unknown", "data": {}}

 from PIL import Image
 import os
 import json
+import re
 import config
 # Load Model
 except:
     model = None
def get_metadata(file_obj):
    """Collect basic file facts that are later passed to the model as hints.

    Args:
        file_obj: Filesystem path (str or path-like) of the file to inspect.

    Returns:
        dict: ``{"filename": <base name>, "extension": <lower-case suffix
        without the dot, "" when the name has none>, "size_kb": <size in KiB>}``.
        If the path cannot be inspected (missing file, ``None``, unsupported
        type) the placeholder ``{"filename": "unknown", "extension": "",
        "size_kb": 0}`` is returned instead of raising.
    """
    try:
        name = os.path.basename(file_obj)
        size_bytes = os.path.getsize(file_obj)
        # splitext yields "" for names without a suffix; splitting on "." would
        # instead report the whole file name as the extension.
        ext = os.path.splitext(name)[1].lstrip(".").lower()
    except (OSError, TypeError, ValueError):
        # Best-effort helper: metadata is only a hint, so never fail the caller.
        return {"filename": "unknown", "extension": "", "size_kb": 0}
    return {"filename": name, "extension": ext, "size_kb": size_bytes / 1024}
 def perform_ocr(file_obj):
     if file_obj is None: return "", None
     try:
+        # extract metadata before processing
+        meta = get_metadata(file_obj)
+        if meta["filename"].lower().endswith(".pdf"):
             image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
         else:
             image = Image.open(file_obj).convert("RGB")
+        text = pytesseract.image_to_string(image)
+        return text, image, meta
+    except: return "", None, {}
def fallback_classifier(text, filename):
    """Guess the document type from keywords when the AI result is unusable.

    The OCR text and the file name are joined, lower-cased and scanned for
    keywords; the first matching rule wins.

    Args:
        text: OCR text of the document; ``None`` or empty is treated as "".
        filename: Name of the source file; ``None`` or empty is treated as "".

    Returns:
        str: One of "credit_note", "invoice", "estimate", "purchase_order",
        "bill", "expense", or "unknown" when no keyword matches.
    """
    combined = f"{text or ''} {filename or ''}".lower()
    # Order matters. "credit note" is tested before the generic invoice
    # keywords because a credit note normally quotes the invoice it reverses.
    rules = (
        ("credit_note", ("credit note",)),
        ("invoice", ("invoice", "inv-")),
        ("estimate", ("estimate", "quote")),
        ("purchase_order", ("purchase order", "po-")),
        ("bill", ("bill", "payment due")),
        ("expense", ("receipt",)),
    )
    for doc_type, keywords in rules:
        if any(keyword in combined for keyword in keywords):
            return doc_type
    return "unknown"
def extract_intelligent_json(text, metadata=None):
    """Classify a document and extract its fields as JSON using the language model.

    The file metadata and the first 1500 characters of the OCR text are placed
    in a chat-style prompt; the model's reply is parsed as JSON. If parsing
    fails, or the reply carries no usable ``doc_type``, the rule-based
    ``fallback_classifier`` supplies the type.

    Args:
        text: OCR text of the document; ``None`` is treated as "".
        metadata: Dict of file clues (read key: "filename"). Optional;
            ``None`` is treated as an empty dict.

    Returns:
        dict: ``{}`` when no model is loaded. Otherwise the parsed model
        output, or ``{"doc_type": <guessed type>, "data": {}}`` when the
        model output could not be parsed.
    """
    if not model:
        return {}
    text = text or ""
    metadata = metadata or {}
    filename = metadata.get("filename", "")
    # NOTE(review): this block was reconstructed from a diff view whose hunk
    # boundary elides lines just before the closing quotes of the prompt;
    # confirm the prompt tail against the real file before applying.
    # Inject Metadata into System Prompt
    prompt = f"""<|im_start|>system
    You are a Document Classifier. Use the Filename and Text to identify the document type.
    VALID TYPES: ["invoice", "bill", "estimate", "credit_note", "purchase_order", "expense"]
    RULES:
    1. If filename contains 'INV', it is an 'invoice'.
    2. If text mentions 'Purchase Order', it is a 'purchase_order'.
    3. Extract the Vendor/Customer Name and Dates carefully.
    OUTPUT JSON FORMAT:
    {{
        "doc_type": "invoice",
        "confidence": "high",
        "data": {{
            "contact_name": "...",
            "date": "YYYY-MM-DD",
            "reference_number": "...",
            "total": 0.00,
            "line_items": [ {{"name": "...", "description": "...", "rate": 0, "quantity": 1}} ]
        }}
    }}
    <|im_end|>
    <|im_start|>user
    METADATA: {json.dumps(metadata)}
    DOCUMENT TEXT:
    {text[:1500]}
    <|im_end|>
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=400, temperature=0.1)
    try:
        generated = out[0]
        prompt_ids = inputs["input_ids"][0]
        prompt_len = prompt_ids.shape[0]
        # If the output echoes the prompt, drop it: the prompt itself contains
        # a JSON template and the metadata JSON, which would otherwise poison
        # the brace-based fallback below. Only strip when the prefix really is
        # the prompt, so outputs that do not echo it are left untouched.
        if generated.shape[0] >= prompt_len and bool((generated[:prompt_len] == prompt_ids).all()):
            generated = generated[prompt_len:]
        full_response = tokenizer.decode(generated)
        # Prefer an explicit ```json fenced block.
        json_match = re.search(r"```json\s*(\{.*?\})\s*```", full_response, re.DOTALL)
        if json_match:
            data = json.loads(json_match.group(1))
        else:
            # Fallback: take everything from the first "{" to the last "}".
            start = full_response.find("{")
            end = full_response.rfind("}") + 1
            if start == -1 or end <= start:
                raise ValueError("no JSON object found in model output")
            data = json.loads(full_response[start:end])
        if not isinstance(data, dict):
            raise ValueError("model output is not a JSON object")
        # Double-check classification: a missing or "unknown" type is
        # replaced by the rule-based guess.
        if data.get("doc_type") in (None, "", "unknown"):
            data["doc_type"] = fallback_classifier(text, filename)
        return data
    except Exception as e:
        print(f"AI Parsing Error: {e}")
        # Hard fallback: keep the result shape, classify with keyword rules.
        guessed_type = fallback_classifier(text, filename)
        return {"doc_type": guessed_type, "data": {}}