Spaces:

vachaspathi
/

Agentic

Sleeping

App Files Files Community

vachaspathi committed on Nov 22, 2025

Commit

dcb9f42

verified ·

1 Parent(s): c40ad39

Update ai_engine.py

Browse files

Files changed (1) hide show

ai_engine.py +40 -12

ai_engine.py CHANGED Viewed

@@ -1,4 +1,3 @@
-# ai_engine.py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import pytesseract
@@ -8,14 +7,13 @@ import os
 import json
 import config
-# Load Model Once
 print(">>> Loading AI Model...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(config.MODEL_ID, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True)
 except:
     model = None
-    print("❌ Model Failed to Load")
 def perform_ocr(file_obj):
     if file_obj is None: return "", None
@@ -26,17 +24,47 @@ def perform_ocr(file_obj):
         else:
             image = Image.open(file_obj).convert("RGB")
         return pytesseract.image_to_string(image), image
-    except:
-        return "", None
-def extract_json(text):
     if not model: return {}
-    prompt = f"<|im_start|>user\nExtract JSON: vendor_name, invoice_date, total, item_desc\nText:\n{text[:1000]}<|im_end|>\n<|im_start|>assistant\n```json"
     inputs = tokenizer(prompt, return_tensors="pt")
-    out = model.generate(**inputs, max_new_tokens=200)
     try:
         json_str = tokenizer.decode(out[0]).split("```json")[1].split("```")[0].strip()
-        data = json.loads(json_str)
-        return data[0] if isinstance(data, list) else data
-    except:
-        return {}

 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import pytesseract
 import json
 import config
# Load the tokenizer and model once, at import time.
# Both names are always defined afterwards: a name whose load failed is None,
# so callers can test `if not model` instead of risking a NameError.
tokenizer = None
model = None
print(">>> Loading AI Model...")
try:
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_ID,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
except Exception as e:
    # `except Exception` (not a bare `except`) lets Ctrl-C / SystemExit through.
    # Report the cause so a missing or broken model is not silently ignored.
    model = None
    print(f"❌ Model Failed to Load: {e}")
 def perform_ocr(file_obj):
     if file_obj is None: return "", None
         else:
             image = Image.open(file_obj).convert("RGB")
         return pytesseract.image_to_string(image), image
+    except: return "", None
def extract_intelligent_json(text):
    """Classify a document's text and extract its fields with the loaded model.

    Builds a chat-style prompt that asks the model to pick a document type and
    fill in a JSON object, generates a completion, and parses the JSON that
    follows the ```json fence opened at the end of the prompt.

    Args:
        text: Document text (e.g. OCR output). Only the first 1500 characters
            are sent to the model. ``None`` is treated as an empty string.

    Returns:
        ``{}`` when no model is loaded; the parsed JSON object on success
        (the first element if the model answered with a non-empty list);
        ``{"doc_type": "unknown", "data": {}}`` when the completion cannot be
        decoded, parsed, or is not a JSON object.
    """
    if not model:
        return {}

    # Guard the slice below against a None input.
    text = text or ""

    # Robust prompt instructing the AI to classify and format
    prompt = f"""<|im_start|>system
    Analyze the document text.
    1. CLASSIFY the type as one of: ["invoice", "estimate", "credit_note", "expense", "contact", "purchase_order"].
    2. EXTRACT data based on the type.
    OUTPUT FORMAT (JSON ONLY):
    {{
        "doc_type": "invoice",
        "data": {{
            "vendor_name": "...",
            "date": "YYYY-MM-DD",
            "reference_number": "...",
            "total": 0.00,
            "line_items": [ {{"name": "...", "rate": 0, "quantity": 1}} ]
        }}
    }}
    <|im_end|>
    <|im_start|>user
    DOCUMENT TEXT:
    {text[:1500]}
    <|im_end|>
    <|im_start|>assistant
    ```json
    """
    inputs = tokenizer(prompt, return_tensors="pt")
    prompt_len = inputs["input_ids"].shape[1]

    # Greedy decoding, stated explicitly. `temperature` only applies when
    # sampling is enabled, so the previous `temperature=0.1` had no effect on
    # the output and merely produced a warning.
    out = model.generate(**inputs, max_new_tokens=350, do_sample=False)

    try:
        # Decode only the newly generated tokens. Decoding prompt + answer and
        # splitting on the first fence breaks when the document text itself
        # contains "```json" or "```".
        generated = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
        # The prompt already opened the fence; tolerate a model that re-opens it.
        if "```json" in generated:
            generated = generated.split("```json", 1)[1]
        json_str = generated.split("```", 1)[0].strip()
        parsed = json.loads(json_str)
    except Exception as e:
        print(f"AI Error: {e}")
        # Fallback default
        return {"doc_type": "unknown", "data": {}}

    # Some models wrap the object in a list; unwrap the first entry.
    if isinstance(parsed, list) and parsed:
        parsed = parsed[0]
    if not isinstance(parsed, dict):
        print(f"AI Error: expected a JSON object, got {type(parsed).__name__}")
        return {"doc_type": "unknown", "data": {}}
    return parsed