Spaces:

vachaspathi
/

Agentic

Sleeping

App Files Files Community

vachaspathi commited on Nov 22, 2025

Commit

c835cd1

verified ·

1 Parent(s): 986c126

Update ai_engine.py

Browse files

Files changed (1) hide show

ai_engine.py +121 -53

ai_engine.py CHANGED Viewed

@@ -2,106 +2,174 @@ import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import pytesseract
 from pdf2image import convert_from_path
-from PIL import Image
 import os
 import json
 import re
 import config
 # Load Model
-print(">>> Loading AI Model...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(config.MODEL_ID, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True)
 except:
     model = None
-def get_metadata(file_obj):
-    try:
-        name = os.path.basename(file_obj)
-        size = os.path.getsize(file_obj)
-        ext = name.split('.')[-1].lower()
-        return {"filename": name, "extension": ext, "size_kb": size/1024}
-    except:
-        return {"filename": "unknown", "extension": "", "size_kb": 0}
 def perform_ocr(file_obj):
     if file_obj is None: return "", None, {}
     try:
-        meta = get_metadata(file_obj)
-        if meta["filename"].lower().endswith(".pdf"):
-            image = convert_from_path(file_obj, first_page=1, last_page=1)[0]
         else:
-            image = Image.open(file_obj).convert("RGB")
-        text = pytesseract.image_to_string(image)
-        return text, image, meta
-    except: return "", None, {}
 def repair_json(json_str):
-    """CRITICAL FIX: Extracts the largest valid JSON object from messy text."""
     if not json_str: return {}
-    # Strategy 1: Direct Load
-    try: return json.loads(json_str)
-    except: pass
-    # Strategy 2: Extract between first { and last }
     try:
         start = json_str.find('{')
         end = json_str.rfind('}') + 1
         if start != -1 and end != 0:
-            clean = json_str[start:end]
-            return json.loads(clean)
     except: pass
     return {}
-def fallback_classifier(text, filename):
-    combined = (text + " " + filename).lower()
-    if "invoice" in combined: return "invoice"
-    if "estimate" in combined: return "estimate"
-    if "bill" in combined: return "bill"
-    if "receipt" in combined: return "expense"
-    return "invoice" # Default to invoice
 def extract_intelligent_json(text, metadata):
     if not model: return {}
     prompt = f"""<|im_start|>system
-    Extract JSON data. Valid doc_types: ["invoice", "bill", "estimate", "expense"].
-    OUTPUT FORMAT:
     {{
-        "doc_type": "invoice",
         "data": {{
-            "vendor_name": "Name or 'Unknown'",
-            "date": "YYYY-MM-DD",
-            "reference_number": "REF-123",
-            "total": 0.00,
-            "line_items": [ {{"name": "Item", "description": "Desc", "rate": 0, "quantity": 1}} ]
         }}
     }}
     <|im_end|>
     <|im_start|>user
-    FILE: {metadata.get('filename')}
-    CONTENT:
-    {text[:1500]}
     <|im_end|>
     <|im_start|>assistant
     ```json
     """
     inputs = tokenizer(prompt, return_tensors="pt")
-    out = model.generate(**inputs, max_new_tokens=400, temperature=0.1)
     raw_output = tokenizer.decode(out[0])
-    # Use the new Repair Function
     data = repair_json(raw_output)
-    # If repair failed or empty, use heuristics
-    if not data or "doc_type" not in data:
-        doc_type = fallback_classifier(text, metadata.get('filename'))
-        data = {"doc_type": doc_type, "data": {"vendor_name": "Unknown"}}
     return data

 from transformers import AutoModelForCausalLM, AutoTokenizer
 import pytesseract
 from pdf2image import convert_from_path
+from PIL import Image, ImageEnhance, ImageFilter
 import os
 import json
 import re
 import config
 # Load Model
+print(f">>> Loading AI Model: {config.MODEL_ID}...")
 try:
     tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
     model = AutoModelForCausalLM.from_pretrained(config.MODEL_ID, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True)
 except:
     model = None
+    print("❌ Model Failed to Load")
+# =====================================================
+#  1. ADVANCED OCR PIPELINE
+# =====================================================
+def preprocess_image(image):
+    """
+    Cleans image for better OCR results:
+    1. Grayscale
+    2. Sharpen
+    3. Increase Contrast
+    """
+    # Convert to gray
+    image = image.convert('L')
+    # Increase Contrast
+    enhancer = ImageEnhance.Contrast(image)
+    image = enhancer.enhance(2.0)
+    # Sharpen (helps with blurry fonts)
+    image = image.filter(ImageFilter.SHARPEN)
+    return image
 def perform_ocr(file_obj):
     if file_obj is None: return "", None, {}
     try:
+        filename = os.path.basename(file_obj)
+        # HIGH QUALITY CONVERSION (DPI=300)
+        if filename.lower().endswith(".pdf"):
+            # dpi=300 makes text much clearer than default 72
+            images = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
+            original_img = images[0]
         else:
+            original_img = Image.open(file_obj).convert("RGB")
+        # Preprocess for Tesseract
+        processed_img = preprocess_image(original_img)
+        # Run Tesseract
+        text = pytesseract.image_to_string(processed_img)
+        # Metadata extraction
+        meta = {
+            "filename": filename,
+            "size_kb": os.path.getsize(file_obj)/1024
+        }
+        return text, original_img, meta
+    except Exception as e:
+        print(f"OCR Error: {e}")
+        return "", None, {}
+# =====================================================
+#  2. REGEX FALLBACKS (The "Generic Name" Fix)
+# =====================================================
+def regex_extract_vendor(text):
+    """
+    If AI fails, we use old-school logic to find the name.
+    """
+    lines = [l.strip() for l in text.split('\n') if len(l.strip()) > 3]
+    # 1. Look for "To" / "From"
+    for i, line in enumerate(lines):
+        if re.search(r'^(bill|invoice)\s*to:?$', line.lower()):
+            # The NEXT line is likely the customer name
+            if i + 1 < len(lines): return lines[i+1]
+        if re.search(r'^(from|vendor):?$', line.lower()):
+            if i + 1 < len(lines): return lines[i+1]
+    # 2. Top-most bold text (heuristic: usually the first or second line is the Company Name)
+    if len(lines) > 0:
+        # Ignore common headers
+        if "invoice" not in lines[0].lower(): return lines[0]
+        if len(lines) > 1: return lines[1]
+    return "Unknown"
+def regex_extract_total(text):
+    # Looks for "Total $1,234.56" patterns
+    match = re.search(r'(?:total|amount|balance).*?([\d,]+\.\d{2})', text.lower())
+    if match:
+        try: return float(match.group(1).replace(',', ''))
+        except: pass
+    return 0.0
+# =====================================================
+#  3. AI EXTRACTION
+# =====================================================
 def repair_json(json_str):
     if not json_str: return {}
     try:
+        # Find the first { and the last }
         start = json_str.find('{')
         end = json_str.rfind('}') + 1
         if start != -1 and end != 0:
+            return json.loads(json_str[start:end])
     except: pass
     return {}
 def extract_intelligent_json(text, metadata):
     if not model: return {}
+    # Stronger Prompt
     prompt = f"""<|im_start|>system
+    You are a financial data extractor.
+    TASK: Convert OCR text into JSON.
+    MANDATORY RULES:
+    1. Extract the VENDOR_NAME (Who sent the invoice?)
+    2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
+    3. Extract LINE_ITEMS.
+    JSON FORMAT:
     {{
+        "doc_type": "invoice",
         "data": {{
+            "vendor_name": "Acme Corp",
+            "date": "2024-01-01",
+            "reference_number": "INV-001",
+            "total": 100.00,
+            "line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
         }}
     }}
     <|im_end|>
     <|im_start|>user
+    DOCUMENT TEXT:
+    {text[:2000]}
     <|im_end|>
     <|im_start|>assistant
     ```json
     """
     inputs = tokenizer(prompt, return_tensors="pt")
+    out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)
     raw_output = tokenizer.decode(out[0])
     data = repair_json(raw_output)
+    # --- FALLBACK LAYER ---
+    # If AI returned empty/garbage data, overlay with Regex
+    if not data or "data" not in data:
+        data = {"doc_type": "invoice", "data": {}}
+    inner = data.get("data", {})
+    # Fix Name
+    if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
+        inner["vendor_name"] = regex_extract_vendor(text)
+    # Fix Total
+    if not inner.get("total"):
+        inner["total"] = regex_extract_total(text)
+    data["data"] = inner
     return data