Spaces:

vachaspathi
/

Agentic

Sleeping

File size: 5,410 Bytes

43316bd
 
 
 
c835cd1
43316bd
 
39bc8a4
43316bd
 
dcb9f42
c835cd1
43316bd
 
 
 
 
c835cd1
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39bc8a4
43316bd
86f6949
43316bd
c835cd1
 
 
 
 
 
 
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f6949
 
 
c835cd1
86f6949
 
 
c835cd1
86f6949
 
 
39bc8a4
43316bd
dcb9f42
c835cd1
dcb9f42
c835cd1
 
 
 
 
 
 
39bc8a4
c835cd1
dcb9f42
c835cd1
dcb9f42
c835cd1
 
 
 
 
dcb9f42
 
 
 
c835cd1
 
dcb9f42
 
 
 
 
43316bd
c835cd1
dcb9f42
86f6949
 
 
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
39bc8a4
c835cd1
86f6949

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import os
import json
import re
import config

# Load Model
print(f">>> Loading AI Model: {config.MODEL_ID}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(config.MODEL_ID, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True)
except:
    model = None
    print("❌ Model Failed to Load")

# =====================================================
#  1. ADVANCED OCR PIPELINE
# =====================================================
def preprocess_image(image):
    """
    Cleans image for better OCR results:
    1. Grayscale
    2. Sharpen
    3. Increase Contrast
    """
    # Convert to gray
    image = image.convert('L')
    
    # Increase Contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)
    
    # Sharpen (helps with blurry fonts)
    image = image.filter(ImageFilter.SHARPEN)
    
    return image

def perform_ocr(file_obj):
    if file_obj is None: return "", None, {}
    try:
        filename = os.path.basename(file_obj)
        
        # HIGH QUALITY CONVERSION (DPI=300)
        if filename.lower().endswith(".pdf"):
            # dpi=300 makes text much clearer than default 72
            images = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
            original_img = images[0]
        else:
            original_img = Image.open(file_obj).convert("RGB")
            
        # Preprocess for Tesseract
        processed_img = preprocess_image(original_img)
        
        # Run Tesseract
        text = pytesseract.image_to_string(processed_img)
        
        # Metadata extraction
        meta = {
            "filename": filename,
            "size_kb": os.path.getsize(file_obj)/1024
        }
        
        return text, original_img, meta
    except Exception as e:
        print(f"OCR Error: {e}")
        return "", None, {}

# =====================================================
#  2. REGEX FALLBACKS (The "Generic Name" Fix)
# =====================================================
def regex_extract_vendor(text):
    """
    If AI fails, we use old-school logic to find the name.
    """
    lines = [l.strip() for l in text.split('\n') if len(l.strip()) > 3]
    
    # 1. Look for "To" / "From"
    for i, line in enumerate(lines):
        if re.search(r'^(bill|invoice)\s*to:?$', line.lower()):
            # The NEXT line is likely the customer name
            if i + 1 < len(lines): return lines[i+1]
            
        if re.search(r'^(from|vendor):?$', line.lower()):
            if i + 1 < len(lines): return lines[i+1]

    # 2. Top-most bold text (heuristic: usually the first or second line is the Company Name)
    if len(lines) > 0:
        # Ignore common headers
        if "invoice" not in lines[0].lower(): return lines[0]
        if len(lines) > 1: return lines[1]
        
    return "Unknown"

def regex_extract_total(text):
    # Looks for "Total $1,234.56" patterns
    match = re.search(r'(?:total|amount|balance).*?([\d,]+\.\d{2})', text.lower())
    if match:
        try: return float(match.group(1).replace(',', ''))
        except: pass
    return 0.0

# =====================================================
#  3. AI EXTRACTION
# =====================================================
def repair_json(json_str):
    if not json_str: return {}
    try:
        # Find the first { and the last }
        start = json_str.find('{')
        end = json_str.rfind('}') + 1
        if start != -1 and end != 0:
            return json.loads(json_str[start:end])
    except: pass
    return {}

def extract_intelligent_json(text, metadata):
    if not model: return {}
    
    # Stronger Prompt
    prompt = f"""<|im_start|>system
    You are a financial data extractor. 
    TASK: Convert OCR text into JSON.
    
    MANDATORY RULES:
    1. Extract the VENDOR_NAME (Who sent the invoice?)
    2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
    3. Extract LINE_ITEMS.
    
    JSON FORMAT:
    {{
        "doc_type": "invoice",
        "data": {{
            "vendor_name": "Acme Corp", 
            "date": "2024-01-01",
            "reference_number": "INV-001",
            "total": 100.00,
            "line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
        }}
    }}
    <|im_end|>
    <|im_start|>user
    DOCUMENT TEXT:
    {text[:2000]}
    <|im_end|>
    <|im_start|>assistant
    ```json
    """
    
    inputs = tokenizer(prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)
    
    raw_output = tokenizer.decode(out[0])
    data = repair_json(raw_output)
    
    # --- FALLBACK LAYER ---
    # If AI returned empty/garbage data, overlay with Regex
    if not data or "data" not in data:
        data = {"doc_type": "invoice", "data": {}}
    
    inner = data.get("data", {})
    
    # Fix Name
    if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
        inner["vendor_name"] = regex_extract_vendor(text)
        
    # Fix Total
    if not inner.get("total"):
        inner["total"] = regex_extract_total(text)
        
    data["data"] = inner
    return data