import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import os
import json
import re
import config

# Load the tokenizer and model once, at import time.
# Both names are pre-bound to None so they always exist afterwards; callers
# test `model` to find out whether the AI path is available.
tokenizer = None
model = None
print(f">>> Loading AI Model: {config.MODEL_ID}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_ID,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
except Exception as exc:
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still propagate.
    model = None
    # Report the reason; a silent failure here is very hard to diagnose.
    print(f"❌ Model Failed to Load: {exc}")

# =====================================================
#  1. ADVANCED OCR PIPELINE
# =====================================================
def preprocess_image(image):
    """
    Prepare an image for OCR.

    Steps, in the order they are applied:
    1. Convert to grayscale (mode 'L')
    2. Boost contrast by a factor of 2.0
    3. Apply the SHARPEN filter (helps with blurry fonts)

    Returns the processed image.
    """
    grayscale = image.convert('L')
    high_contrast = ImageEnhance.Contrast(grayscale).enhance(2.0)
    return high_contrast.filter(ImageFilter.SHARPEN)

def perform_ocr(file_obj):
    """
    Run OCR on an image file or on the first page of a PDF.

    Args:
        file_obj: Path to the document, or None.

    Returns:
        A (text, original_img, meta) tuple:
        - text: string produced by pytesseract from the preprocessed image
        - original_img: the un-preprocessed image (first PDF page, or the
          image file converted to RGB)
        - meta: dict with "filename" (basename) and "size_kb"
        When file_obj is None, or on any error (which is printed),
        returns ("", None, {}).
    """
    if file_obj is None:
        return "", None, {}
    try:
        filename = os.path.basename(file_obj)

        if filename.lower().endswith(".pdf"):
            # Only the first page is rendered; 300 DPI keeps small print
            # legible for Tesseract.
            pages = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
            original_img = pages[0]
        else:
            # convert() returns an independent copy, so the handle opened by
            # Image.open can be released immediately instead of leaking
            # until garbage collection.
            with Image.open(file_obj) as opened:
                original_img = opened.convert("RGB")

        # Preprocess for Tesseract
        processed_img = preprocess_image(original_img)

        # Run Tesseract
        text = pytesseract.image_to_string(processed_img)

        # Metadata extraction
        meta = {
            "filename": filename,
            "size_kb": os.path.getsize(file_obj) / 1024,
        }

        return text, original_img, meta
    except Exception as e:
        # Best-effort boundary: report the problem and return empty results.
        print(f"OCR Error: {e}")
        return "", None, {}

# =====================================================
#  2. REGEX FALLBACKS (The "Generic Name" Fix)
# =====================================================
def regex_extract_vendor(text):
    """
    Heuristic name lookup used when the AI result has no usable vendor name.

    Only lines longer than 3 characters (after stripping) are considered.
    1. If a line is just a label ("bill to", "invoice to", "from", "vendor",
       optionally followed by a colon), the line after it is returned.
       Note that for the "bill to" / "invoice to" labels that line is likely
       the customer name.
    2. Otherwise the first line is returned, unless it contains "invoice",
       in which case the second line is returned.
    3. "Unknown" when nothing qualifies.
    """
    candidates = []
    for raw_line in text.split('\n'):
        stripped = raw_line.strip()
        if len(stripped) > 3:
            candidates.append(stripped)

    label_patterns = (r'^(bill|invoice)\s*to:?$', r'^(from|vendor):?$')

    # Pairing each line with its successor skips the final line, which has
    # no "next line" to return even if it is a label.
    for label, following in zip(candidates, candidates[1:]):
        lowered = label.lower()
        if any(re.search(pattern, lowered) for pattern in label_patterns):
            return following

    # Top-of-page heuristic: the company name is usually the first line,
    # or the second when the first is an "invoice" header.
    if not candidates:
        return "Unknown"
    if "invoice" not in candidates[0].lower():
        return candidates[0]
    if len(candidates) > 1:
        return candidates[1]
    return "Unknown"

def regex_extract_total(text):
    """
    Find a monetary total in OCR text, e.g. "Total $1,234.56".

    Looks for the keyword "total", "amount" or "balance" followed, on the
    same line, by a number with exactly two decimals (thousands commas
    allowed). Matching is case-insensitive.

    A strict pass runs first and ignores keywords that are the tail of a
    longer word or of "sub total" / "sub-total", so a "Subtotal" line that
    precedes the real total is not picked up by mistake. If the strict pass
    finds nothing, a loose pass accepts the keyword anywhere (so a document
    that only has a subtotal still yields a value).

    Args:
        text: OCR text. None or empty text yields 0.0.

    Returns:
        The amount as a float, or 0.0 when nothing matches.
    """
    if not text:
        return 0.0

    lowered = text.lower()
    amount = r'([\d,]+\.\d{2})'
    loose = r'(?:total|amount|balance).*?' + amount
    # Lookbehinds: keyword must not follow a letter ("subtotal") nor
    # "sub " / "sub-".
    strict = r'(?<![a-z])(?<!sub[ -])' + loose

    for pattern in (strict, loose):
        match = re.search(pattern, lowered)
        if not match:
            continue
        try:
            return float(match.group(1).replace(',', ''))
        except ValueError:
            # Defensive: the captured text was not a parseable number.
            continue
    return 0.0

# =====================================================
#  3. AI EXTRACTION
# =====================================================
def repair_json(json_str):
    """
    Pull a JSON object out of text that may have extra content around it
    (for example markdown code fences).

    The span from the first '{' to the last '}' is parsed with json.loads.

    Args:
        json_str: Text that may contain a JSON object.

    Returns:
        The parsed dict, or {} when the input is empty, is not a string,
        contains no braces, or the span is not valid JSON.
    """
    if not json_str or not isinstance(json_str, str):
        return {}

    start = json_str.find('{')
    # rfind returns -1 when there is no '}', which makes `end` 0.
    end = json_str.rfind('}') + 1
    if start == -1 or end == 0:
        return {}

    try:
        return json.loads(json_str[start:end])
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically deep nesting. Anything else is a real bug and
        # should surface instead of being silently swallowed.
        return {}

def extract_intelligent_json(text, metadata):
    """
    Turn OCR text into structured document data using the language model,
    filling gaps with the regex fallbacks.

    Args:
        text: OCR text of the document. None is treated as empty text.
            Only the first 2000 characters are sent to the model.
        metadata: Not used by this function; kept so existing callers
            keep working.

    Returns:
        {} when no model is loaded. Otherwise a dict with:
        - "doc_type": as produced by the model, defaulting to "invoice"
        - "data": dict of extracted fields; "vendor_name" and "total" are
          taken from regex_extract_vendor / regex_extract_total when the
          model did not supply a usable value.
    """
    if not model: return {}

    text = text or ""

    # Stronger Prompt
    prompt = f"""<|im_start|>system
    You are a financial data extractor. 
    TASK: Convert OCR text into JSON.
    
    MANDATORY RULES:
    1. Extract the VENDOR_NAME (Who sent the invoice?)
    2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
    3. Extract LINE_ITEMS.
    
    JSON FORMAT:
    {{
        "doc_type": "invoice",
        "data": {{
            "vendor_name": "Acme Corp", 
            "date": "2024-01-01",
            "reference_number": "INV-001",
            "total": 100.00,
            "line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
        }}
    }}
    <|im_end|>
    <|im_start|>user
    DOCUMENT TEXT:
    {text[:2000]}
    <|im_end|>
    <|im_start|>assistant
    ```json
    """

    inputs = tokenizer(prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)

    # A causal LM returns prompt + continuation. Decode only the newly
    # generated tokens: otherwise the example JSON embedded in the prompt
    # is the first '{' the parser sees, and the model's real answer is
    # either unparseable or replaced by the "Acme Corp" example.
    prompt_len = inputs["input_ids"].shape[1]
    raw_output = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    data = repair_json(raw_output)

    # --- FALLBACK LAYER ---
    # If AI returned empty/garbage data, overlay with Regex.
    # The "data" payload must be a dict; the model can emit null or a list.
    if not isinstance(data, dict) or not isinstance(data.get("data"), dict):
        data = {"doc_type": "invoice", "data": {}}
    data.setdefault("doc_type", "invoice")

    inner = data["data"]

    # Fix Name
    if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
        inner["vendor_name"] = regex_extract_vendor(text)

    # Fix Total
    if not inner.get("total"):
        inner["total"] = regex_extract_total(text)

    return data