File size: 5,410 Bytes
43316bd
 
 
 
c835cd1
43316bd
 
39bc8a4
43316bd
 
dcb9f42
c835cd1
43316bd
 
 
 
 
c835cd1
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39bc8a4
43316bd
86f6949
43316bd
c835cd1
 
 
 
 
 
 
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43316bd
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86f6949
 
 
c835cd1
86f6949
 
 
c835cd1
86f6949
 
 
39bc8a4
43316bd
dcb9f42
c835cd1
dcb9f42
c835cd1
 
 
 
 
 
 
39bc8a4
c835cd1
dcb9f42
c835cd1
dcb9f42
c835cd1
 
 
 
 
dcb9f42
 
 
 
c835cd1
 
dcb9f42
 
 
 
 
43316bd
c835cd1
dcb9f42
86f6949
 
 
c835cd1
 
 
 
 
 
 
 
 
 
 
 
 
 
39bc8a4
c835cd1
86f6949
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import os
import json
import re
import config

# Load the tokenizer and model once at import time.
# On failure both names are set to None so the module can still be imported;
# the failure reason is printed instead of being silently discarded.
print(f">>> Loading AI Model: {config.MODEL_ID}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_ID,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
except Exception as exc:  # best-effort boundary: keep the app importable
    tokenizer = None
    model = None
    print(f"❌ Model Failed to Load: {exc}")

# =====================================================
#  1. ADVANCED OCR PIPELINE
# =====================================================
def preprocess_image(image):
    """
    Prepare an image for OCR.

    Steps, in the order they are applied:
    1. Convert to grayscale (mode 'L')
    2. Boost contrast by a factor of 2.0
    3. Apply the SHARPEN filter

    Returns the processed image.
    """
    grayscale = image.convert('L')
    contrasted = ImageEnhance.Contrast(grayscale).enhance(2.0)
    # Sharpening is the last step (helps with blurry fonts)
    return contrasted.filter(ImageFilter.SHARPEN)

def perform_ocr(file_obj):
    """
    Run OCR on the first page of a PDF, or on an image file.

    Args:
        file_obj: Path to the input file, or None.

    Returns:
        A tuple ``(text, original_img, meta)`` where ``text`` is the string
        produced by Tesseract, ``original_img`` is the unprocessed page/image,
        and ``meta`` holds ``filename`` and ``size_kb``. When ``file_obj`` is
        None, or on any error, returns ``("", None, {})``.
    """
    if file_obj is None:
        return "", None, {}
    try:
        filename = os.path.basename(file_obj)

        if filename.lower().endswith(".pdf"):
            # Only the first page is rendered, at 300 DPI for sharper text.
            images = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
            if not images:
                print("OCR Error: PDF conversion produced no pages")
                return "", None, {}
            original_img = images[0]
        else:
            # convert() returns a loaded copy, so the file handle opened by
            # Image.open can be released as soon as the copy exists.
            with Image.open(file_obj) as opened:
                original_img = opened.convert("RGB")

        # Preprocess for Tesseract
        processed_img = preprocess_image(original_img)

        # Run Tesseract
        text = pytesseract.image_to_string(processed_img)

        # Metadata extraction
        meta = {
            "filename": filename,
            "size_kb": os.path.getsize(file_obj) / 1024,
        }

        return text, original_img, meta
    except Exception as e:
        # Deliberate best-effort boundary: callers get an empty result.
        print(f"OCR Error: {e}")
        return "", None, {}

# =====================================================
#  2. REGEX FALLBACKS (The "Generic Name" Fix)
# =====================================================
def regex_extract_vendor(text):
    """
    Heuristic, regex-based name lookup used as a fallback.

    Lines are stripped, and lines of three characters or fewer are dropped.
    If a line is exactly a "bill to" / "invoice to" / "from" / "vendor"
    label (optional trailing colon, case-insensitive), the line after it is
    returned. Otherwise the first line is returned, or the second line when
    the first contains "invoice". Returns "Unknown" when nothing qualifies.
    """
    candidates = []
    for raw in text.split('\n'):
        cleaned = raw.strip()
        if len(cleaned) > 3:
            candidates.append(cleaned)

    label_patterns = (r'^(bill|invoice)\s*to:?$', r'^(from|vendor):?$')

    # 1. A label line: the name is taken from the line right after it.
    #    Pairing each line with its successor skips a label on the last line.
    for label, following in zip(candidates, candidates[1:]):
        lowered = label.lower()
        if any(re.search(pattern, lowered) for pattern in label_patterns):
            return following

    # 2. Otherwise fall back to the top of the document, skipping an
    #    "invoice" header on the first line.
    if not candidates:
        return "Unknown"
    if "invoice" not in candidates[0].lower():
        return candidates[0]
    if len(candidates) > 1:
        return candidates[1]
    return "Unknown"

def regex_extract_total(text):
    """
    Find a monetary total in OCR text using a keyword + amount pattern.

    Looks for "total", "amount" or "balance" followed on the same line by a
    number with exactly two decimals (e.g. "Total $1,234.56"). Keywords that
    are part of "subtotal", "sub total" or "sub-total" are skipped so a
    subtotal line is not mistaken for the total.

    Args:
        text: OCR text; None or empty is treated as "nothing found".

    Returns:
        The first matching amount as a float, or 0.0 when none is found.
    """
    if not text:
        return 0.0

    # The negative lookbehinds reject "subtotal" and "sub total"/"sub-total".
    pattern = r'(?<!sub)(?<!sub[\s-])(?:total|amount|balance).*?([\d,]+\.\d{2})'
    match = re.search(pattern, text.lower())
    if match is None:
        return 0.0

    try:
        # Strip thousands separators before converting.
        return float(match.group(1).replace(',', ''))
    except ValueError:
        return 0.0

# =====================================================
#  3. AI EXTRACTION
# =====================================================
def repair_json(json_str):
    """
    Extract and parse a JSON object embedded in surrounding text.

    The candidate is the span from the first '{' to the last '}' in
    ``json_str``; anything before or after it (code fences, chatter) is
    ignored.

    Args:
        json_str: Raw text that may contain a JSON object. Empty, None, or
            non-string input yields an empty dict.

    Returns:
        The parsed dict, or ``{}`` when no span exists or it fails to parse.
    """
    if not json_str or not isinstance(json_str, str):
        return {}

    start = json_str.find('{')
    end = json_str.rfind('}') + 1
    # No opening brace, no closing brace, or the last '}' precedes the first '{'.
    if start == -1 or end <= start:
        return {}

    try:
        parsed = json.loads(json_str[start:end])
    except (ValueError, RecursionError):
        # json.JSONDecodeError is a ValueError; RecursionError covers
        # pathologically nested input.
        return {}
    return parsed if isinstance(parsed, dict) else {}

def extract_intelligent_json(text, metadata):
    """
    Extract structured invoice data from OCR text with the language model,
    filling gaps with regex heuristics.

    Args:
        text: OCR text of the document; only the first 2000 characters are
            placed in the prompt.
        metadata: Accepted for interface compatibility; not used here.

    Returns:
        ``{}`` when no model is loaded. Otherwise a dict with a "data" dict
        whose "vendor_name" and "total" are always populated: from the model
        output when present, else from the regex fallbacks.
    """
    if not model: return {}
    
    # Stronger Prompt
    prompt = f"""<|im_start|>system
    You are a financial data extractor. 
    TASK: Convert OCR text into JSON.
    
    MANDATORY RULES:
    1. Extract the VENDOR_NAME (Who sent the invoice?)
    2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
    3. Extract LINE_ITEMS.
    
    JSON FORMAT:
    {{
        "doc_type": "invoice",
        "data": {{
            "vendor_name": "Acme Corp", 
            "date": "2024-01-01",
            "reference_number": "INV-001",
            "total": 100.00,
            "line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
        }}
    }}
    <|im_end|>
    <|im_start|>user
    DOCUMENT TEXT:
    {text[:2000]}
    <|im_end|>
    <|im_start|>assistant
    ```json
    """
    
    inputs = tokenizer(prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)
    
    # For a causal LM, generate() returns the prompt tokens followed by the
    # new ones. Decode only the continuation: the prompt carries its own
    # example JSON, which would otherwise corrupt the brace-span parse.
    prompt_len = inputs["input_ids"].shape[1]
    raw_output = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    data = repair_json(raw_output)
    
    # --- FALLBACK LAYER ---
    # If AI returned empty/garbage data, overlay with Regex
    if not isinstance(data, dict) or "data" not in data:
        data = {"doc_type": "invoice", "data": {}}
    
    inner = data.get("data")
    if not isinstance(inner, dict):
        # The model may emit null / a list / a string under "data".
        inner = {}
    
    # Fix Name
    if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
        inner["vendor_name"] = regex_extract_vendor(text)
        
    # Fix Total
    if not inner.get("total"):
        inner["total"] = regex_extract_total(text)
        
    data["data"] = inner
    return data