# NOTE: web-viewer residue was removed from the top of this file ("Spaces: Sleeping",
# "File size: 5,410 Bytes", and the commit-hash / line-number gutters). None of it
# was part of the program.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import pytesseract
from pdf2image import convert_from_path
from PIL import Image, ImageEnhance, ImageFilter
import os
import json
import re
import config
# Load Model
# The tokenizer and model are loaded once at import time. Both names are
# pre-initialised to None so they always exist; extract_intelligent_json()
# checks `if not model` and skips AI extraction when loading failed.
print(f">>> Loading AI Model: {config.MODEL_ID}...")
tokenizer = None
model = None
try:
    tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_ID,
        device_map="cpu",
        torch_dtype=torch.float32,
        low_cpu_mem_usage=True,
    )
except Exception as exc:
    # `except Exception` (not a bare except) lets Ctrl-C / SystemExit through.
    # The module still imports; `model` simply stays None.
    print("❌ Model Failed to Load")
    print(f"    Reason: {exc}")
# =====================================================
# 1. ADVANCED OCR PIPELINE
# =====================================================
def preprocess_image(image):
    """
    Prepare an image for OCR and return the processed copy.

    Steps, in the order they are applied:
    1. Convert to single-channel grayscale (mode 'L')
    2. Boost contrast by a factor of 2.0
    3. Apply the SHARPEN filter (helps with blurry fonts)
    """
    grayscale = image.convert('L')
    high_contrast = ImageEnhance.Contrast(grayscale).enhance(2.0)
    return high_contrast.filter(ImageFilter.SHARPEN)
def perform_ocr(file_obj):
    """Run Tesseract OCR on a PDF's first page or on an image file.

    Args:
        file_obj: Filesystem path of the document. A name ending in ".pdf"
            (case-insensitive) is rasterised with pdf2image; anything else
            is opened as an image. ``None`` is accepted.

    Returns:
        A ``(text, original_img, meta)`` tuple: the OCR text, the
        un-preprocessed image, and a dict with ``"filename"`` and
        ``"size_kb"``. On ``None`` input or on any error the result is
        ``("", None, {})``; errors are printed, not raised.
    """
    if file_obj is None:
        return "", None, {}
    try:
        filename = os.path.basename(file_obj)
        if filename.lower().endswith(".pdf"):
            # Only the first page is rendered, at 300 DPI for clearer glyphs.
            pages = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
            original_img = pages[0]
        else:
            # convert() returns an independent in-memory copy, so the file
            # handle can be released as soon as the block exits.
            with Image.open(file_obj) as opened:
                original_img = opened.convert("RGB")
        # Preprocess for Tesseract
        processed_img = preprocess_image(original_img)
        # Run Tesseract
        text = pytesseract.image_to_string(processed_img)
        # Metadata extraction
        meta = {
            "filename": filename,
            "size_kb": os.path.getsize(file_obj) / 1024,
        }
        return text, original_img, meta
    except Exception as e:
        # Best-effort boundary: report the failure and hand back empty results.
        print(f"OCR Error: {e}")
        return "", None, {}
# =====================================================
# 2. REGEX FALLBACKS (The "Generic Name" Fix)
# =====================================================
def regex_extract_vendor(text):
    """
    Heuristic fallback for finding a party name in OCR text when the AI fails.

    Lines are stripped, and lines of 3 characters or fewer are discarded.
    Strategy:
    1. If a line consists solely of a "Bill To" / "Invoice To" / "From" /
       "Vendor" header (optional trailing colon), return the line after it.
    2. Otherwise return the first line, unless it contains "invoice", in
       which case return the second line.

    Returns "Unknown" when nothing qualifies, including for empty or None text.

    NOTE(review): a "Bill To" header normally introduces the recipient rather
    than the sender, so rule 1 can return the customer instead of the vendor.
    Behavior is kept as-is; confirm what callers expect.
    """
    if not text:
        return "Unknown"

    lines = [
        stripped
        for stripped in (raw.strip() for raw in text.split('\n'))
        if len(stripped) > 3
    ]

    # 1. Look for "To" / "From" style headers; the NEXT line carries the name.
    for i, line in enumerate(lines):
        lowered = line.lower()
        is_header = (
            re.search(r'^(bill|invoice)\s*to:?$', lowered)
            or re.search(r'^(from|vendor):?$', lowered)
        )
        if is_header and i + 1 < len(lines):
            return lines[i + 1]

    # 2. Top-of-page heuristic: the first or second line is usually the company name.
    if lines:
        # Ignore a leading "INVOICE"-style title.
        if "invoice" not in lines[0].lower():
            return lines[0]
        if len(lines) > 1:
            return lines[1]
    return "Unknown"
def regex_extract_total(text):
    """
    Pull a monetary total such as "Total $1,234.56" out of OCR text.

    The keyword ("total", "amount" or "balance") and the number must be on
    the same line. A whole-word keyword match is tried first so "Subtotal"
    is not mistaken for "Total"; if that finds nothing, the looser substring
    match is used as a fallback.

    Returns the amount as a float, or 0.0 when nothing matches or text is
    empty/None.
    """
    if not text:
        return 0.0

    lowered = text.lower()
    amount = r'([\d,]+\.\d{2})'
    match = (
        # \b rejects keywords embedded in a longer word (e.g. "subtotal").
        re.search(r'\b(?:total|amount|balance).*?' + amount, lowered)
        # Fallback: original substring behaviour.
        or re.search(r'(?:total|amount|balance).*?' + amount, lowered)
    )
    if match:
        try:
            return float(match.group(1).replace(',', ''))
        except ValueError:
            pass
    return 0.0
# =====================================================
# 3. AI EXTRACTION
# =====================================================
def repair_json(json_str):
    """
    Extract and parse the JSON object embedded in a noisy string.

    The span from the first '{' to the last '}' (inclusive) is handed to
    json.loads, which tolerates surrounding chatter such as code fences.

    Returns the parsed object, or {} when the input is empty, is not a
    string, contains no brace pair, or the span is not valid JSON.
    """
    if not json_str or not isinstance(json_str, str):
        return {}

    # Find the first { and the last }
    start = json_str.find('{')
    end = json_str.rfind('}') + 1  # rfind() == -1 (no brace) becomes 0 here
    if start == -1 or end == 0:
        return {}

    try:
        return json.loads(json_str[start:end])
    except (ValueError, RecursionError):
        # JSONDecodeError subclasses ValueError; RecursionError covers
        # pathologically nested input. Both mean "unparseable".
        return {}
def extract_intelligent_json(text, metadata):
    """
    Convert OCR text into structured invoice JSON with the language model.

    Args:
        text: OCR text of the document; only the first 2000 characters are
            placed in the prompt.
        metadata: Accepted for interface compatibility; not used here.

    Returns:
        {} when no model is loaded. Otherwise a dict shaped like
        {"doc_type": ..., "data": {...}}, where a missing or "Unknown"
        "vendor_name" and a missing or zero "total" are filled in by the
        regex fallbacks.
    """
    if not model:
        return {}
    # Stronger Prompt
    prompt = f"""<|im_start|>system
You are a financial data extractor.
TASK: Convert OCR text into JSON.
MANDATORY RULES:
1. Extract the VENDOR_NAME (Who sent the invoice?)
2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
3. Extract LINE_ITEMS.
JSON FORMAT:
{{
"doc_type": "invoice",
"data": {{
"vendor_name": "Acme Corp",
"date": "2024-01-01",
"reference_number": "INV-001",
"total": 100.00,
"line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
}}
}}
<|im_end|>
<|im_start|>user
DOCUMENT TEXT:
{text[:2000]}
<|im_end|>
<|im_start|>assistant
```json
"""
    inputs = tokenizer(prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)

    # generate() returns prompt + completion for causal LMs. Decode only the
    # newly generated tokens; otherwise the example JSON embedded in the
    # prompt ends up inside the span repair_json() tries to parse and the
    # parse fails every time.
    prompt_len = inputs["input_ids"].shape[1]
    raw_output = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True)
    data = repair_json(raw_output)

    # --- FALLBACK LAYER ---
    # If AI returned empty/garbage data, overlay with Regex
    if not isinstance(data, dict) or "data" not in data:
        data = {"doc_type": "invoice", "data": {}}
    inner = data.get("data")
    if not isinstance(inner, dict):
        # The model may emit e.g. null or a list under "data"; start clean
        # instead of crashing on .get() below.
        inner = {}

    # Fix Name
    if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
        inner["vendor_name"] = regex_extract_vendor(text)
    # Fix Total
    if not inner.get("total"):
        inner["total"] = regex_extract_total(text)

    data["data"] = inner
    return data