Spaces:

vachaspathi
/

Agentic

Sleeping

App Files Files Community

Agentic / ai_engine.py

vachaspathi

Update ai_engine.py

c835cd1 verified 4 months ago

raw

history blame contribute delete

5.41 kB

	import torch
	from transformers import AutoModelForCausalLM, AutoTokenizer
	import pytesseract
	from pdf2image import convert_from_path
	from PIL import Image, ImageEnhance, ImageFilter
	import os
	import json
	import re
	import config

	# Load Model
	print(f">>> Loading AI Model: {config.MODEL_ID}...")
	try:
	tokenizer = AutoTokenizer.from_pretrained(config.MODEL_ID)
	model = AutoModelForCausalLM.from_pretrained(config.MODEL_ID, device_map="cpu", torch_dtype=torch.float32, low_cpu_mem_usage=True)
	except:
	model = None
	print("❌ Model Failed to Load")

	# =====================================================
	# 1. ADVANCED OCR PIPELINE
	# =====================================================
	def preprocess_image(image):
	"""
	Cleans image for better OCR results:
	1. Grayscale
	2. Sharpen
	3. Increase Contrast
	"""
	# Convert to gray
	image = image.convert('L')

	# Increase Contrast
	enhancer = ImageEnhance.Contrast(image)
	image = enhancer.enhance(2.0)

	# Sharpen (helps with blurry fonts)
	image = image.filter(ImageFilter.SHARPEN)

	return image

	def perform_ocr(file_obj):
	if file_obj is None: return "", None, {}
	try:
	filename = os.path.basename(file_obj)

	# HIGH QUALITY CONVERSION (DPI=300)
	if filename.lower().endswith(".pdf"):
	# dpi=300 makes text much clearer than default 72
	images = convert_from_path(file_obj, first_page=1, last_page=1, dpi=300)
	original_img = images[0]
	else:
	original_img = Image.open(file_obj).convert("RGB")

	# Preprocess for Tesseract
	processed_img = preprocess_image(original_img)

	# Run Tesseract
	text = pytesseract.image_to_string(processed_img)

	# Metadata extraction
	meta = {
	"filename": filename,
	"size_kb": os.path.getsize(file_obj)/1024
	}

	return text, original_img, meta
	except Exception as e:
	print(f"OCR Error: {e}")
	return "", None, {}

	# =====================================================
	# 2. REGEX FALLBACKS (The "Generic Name" Fix)
	# =====================================================
	def regex_extract_vendor(text):
	"""
	If AI fails, we use old-school logic to find the name.
	"""
	lines = [l.strip() for l in text.split('\n') if len(l.strip()) > 3]

	# 1. Look for "To" / "From"
	for i, line in enumerate(lines):
	if re.search(r'^(bill\|invoice)\s*to:?$', line.lower()):
	# The NEXT line is likely the customer name
	if i + 1 < len(lines): return lines[i+1]

	if re.search(r'^(from\|vendor):?$', line.lower()):
	if i + 1 < len(lines): return lines[i+1]

	# 2. Top-most bold text (heuristic: usually the first or second line is the Company Name)
	if len(lines) > 0:
	# Ignore common headers
	if "invoice" not in lines[0].lower(): return lines[0]
	if len(lines) > 1: return lines[1]

	return "Unknown"

	def regex_extract_total(text):
	# Looks for "Total $1,234.56" patterns
	match = re.search(r'(?:total\|amount\|balance).*?([\d,]+\.\d{2})', text.lower())
	if match:
	try: return float(match.group(1).replace(',', ''))
	except: pass
	return 0.0

	# =====================================================
	# 3. AI EXTRACTION
	# =====================================================
	def repair_json(json_str):
	if not json_str: return {}
	try:
	# Find the first { and the last }
	start = json_str.find('{')
	end = json_str.rfind('}') + 1
	if start != -1 and end != 0:
	return json.loads(json_str[start:end])
	except: pass
	return {}

	def extract_intelligent_json(text, metadata):
	if not model: return {}

	# Stronger Prompt
	prompt = f"""<\|im_start\|>system
	You are a financial data extractor.
	TASK: Convert OCR text into JSON.

	MANDATORY RULES:
	1. Extract the VENDOR_NAME (Who sent the invoice?)
	2. Extract the DOCUMENT_TYPE: ["invoice", "bill", "expense", "estimate"]
	3. Extract LINE_ITEMS.

	JSON FORMAT:
	{{
	"doc_type": "invoice",
	"data": {{
	"vendor_name": "Acme Corp",
	"date": "2024-01-01",
	"reference_number": "INV-001",
	"total": 100.00,
	"line_items": [ {{"name": "Service", "description": "...", "rate": 100, "quantity": 1}} ]
	}}
	}}
	<\|im_end\|>
	<\|im_start\|>user
	DOCUMENT TEXT:
	{text[:2000]}
	<\|im_end\|>
	<\|im_start\|>assistant
	```json
	"""

	inputs = tokenizer(prompt, return_tensors="pt")
	out = model.generate(**inputs, max_new_tokens=500, temperature=0.1)

	raw_output = tokenizer.decode(out[0])
	data = repair_json(raw_output)

	# --- FALLBACK LAYER ---
	# If AI returned empty/garbage data, overlay with Regex
	if not data or "data" not in data:
	data = {"doc_type": "invoice", "data": {}}

	inner = data.get("data", {})

	# Fix Name
	if not inner.get("vendor_name") or inner["vendor_name"] == "Unknown":
	inner["vendor_name"] = regex_extract_vendor(text)

	# Fix Total
	if not inner.get("total"):
	inner["total"] = regex_extract_total(text)

	data["data"] = inner
	return data