# NOTE: Hugging Face Space file-viewer chrome (status lines, file size,
# column ruler) was captured here by the scrape; removed so the file parses.
# File: llm_processor.py
import os
import json
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
# Model Configuration
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"
llm = None
def load_llm_model():
    """Download the GGUF model from Hugging Face and load it into the
    module-level ``llm`` global.

    Best-effort by design: any failure (missing HF_TOKEN, download error,
    model-load error) is printed and leaves ``llm`` as None instead of
    propagating, so callers detect availability via ``llm`` itself.
    """
    global llm
    try:
        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            # RuntimeError, not EnvironmentError: EnvironmentError is a
            # legacy alias of OSError, and a missing environment variable
            # is a configuration problem, not an OS failure. The exception
            # is caught just below either way; the printed message text is
            # unchanged.
            raise RuntimeError("HF_TOKEN environment variable not found.")
        print(f"Downloading model {MODEL_FILE}...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)
        print("Loading GGUF model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,       # prompt + completion context window
            n_threads=2,      # CPU-only deployment: keep thread count modest
            n_gpu_layers=0,   # no GPU offload
            verbose=False,
        )
        print("GGUF model loaded successfully.")
    except Exception as e:
        # Broad catch is deliberate (top-level load boundary): callers are
        # expected to check `llm is None` rather than handle exceptions.
        print(f"Fatal error loading LLM: {e}")
        llm = None
def generate_json_from_text(ocr_text: str) -> dict:
    """Convert raw OCR text into a structured invoice JSON object via the LLM.

    Args:
        ocr_text: Raw text produced by the OCR stage.

    Returns:
        The dict parsed from the model output on success; otherwise a
        structured error dict with keys "error", "raw_output", and
        "cleaned_ocr_text".

    Raises:
        RuntimeError: If the module-level ``llm`` has not been loaded.
    """
    if not llm:
        raise RuntimeError("LLM is not available.")
    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them without confusion (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa' – treat * or other artifacts as typos, not synonyms).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse with seller's address).
- For items, parse lines matching 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast – focus only on relevant data.
**JSON Schema:**
{{
"invoice_number": "string or null",
"invoice_date": "DD-MM-YYYY or null",
"seller": "string or null",
"invoice_to": "string or null",
"items": [
{{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
],
"subtotal": "float or null",
"tax_amount": "float or null",
"grand_total": "float or null"
}}
**OCR Text:**
{ocr_text}
"""
    output = llm(
        prompt,
        max_tokens=1024,  # Increased for longer JSON
        temperature=0.5,  # Slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>", "</s>"],
        echo=False
    )
    generated_text = output["choices"][0]["text"].strip()
    try:
        start_idx = generated_text.find("{")
        end_idx = generated_text.rfind("}") + 1
        # BUGFIX: rfind() returns -1 when no "}" exists, making end_idx 0 —
        # the old guard `end_idx != -1` was therefore always true and a
        # missing closing brace produced a bogus slice. Require the closing
        # brace to come after the opening one instead.
        if start_idx != -1 and end_idx > start_idx:
            json_str = generated_text[start_idx:end_idx]
            return json.loads(json_str)
        raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
    except json.JSONDecodeError:
        # Fallback: Return structured error with cleaned OCR text
        return {
            "error": "LLM failed to generate valid JSON.",
            "raw_output": generated_text,
            "cleaned_ocr_text": ocr_text
        }