File size: 3,958 Bytes
6034171
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
# File: llm_processor.py
import os
import json
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Model Configuration
# Hugging Face Hub repository id passed as ``repo_id`` to ``hf_hub_download``.
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
# File name inside that repository passed as ``filename`` to ``hf_hub_download``.
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

# Shared model handle: assigned by ``load_llm_model()`` and stays ``None`` until
# a load succeeds; ``generate_json_from_text()`` refuses to run while it is falsy.
llm = None

def load_llm_model():
    """Fetch the GGUF weights from Hugging Face and set up the shared model.

    The access token is read from the ``HF_TOKEN`` environment variable,
    ``MODEL_FILE`` is downloaded from ``MODEL_REPO``, and the resulting
    ``Llama`` instance is stored in the module-level ``llm``.

    Any ``Exception`` raised along the way (including the missing-token
    error) is printed and leaves ``llm`` set to ``None``; it is not
    propagated to the caller.
    """
    global llm
    try:
        access_token = os.getenv("HF_TOKEN")
        if not access_token:
            raise EnvironmentError("HF_TOKEN environment variable not found.")

        print(f"Downloading model {MODEL_FILE}...")
        weights_path = hf_hub_download(
            repo_id=MODEL_REPO,
            filename=MODEL_FILE,
            token=access_token,
        )

        print("Loading GGUF model...")
        # Loader settings are fixed here; collect them in one place before
        # handing them to the Llama constructor.
        llama_options = {
            "model_path": weights_path,
            "n_ctx": 2048,
            "n_threads": 2,
            "n_gpu_layers": 0,
            "verbose": False,
        }
        llm = Llama(**llama_options)
        print("GGUF model loaded successfully.")
    except Exception as load_error:
        # Best-effort loader: report the problem and make sure no stale or
        # half-initialised model is left behind.
        print(f"Fatal error loading LLM: {load_error}")
        llm = None
def generate_json_from_text(ocr_text: str) -> dict:
    """
    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.

    Args:
        ocr_text: Raw text produced by OCR; it is embedded verbatim at the end
            of the prompt.

    Returns:
        The JSON object parsed from the model output. If the output contains
        no parseable JSON object, a fallback dict with the keys ``"error"``,
        ``"raw_output"`` (the stripped model text) and ``"cleaned_ocr_text"``
        (the unmodified ``ocr_text`` argument) is returned instead.

    Raises:
        RuntimeError: If the module-level ``llm`` has not been loaded.
    """
    if not llm:
        raise RuntimeError("LLM is not available.")
    
    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them without confusion (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa' – treat * or other artifacts as typos, not synonyms).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse with seller's address).
- For items, parse lines matching 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast – focus only on relevant data.

**JSON Schema:**
{{
  "invoice_number": "string or null",
  "invoice_date": "DD-MM-YYYY or null",
  "seller": "string or null",
  "invoice_to": "string or null",
  "items": [
    {{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
  ],
  "subtotal": "float or null",
  "tax_amount": "float or null",
  "grand_total": "float or null"
}}
**OCR Text:**
{ocr_text}
"""
    output = llm(
        prompt,
        max_tokens=1024,  # Increased for longer JSON
        temperature=0.5,  # Slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>", "</s>"],
        echo=False
    )
    
    generated_text = output["choices"][0]["text"].strip()

    try:
        start_idx = generated_text.find("{")
        if start_idx == -1:
            raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
        # Decode exactly one JSON value starting at the first "{". Unlike
        # slicing up to the last "}", this still succeeds when the model
        # appends extra text (possibly containing braces) after the object,
        # and it reports a missing closing brace as a JSONDecodeError.
        json_data, _ = json.JSONDecoder().raw_decode(generated_text, start_idx)
    except json.JSONDecodeError:
        # Fallback: return a structured error together with the raw model
        # output and the OCR text exactly as it was passed in.
        return {
            "error": "LLM failed to generate valid JSON.",
            "raw_output": generated_text,
            "cleaned_ocr_text": ocr_text
        }
    return json_data