Ayush soni committed on
Commit
6034171
·
1 Parent(s): 342a0c3

Add application file

Browse files
Files changed (5) hide show
  1. app.py +43 -0
  2. llm_processor.py +96 -0
  3. main.py +65 -0
  4. ocr_processor.py +31 -0
  5. requirements.txt +10 -0
app.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from ocr_processor import extract_text_from_image
3
+ from llm_processor import load_llm_model, generate_json_from_text
4
+
5
# Load the LLM once at import time so requests reuse the same instance.
# load_llm_model() reports failures itself and leaves the model unset; in that
# case generate_json_from_text() raises RuntimeError, handled below.
load_llm_model()

# Prefix of the message extract_text_from_image() returns when OCR itself fails.
_OCR_ERROR_PREFIX = "An error occurred during OCR"


def _read_upload(file):
    """Return the uploaded content as bytes, or None when nothing was uploaded.

    Depending on the Gradio version and the ``type`` of the ``gr.File``
    component, the callback may receive raw bytes, a path string, or a
    temp-file wrapper exposing ``.name``; all of them are accepted here.
    """
    if file is None:
        return None
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)
    # Prefer re-opening by path: it does not depend on the wrapper's read position.
    path = file if isinstance(file, str) else getattr(file, "name", None)
    if isinstance(path, str):
        with open(path, "rb") as fh:
            return fh.read()
    return file.read()


def process_invoice(file):
    """Run OCR and LLM extraction on an uploaded invoice image.

    Args:
        file: The value delivered by the ``gr.File`` input (bytes, a path,
            a file-like object, or None when nothing was uploaded).

    Returns:
        A ``(raw_text, json_data)`` tuple for the text box and JSON viewer.
        On any failure ``json_data`` is a dict with a single ``"error"`` key.
    """
    image_bytes = _read_upload(file)
    if not image_bytes:
        return "", {"error": "No file was uploaded."}

    # Step 1: Extract raw text
    raw_text = extract_text_from_image(image_bytes)
    if not raw_text or "No text detected" in raw_text:
        return raw_text, {"error": "No text could be extracted from the image."}
    if raw_text.startswith(_OCR_ERROR_PREFIX):
        # The OCR layer reports failures as text; do not feed that to the LLM.
        return raw_text, {"error": "OCR failed; see the extracted text for details."}

    # Step 2: Convert raw text → structured JSON
    try:
        json_data = generate_json_from_text(raw_text)
    except RuntimeError as exc:
        # Raised when the model could not be loaded at startup.
        return raw_text, {"error": str(exc)}

    return raw_text, json_data


### Gradio UI
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 Invoice Processing App")
    gr.Markdown("Upload an invoice image. The app extracts **OCR text** and generates **structured JSON**.")

    with gr.Row():
        # type="binary" hands the callback the file content as bytes.
        input_file = gr.File(
            label="Upload Invoice Image",
            type="binary",
            file_types=[".png", ".jpg", ".jpeg"],
        )

    with gr.Row():
        raw_text_output = gr.Textbox(label="Extracted OCR Text", lines=10)
        json_output = gr.JSON(label="Structured JSON")

    process_btn = gr.Button("Process Invoice")

    process_btn.click(
        process_invoice,
        inputs=input_file,
        outputs=[raw_text_output, json_output],
    )

demo.launch()
llm_processor.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: llm_processor.py
2
+ import os
3
+ import json
4
+ from huggingface_hub import hf_hub_download
5
+ from llama_cpp import Llama
6
+
7
# Model Configuration
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

# Module-wide model handle; stays None until load_llm_model() succeeds.
llm = None


def load_llm_model():
    """Download the GGUF model from Hugging Face and load it into ``llm``.

    The ``HF_TOKEN`` environment variable is passed to the download when set
    and is otherwise omitted, so a missing token alone no longer aborts the
    load. Any failure is printed and leaves ``llm`` as None; nothing is raised.
    """
    global llm
    try:
        # An empty value is normalised to None so it is not sent as a token.
        hf_token = os.getenv("HF_TOKEN") or None

        print(f"Downloading model {MODEL_FILE}...")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)

        print("Loading GGUF model...")
        llm = Llama(
            model_path=model_path,
            n_ctx=2048,
            n_threads=2,
            n_gpu_layers=0,
            verbose=False,
        )
        print("GGUF model loaded successfully.")
    except Exception as e:
        # Best-effort by design: callers check for a missing model themselves.
        print(f"Fatal error loading LLM: {e}")
        llm = None
36
def _extract_json_object(text):
    """Return the JSON object starting at the first '{' in *text*, else None.

    Text after the object (closing code fences, commentary, stray braces) is
    ignored; a malformed or truncated object yields None.
    """
    start = text.find("{")
    if start == -1:
        return None
    try:
        parsed, _ = json.JSONDecoder().raw_decode(text, start)
    except json.JSONDecodeError:
        return None
    return parsed


def generate_json_from_text(ocr_text: str) -> dict:
    """
    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.

    Args:
        ocr_text: Text recognised from the invoice image.

    Returns:
        The parsed JSON object produced by the model, or a dict with the keys
        ``error``, ``raw_output`` and ``cleaned_ocr_text`` when the model
        output contains no parseable JSON object.

    Raises:
        RuntimeError: If the model has not been loaded.
    """
    if llm is None:
        raise RuntimeError("LLM is not available.")

    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
- Output ONLY the JSON object, with no additional text, markdown, or backticks.
- Interpret OCR errors logically and correct them without confusion (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa' – treat * or other artifacts as typos, not synonyms).
- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
- Format invoice_date as DD-MM-YYYY; infer full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse with seller's address).
- For items, parse lines matching 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
- Be precise and fast – focus only on relevant data.

**JSON Schema:**
{{
  "invoice_number": "string or null",
  "invoice_date": "DD-MM-YYYY or null",
  "seller": "string or null",
  "invoice_to": "string or null",
  "items": [
    {{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
  ],
  "subtotal": "float or null",
  "tax_amount": "float or null",
  "grand_total": "float or null"
}}
**OCR Text:**
{ocr_text}
"""
    output = llm(
        prompt,
        max_tokens=1024,  # Increased for longer JSON
        temperature=0.5,  # Slightly higher for better reasoning
        top_p=0.9,
        stop=["<|endoftext|>", "</s>"],
        echo=False,
    )

    generated_text = output["choices"][0]["text"].strip()

    json_data = _extract_json_object(generated_text)
    if json_data is not None:
        return json_data

    # Fallback: return a structured error alongside the inputs for debugging.
    return {
        "error": "LLM failed to generate valid JSON.",
        "raw_output": generated_text,
        "cleaned_ocr_text": ocr_text,
    }
main.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: main.py
2
+ import os
3
+ from fastapi import FastAPI, File, UploadFile, HTTPException
4
+ from fastapi.responses import JSONResponse
5
+ import uvicorn
6
+ from llm_processor import load_llm_model, generate_json_from_text
7
+ from ocr_processor import extract_text_from_image
8
+
9
# Performance-related environment settings, applied in one step.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

# Metadata for the FastAPI application object.
_API_METADATA = {
    "title": "Invoice Processing API",
    "description": "A single endpoint to process an invoice image and return both raw text and structured JSON.",
}

# The FastAPI application object referenced by the route decorators.
app = FastAPI(**_API_METADATA)
18
+
19
@app.on_event("startup")
def startup_event() -> None:
    """Startup hook: load the LLM once so request handlers can reuse it."""
    load_llm_model()
23
+
24
@app.get("/", summary="Health Check")
def read_root():
    """Health-check endpoint: report that the API is up."""
    status_payload = {"status": "API is running"}
    return status_payload
28
+
29
@app.post("/process_invoice/", summary="Process Invoice to Text & JSON")
async def process_invoice_endpoint(file: UploadFile = File(...)):
    """
    Accepts an image file and returns both the extracted OCR text and the structured JSON data.

    Responses:
        200: ``{"extracted_text": ..., "structured_json": ...}``; when OCR
            finds no text or fails, ``structured_json`` holds an ``error`` key.
        400: The upload is not declared as an image.
        500: Any unexpected failure while reading, OCR-ing or parsing.
    """
    # Validate file type. content_type may be absent, so treat None as "not an image"
    # instead of failing with an AttributeError.
    if not (file.content_type or "").startswith("image/"):
        raise HTTPException(status_code=400, detail="Only image files are supported (e.g., PNG, JPEG).")

    try:
        image_bytes = await file.read()

        # Step 1: Extract text from the image using the OCR processor.
        # NOTE(review): this call and the LLM call below are synchronous and run
        # on the event loop, so requests are handled one at a time. Offloading
        # them to threads would need a lock unless the shared OCR/LLM objects
        # are confirmed thread-safe.
        raw_text = extract_text_from_image(image_bytes)

        if not raw_text or "No text detected" in raw_text:
            return JSONResponse(content={
                "extracted_text": raw_text,
                "structured_json": {"error": "No text could be extracted from the image."}
            })

        # The OCR processor reports its own failures as text with this prefix;
        # such a message must not be handed to the LLM as invoice content.
        if raw_text.startswith("An error occurred during OCR"):
            return JSONResponse(content={
                "extracted_text": raw_text,
                "structured_json": {"error": "OCR failed; see extracted_text for details."}
            })

        # Step 2: Generate structured JSON from the extracted text
        json_data = generate_json_from_text(raw_text)

        # Step 3: Combine both results into a single response
        combined_response = {
            "extracted_text": raw_text,
            "structured_json": json_data
        }

        return JSONResponse(content=combined_response)

    except Exception as e:
        # Top-level boundary: surface the message to the client as a 500.
        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
63
+
64
if __name__ == "__main__":
    # Auto-reload is kept off for production use.
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": False}
    uvicorn.run("main:app", **server_options)
ocr_processor.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # File: ocr_processor.py
2
+ import numpy as np
3
+ from paddleocr import PaddleOCR
4
+ from PIL import Image
5
+ import io
6
+
7
# Shared PaddleOCR engine (English, with angle classification), created once at import.
ocr = PaddleOCR(lang='en', use_angle_cls=True)


def extract_text_from_image(image_bytes: bytes) -> str:
    """
    Performs OCR on a given image using PaddleOCR.

    Args:
        image_bytes: Encoded image content (e.g. PNG or JPEG data).

    Returns:
        The recognised text fragments joined by single spaces;
        "No text detected in the image." when nothing is recognised; or a
        message starting with "An error occurred during OCR:" on any failure.
        Errors are reported through the return value, never raised.
    """
    try:
        # 1. Decode the bytes with PIL, closing the image once it is copied to an array.
        with Image.open(io.BytesIO(image_bytes)) as img:
            rgb_array = np.array(img.convert("RGB"))

        # PIL yields RGB, while PaddleOCR's own file/bytes decoding goes through
        # OpenCV (BGR). Reverse the channel axis so array input matches that order.
        bgr_array = np.ascontiguousarray(rgb_array[:, :, ::-1])

        # 2. Run OCR
        result = ocr.ocr(bgr_array)

        # 3. Extract and combine the recognized text
        if result and result[0]:
            # Each entry's second element is indexed for its text: line[1][0].
            text_lines = [line[1][0] for line in result[0]]
            return " ".join(text_lines)
        return "No text detected in the image."

    except Exception as e:
        # Callers detect failures by inspecting the returned message.
        return f"An error occurred during OCR: {str(e)}"
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.115.0
2
+ uvicorn==0.30.6
3
+ pillow==10.4.0
4
+ numpy==1.26.4
5
+ paddleocr==2.8.1
6
+ llama-cpp-python==0.2.88
7
+ huggingface_hub==0.25.1
8
+ paddlepaddle==2.6.1
9
+ gradio
10
+ transformers