Spaces:

ayushsoni155
/

Invoice_IMG_To_JSON

Sleeping

App Files Files Community

Ayush soni committed on Sep 26, 2025

Commit

6034171

1 Parent(s): 342a0c3

Add application file

Browse files

Files changed (5) hide show

app.py +43 -0
llm_processor.py +96 -0
main.py +65 -0
ocr_processor.py +31 -0
requirements.txt +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import gradio as gr
+from ocr_processor import extract_text_from_image
+from llm_processor import load_llm_model, generate_json_from_text
+# Load LLM model on startup
+load_llm_model()
+def process_invoice(file):
+    # Read file bytes
+    image_bytes = file.read()
+    # Step 1: Extract raw text
+    raw_text = extract_text_from_image(image_bytes)
+    if not raw_text or "No text detected" in raw_text:
+        return raw_text, {"error": "No text could be extracted from the image."}
+    # Step 2: Convert raw text → structured JSON
+    json_data = generate_json_from_text(raw_text)
+    return raw_text, json_data
+### Gradio UI
+with gr.Blocks() as demo:
+    gr.Markdown("# 🧾 Invoice Processing App")
+    gr.Markdown("Upload an invoice image. The app extracts **OCR text** and generates **structured JSON**.")
+    with gr.Row():
+        input_file = gr.File(label="Upload Invoice Image", type="file", file_types=[".png", ".jpg", ".jpeg"])
+    with gr.Row():
+        raw_text_output = gr.Textbox(label="Extracted OCR Text", lines=10)
+        json_output = gr.JSON(label="Structured JSON")
+    process_btn = gr.Button("Process Invoice")
+    process_btn.click(
+        process_invoice,
+        inputs=input_file,
+        outputs=[raw_text_output, json_output]
+    )
+demo.launch()

llm_processor.py ADDED Viewed

	@@ -0,0 +1,96 @@

+# File: llm_processor.py
+import os
+import json
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
+# Model Configuration
+# Hugging Face repository id and GGUF file name handed to hf_hub_download
+# by load_llm_model().
+MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
+MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"
+# Module-level handle to the loaded Llama instance; remains None until
+# load_llm_model() succeeds (and is reset to None if loading fails).
+llm = None
+def load_llm_model():
+    """Downloads and loads the GGUF model from Hugging Face."""
+    global llm
+    try:
+        hf_token = os.getenv("HF_TOKEN")
+        if not hf_token:
+            raise EnvironmentError("HF_TOKEN environment variable not found.")
+        print(f"Downloading model {MODEL_FILE}...")
+        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE, token=hf_token)
+        print("Loading GGUF model...")
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=2048,
+            n_threads=2,
+            n_gpu_layers=0,
+            verbose=False
+        )
+        print("GGUF model loaded successfully.")
+    except Exception as e:
+        print(f"Fatal error loading LLM: {e}")
+        llm = None
+def generate_json_from_text(ocr_text: str) -> dict:
+    """
+    Takes raw OCR text and uses the LLM to convert it into a structured JSON object.
+    """
+    if not llm:
+        raise RuntimeError("LLM is not available.")
+    prompt = f"""You are an expert invoice parsing AI. Convert the OCR text below into a structured JSON object based on the provided schema. Follow these rules strictly:
+- Output ONLY the JSON object, with no additional text, markdown, or backticks.
+- Interpret OCR errors logically and correct them without confusion (e.g., '3il1' as 'Bill', 'DoSa' as 'Dosa', 'Cofee' as 'Coffee', 'BisiBeleBATH' as 'Bisibelebath', 'Masala-Dosa*' as 'Masala Dosa', 'ONION*DoSa' as 'Onion Dosa' – treat * or other artifacts as typos, not synonyms).
+- Extract invoice_number from patterns like 'Bill #:128998' or similar; use null if missing.
+- Format invoice_date as DD-MM-YYYY; infer full year if abbreviated (e.g., '17/02/19' as '17-02-2019' based on context).
+- Seller is the business name/address at the top (e.g., 'SHANTHI HOTEL CATERERS'); invoice_to is only a clear buyer name if present, else null (do not confuse with seller's address).
+- For items, parse lines matching 'Item Qty Rate Value' pattern; extract description (normalized), quantity (integer), rate (float), total (float). Ignore tax or total lines in items.
+- Sum all tax amounts (e.g., CGT 13.94 + SGT 13.94 = 27.88) for tax_amount.
+- Use 'Net Amount' or similar as grand_total; calculate subtotal as grand_total minus tax_amount if not explicit.
+- Be precise and fast – focus only on relevant data.
+**JSON Schema:**
+{{
+  "invoice_number": "string or null",
+  "invoice_date": "DD-MM-YYYY or null",
+  "seller": "string or null",
+  "invoice_to": "string or null",
+  "items": [
+    {{ "description": "string", "quantity": "integer or null", "rate": "float or null", "total": "float or null" }}
+  ],
+  "subtotal": "float or null",
+  "tax_amount": "float or null",
+  "grand_total": "float or null"
+}}
+**OCR Text:**
+{ocr_text}
+"""
+    output = llm(
+        prompt,
+        max_tokens=1024,  # Increased for longer JSON
+        temperature=0.5,  # Slightly higher for better reasoning
+        top_p=0.9,
+        stop=["<|endoftext|>", "</s>"],
+        echo=False
+    )
+    generated_text = output["choices"][0]["text"].strip()
+    try:
+        start_idx = generated_text.find("{")
+        end_idx = generated_text.rfind("}") + 1
+        if start_idx != -1 and end_idx != -1:
+            json_str = generated_text[start_idx:end_idx]
+            json_data = json.loads(json_str)
+            return json_data
+        else:
+            raise json.JSONDecodeError("No JSON object found.", generated_text, 0)
+    except json.JSONDecodeError:
+        # Fallback: Return structured error with cleaned OCR text
+        return {
+            "error": "LLM failed to generate valid JSON.",
+            "raw_output": generated_text,
+            "cleaned_ocr_text": ocr_text
+        }

main.py ADDED Viewed

	@@ -0,0 +1,65 @@

+# File: main.py
+import os
+from fastapi import FastAPI, File, UploadFile, HTTPException
+from fastapi.responses import JSONResponse
+import uvicorn
+from llm_processor import load_llm_model, generate_json_from_text
+from ocr_processor import extract_text_from_image
+# Set environment variables for performance
+os.environ["OMP_NUM_THREADS"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+# Create the FastAPI app
+app = FastAPI(
+    title="Invoice Processing API",
+    description="A single endpoint to process an invoice image and return both raw text and structured JSON."
+)
+@app.on_event("startup")
+def startup_event():
+    """Load models once when the server starts."""
+    load_llm_model()
+@app.get("/", summary="Health Check")
+def read_root():
+    """A simple endpoint to check if the API is running."""
+    return {"status": "API is running"}
+@app.post("/process_invoice/", summary="Process Invoice to Text & JSON")
+async def process_invoice_endpoint(file: UploadFile = File(...)):
+    """
+    Accepts an image file and returns both the extracted OCR text and the structured JSON data.
+    """
+    # Validate file type
+    if not file.content_type.startswith("image/"):
+        raise HTTPException(status_code=400, detail="Only image files are supported (e.g., PNG, JPEG).")
+    try:
+        image_bytes = await file.read()
+        # Step 1: Extract text from the image using the OCR processor
+        raw_text = extract_text_from_image(image_bytes)
+        if not raw_text or "No text detected" in raw_text:
+            return JSONResponse(content={
+                "extracted_text": raw_text,
+                "structured_json": {"error": "No text could be extracted from the image."}
+            })
+        # Step 2: Generate structured JSON from the extracted text
+        json_data = generate_json_from_text(raw_text)
+        # Step 3: Combine both results into a single response
+        combined_response = {
+            "extracted_text": raw_text,
+            "structured_json": json_data
+        }
+        return JSONResponse(content=combined_response)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"An error occurred: {str(e)}")
+if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=False)  # Disable reload for production

ocr_processor.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# File: ocr_processor.py
+import numpy as np
+from paddleocr import PaddleOCR
+from PIL import Image
+import io
+# Initialize PaddleOCR once at import time; extract_text_from_image reuses
+# this module-level instance for every call. Configured for English text
+# with the use_angle_cls option enabled.
+ocr = PaddleOCR(lang='en', use_angle_cls=True)
+def extract_text_from_image(image_bytes: bytes) -> str:
+    """
+    Performs OCR on a given image using PaddleOCR.
+    """
+    try:
+        # 1. Convert bytes to PIL Image
+        img = Image.open(io.BytesIO(image_bytes))
+        img = img.convert("RGB")
+        img_array = np.array(img)
+        # 2. Run OCR
+        result = ocr.ocr(img_array)
+        # 3. Extract and combine the recognized text
+        if result and result[0]:
+            text_lines = [line[1][0] for line in result[0]]
+            return " ".join(text_lines)
+        else:
+            return "No text detected in the image."
+    except Exception as e:
+        return f"An error occurred during OCR: {str(e)}"

requirements.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+fastapi==0.115.0
+uvicorn==0.30.6
+pillow==10.4.0
+numpy==1.26.4
+paddleocr==2.8.1
+llama-cpp-python==0.2.88
+huggingface_hub==0.25.1
+paddlepaddle==2.6.1
+gradio
+transformers