Spaces:

credent007
/

easyocr-phi3

Paused

App Files Files Community

credent007 committed on Apr 9

Commit

24d4193

verified ·

1 Parent(s): 98298cd

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +47 -0
llm.py +47 -0
mainapp.py +51 -0
ocr_utils.py +65 -0
requirements.txt +15 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,47 @@

+# Use a slim Python image for a smaller footprint
+FROM python:3.11-slim
+# Set environment variables
+# PYTHONDONTWRITEBYTECODE: Prevents Python from writing .pyc files
+# PYTHONUNBUFFERED: Ensures logs are sent straight to terminal without buffering
+# PORT: Port passed to uvicorn in CMD (matches the EXPOSE instruction below)
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PORT=7860
+# Install system dependencies
+# NOTE(review): the libgl1/libglib2.0-0/libsm6/libxrender1/libxext6 set is the one
+# typically needed by opencv-python at import time — confirm before trimming.
+# libmupdf-dev is NOT installed here; add it only if advanced PyMuPDF features
+# require it, since the pip package usually bundles what it needs.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        build-essential \
+        libgl1 \
+        libglib2.0-0 \
+        libsm6 \
+        libxrender1 \
+        libxext6 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Set the working directory
+WORKDIR /app
+# Install Python dependencies first (leverages Docker layer caching)
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application code
+COPY . .
+# Create a non-root user for security (Best Practice)
+RUN useradd -m appuser && chown -R appuser /app
+USER appuser
+# Expose the designated port
+EXPOSE 7860
+# Run the application
+# "sh -c" is required so that ${PORT} is expanded at container start.
+# "exec" replaces the shell with uvicorn, so uvicorn receives stop signals
+# (SIGTERM / CTRL+C) directly instead of them being delivered to the shell.
+CMD ["sh", "-c", "exec uvicorn mainapp:app --host 0.0.0.0 --port ${PORT}"]

llm.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from functools import partial
+import asyncio
+# Hugging Face model id passed to both from_pretrained() calls below
+model_name = "microsoft/phi-3-mini-128k-instruct"
+# 8-bit quantization config
+# NOTE(review): bitsandbytes 8-bit loading presumably needs a CUDA GPU — confirm the target hardware
+quant_config = BitsAndBytesConfig(
+    load_in_8bit=True
+)
+# tokenizer (created at module level, so importing this module triggers the load)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+# model (also created at module level, using the 8-bit config defined above)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    quantization_config=quant_config
+)
+# Mock LLM function (replace with your actual LLM API call)
+async def call_llm(prompt: str):
+    # Simulate LLM call in executor
+    llm_function_with_args=partial(execute_llm(model,tokenizer,prompt))
+    loop = asyncio.get_event_loop()
+    result = await loop.run_in_executor(None, llm_function_with_args)
+    return result
+async def execute_llm(model,tokenizer,prompt:str):
+    prompt="what is json give an example "
+    data=""
+    full_prompt=prompt+" "+data
+    messages = [
+        {"role": "user", "content":full_prompt }
+    ]
+    inputs = tokenizer.apply_chat_template(
+        messages,
+        add_generation_prompt=True,
+        return_tensors="pt"
+    ).to(model.device)
+    outputs = model.generate(
+        **inputs,
+        max_new_tokens=10000
+    )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)

mainapp.py ADDED Viewed

	@@ -0,0 +1,51 @@

+from fastapi import FastAPI, UploadFile, File, HTTPException
+from fastapi.responses import JSONResponse
+from ocr_utils import process_pdf_page,ocr_image
+import fitz  # PyMuPDF
+import numpy as np
+from PIL import Image
+from functools import partial
+import io
+import asyncio
+from llm import tokenizer , model ,call_llm
+app = FastAPI()
+ # Initialize once
+@app.get("/")
+def home ():
+    return JSONResponse(content={'message':'home page'})
+@app.post("/ocr-llm")
+async def ocr_llm_endpoint(file: UploadFile = File(...)):
+    if not file.filename.lower().endswith((".pdf", ".png", ".jpg", ".jpeg")):
+        raise HTTPException(status_code=400, detail="File must be PDF or image")
+    try:
+        results = []
+        if file.filename.lower().endswith(".pdf"):
+            file_bytes = await file.read()
+            doc = fitz.open(stream=file_bytes, filetype="pdf")
+            # Process each page sequentially or concurrently
+            tasks = [process_pdf_page(page) for page in doc]
+            results = await asyncio.gather(*tasks)
+        else:
+            # Single image file
+            image_bytes = await file.read()
+            image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+            img_array = np.array(image)
+            ocr_result = await ocr_image(img_array)
+            print(print('output in main app'),ocr_result,type(ocr_result))
+            ocr_text = ",".join(ocr_result)
+            # llm_result = await call_llm(ocr_text)
+            results.append({"page": 1, "ocr": str(ocr_text), "llm": "llm_result"})
+        return JSONResponse(content={"results": results})
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))

ocr_utils.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import easyocr
+import asyncio
+import numpy as np
+# Initialize reader once at module level so every OCR call reuses this instance.
+# It is built for the language codes 'hi' and 'en' with gpu=False.
+reader = easyocr.Reader(['hi', 'en'], gpu=False)
+# Trace printed right after the reader has been constructed
+print('instance of reader ocr is created ')
+def process_ocr_output(results):
+    """
+    Converts raw EasyOCR list into a list of dictionaries.
+    """
+    print('andara ayaa ')
+    invoice_data = []
+    for bbox, text, conf in results:
+        # bbox comes as [[x,y], [x,y], [x,y], [x,y]]
+        # We convert to list for JSON serializability
+        invoice_data.append(str({
+            "bbox": [[int(pt[0]) , int(pt[1])] for pt in bbox],
+            "text": text,
+            "confidence": float(conf)
+        }))
+    print('yaah pr')
+    return invoice_data
+async def ocr_image(image: np.ndarray):
+    """
+    Runs OCR in a thread pool to avoid blocking the FastAPI event loop.
+    """
+    loop = asyncio.get_event_loop()
+    # EasyOCR's readtext is CPU bound, so we run in executor
+    results = await loop.run_in_executor(None, reader.readtext, image)
+    results=process_ocr_output(results)
+    print(results)
+    return results
+async def process_pdf_page(page):
+    """
+    Converts PDF page to image and processes OCR.
+    """
+    pix = page.get_pixmap()
+    # Convert PyMuPDF pixmap to numpy array
+    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.height, pix.width, pix.n)
+    if pix.n == 4:  # Convert RGBA to RGB
+        img = img[:, :, :3]
+    # Get raw results
+    raw_results = await ocr_image(img)
+    # 1. Create the clean string for the LLM
+    full_text = " ".join([res[1] for res in raw_results])
+    # 2. Create the detailed JSON structure for the response
+    structured_ocr = process_ocr_output(raw_results)
+    # Optional: If you want to call LLM here
+    # llm_result = await call_llm(full_text)
+    return {
+        "page_number": page.number + 1,
+        "ocr_details": structured_ocr,
+        "raw_text": full_text,
+        "llm_analysis": "llm_result_placeholder"
+    }

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+fastapi
+uvicorn[standard]
+git+https://github.com/huggingface/transformers
+accelerate
+bitsandbytes
+sentencepiece
+torch
+numpy
+matplotlib
+easyocr
+opencv-python
+# NOTE(review): "accelerator" looks like a mistaken duplicate of "accelerate" (listed above); it is a separate package name — confirm it is needed
+accelerator
+PyMuPDF
+pillow
+python-multipart