credent007 committed on
Commit
24d4193
·
verified ·
1 Parent(s): 98298cd

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +47 -0
  2. llm.py +47 -0
  3. mainapp.py +51 -0
  4. ocr_utils.py +65 -0
  5. requirements.txt +15 -0
Dockerfile ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use a slim Python image for a smaller footprint
FROM python:3.11-slim

# Set environment variables
# PYTHONDONTWRITEBYTECODE: prevents Python from writing .pyc files
# PYTHONUNBUFFERED: sends logs straight to the terminal without buffering
# PORT: port uvicorn binds to (read by the CMD below)
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PORT=7860

# Install system dependencies
#   git             - lets pip install the git+https entry in requirements.txt
#   build-essential - compiler toolchain for packages that ship no wheel
#   libgl1, libglib2.0-0, libsm6, libxrender1, libxext6
#                   - shared libraries commonly required by opencv-python
# libmupdf-dev is intentionally not installed: the PyMuPDF pip package
# usually bundles what it needs.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        git \
        build-essential \
        libgl1 \
        libglib2.0-0 \
        libsm6 \
        libxrender1 \
        libxext6 \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app

# Install Python dependencies first (leverages Docker layer caching)
COPY requirements.txt .
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application code
COPY . .

# Create a non-root user for security (best practice)
RUN useradd -m appuser && chown -R appuser /app
USER appuser

# Expose the designated port
EXPOSE 7860

# Run the application.
# A shell is needed to expand ${PORT}; `exec` then replaces that shell with
# uvicorn so it runs as PID 1 and receives SIGTERM / CTRL+C directly.
CMD ["sh", "-c", "exec uvicorn mainapp:app --host 0.0.0.0 --port ${PORT}"]
llm.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
3
+ from functools import partial
4
+ import asyncio
5
# Hugging Face model id used for both the tokenizer and the model below.
model_name = "microsoft/phi-3-mini-128k-instruct"

# 8-bit quantization config, passed to from_pretrained() further down.
# NOTE(review): bitsandbytes 8-bit loading presumably needs a CUDA GPU —
# confirm this matches the deployment image.
quant_config = BitsAndBytesConfig(
    load_in_8bit=True
)

# tokenizer — created once at import time and shared by every request
tokenizer = AutoTokenizer.from_pretrained(model_name)

# model — also created at import time, so importing this module triggers
# the full model load; device_map="auto" leaves device placement to the library
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config
)
21
async def call_llm(prompt: str):
    """Generate a completion for ``prompt`` without blocking the event loop.

    The blocking ``execute_llm`` call is handed to the event loop's default
    executor, using the module-level ``model`` and ``tokenizer``.

    Args:
        prompt: Text sent to the model as a single user message.

    Returns:
        The decoded text produced by ``execute_llm``.
    """
    # partial() must receive the callable and its arguments separately;
    # calling execute_llm(...) here would run it on the event loop instead.
    generate = partial(execute_llm, model, tokenizer, prompt)
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, generate)


def execute_llm(model, tokenizer, prompt: str, max_new_tokens: int = 10000):
    """Run one blocking chat-style generation and return the new text.

    This is a plain (non-async) function because it is executed inside an
    executor thread, where a coroutine object would never be awaited.

    Args:
        model: Causal LM exposing ``device`` and ``generate``.
        tokenizer: Tokenizer exposing ``apply_chat_template`` and ``decode``.
        prompt: User message content.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The generated continuation decoded with special tokens skipped. The
        prompt tokens echoed back by ``generate`` are not included.
    """
    messages = [{"role": "user", "content": prompt}]

    # return_dict=True yields a mapping (input_ids, attention_mask, ...) so it
    # can be unpacked with ** into generate(); a bare tensor cannot.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
mainapp.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.responses import JSONResponse
3
+ from ocr_utils import process_pdf_page,ocr_image
4
+ import fitz # PyMuPDF
5
+ import numpy as np
6
+ from PIL import Image
7
+ from functools import partial
8
+ import io
9
+ import asyncio
10
+ from llm import tokenizer , model ,call_llm
11
+ app = FastAPI()
12
+ # Initialize once
13
+
14
+
15
+
16
+
17
+
18
@app.get("/")
def home():
    """Landing route: answer with a fixed JSON greeting."""
    greeting = {'message': 'home page'}
    return JSONResponse(content=greeting)
21
@app.post("/ocr-llm")
async def ocr_llm_endpoint(file: UploadFile = File(...)):
    """Run OCR on an uploaded PDF or image and return the results as JSON.

    PDFs are processed page by page through ``process_pdf_page``; a single
    image is processed through ``ocr_image`` and reported as page 1.

    Args:
        file: Uploaded file whose name must end in .pdf, .png, .jpg or .jpeg.

    Returns:
        A ``JSONResponse`` of the form ``{"results": [...]}``.

    Raises:
        HTTPException: 400 when the file name is missing or has an
            unsupported extension; 500 when reading or OCR fails.
    """
    # Uploads may arrive without a file name; treat that as unsupported
    # instead of failing with AttributeError on None.
    filename = (file.filename or "").lower()
    if not filename.endswith((".pdf", ".png", ".jpg", ".jpeg")):
        raise HTTPException(status_code=400, detail="File must be PDF or image")

    try:
        payload = await file.read()

        if filename.endswith(".pdf"):
            # The context manager closes the document even when OCR raises.
            with fitz.open(stream=payload, filetype="pdf") as doc:
                results = await asyncio.gather(
                    *(process_pdf_page(page) for page in doc)
                )
        else:
            # Single image file
            image = Image.open(io.BytesIO(payload)).convert("RGB")
            ocr_result = await ocr_image(np.array(image))
            ocr_text = ",".join(ocr_result)
            # TODO: replace the "llm_result" placeholder with
            # `await call_llm(ocr_text)` once the LLM path is enabled.
            results = [{"page": 1, "ocr": ocr_text, "llm": "llm_result"}]

        return JSONResponse(content={"results": results})

    except HTTPException:
        # Let deliberate HTTP errors through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
ocr_utils.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyocr
2
+ import asyncio
3
+ import numpy as np
4
+
5
# Initialize reader once at module level so every request reuses the same
# EasyOCR instance; language codes 'hi' and 'en' select Hindi and English,
# and gpu=False keeps inference on the CPU.
reader = easyocr.Reader(['hi', 'en'], gpu=False)
# Runs at import time, right after the reader has been constructed.
print('instance of reader ocr is created ')
8
def process_ocr_output(results):
    """Convert raw EasyOCR detections into a list of strings.

    Each detection becomes the ``str()`` of a dict with the keys ``bbox``,
    ``text`` and ``confidence``. Entries are strings (not dicts) because
    callers join them into a single text blob.

    Args:
        results: Iterable of ``(bbox, text, conf)`` triples, where ``bbox``
            is a sequence of ``[x, y]`` points.

    Returns:
        One string per detection, in input order; empty list for no input.
    """
    return [
        str({
            # Coordinates are cast to plain ints so the values are
            # JSON-friendly builtins.
            "bbox": [[int(pt[0]), int(pt[1])] for pt in bbox],
            "text": text,
            "confidence": float(conf),
        })
        for bbox, text, conf in results
    ]
24
+
25
async def ocr_image(image: np.ndarray):
    """Run EasyOCR on ``image`` without blocking the event loop.

    ``reader.readtext`` is executed in the loop's default executor and its
    detections are passed through ``process_ocr_output``.

    Args:
        image: Image as a numpy array.

    Returns:
        The list produced by ``process_ocr_output`` for this image.
    """
    loop = asyncio.get_running_loop()
    # EasyOCR's readtext is CPU bound, so we run it in the executor.
    detections = await loop.run_in_executor(None, reader.readtext, image)
    return process_ocr_output(detections)
36
+
37
async def process_pdf_page(page):
    """Render one PDF page to an image and run OCR on it.

    Args:
        page: PyMuPDF page object (provides ``get_pixmap`` and ``number``).

    Returns:
        Dict with ``page_number`` (1-based), ``ocr_details`` (output of
        ``process_ocr_output``), ``raw_text`` (detected texts joined by
        spaces) and a placeholder ``llm_analysis`` string.
    """
    pix = page.get_pixmap()
    # Convert PyMuPDF pixmap to numpy array
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        pix.height, pix.width, pix.n
    )

    if pix.n == 4:  # Convert RGBA to RGB
        img = img[:, :, :3]

    # Call the reader directly: ocr_image() returns already-stringified
    # entries, which can neither be indexed as (bbox, text, conf) triples
    # nor passed through process_ocr_output a second time.
    loop = asyncio.get_running_loop()
    raw_results = await loop.run_in_executor(None, reader.readtext, img)

    # 1. Create the clean string for the LLM
    full_text = " ".join(text for _bbox, text, _conf in raw_results)

    # 2. Create the detailed structure for the response
    structured_ocr = process_ocr_output(raw_results)

    # TODO: replace the placeholder below with `await call_llm(full_text)`
    # once the LLM path is enabled.
    return {
        "page_number": page.number + 1,
        "ocr_details": structured_ocr,
        "raw_text": full_text,
        "llm_analysis": "llm_result_placeholder",
    }
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
fastapi
uvicorn[standard]
git+https://github.com/huggingface/transformers
accelerate
bitsandbytes
sentencepiece
torch
numpy
matplotlib
easyocr
opencv-python
# `accelerator` dropped: it is an unrelated PyPI package that nothing in
# this commit imports; the Hugging Face library is `accelerate` (listed above).
PyMuPDF
pillow
python-multipart