Spaces:
Paused
Paused
| import easyocr | |
| import asyncio | |
| import numpy as np | |
# Build the EasyOCR reader once, at import time, so every OCR call in this
# module reuses the same instance instead of constructing a new one per call.
# It is configured with the 'hi' and 'en' language codes and gpu=False.
reader = easyocr.Reader(['hi', 'en'], gpu=False)
# Import-time trace confirming that the reader above was constructed.
print('instance of reader ocr is created ')
def process_ocr_output(results):
    """
    Turn raw OCR detections into a list of stringified records.

    Args:
        results: Iterable of ``(bbox, text, conf)`` triples, where ``bbox``
            is a sequence of points indexable as ``pt[0]`` / ``pt[1]``.

    Returns:
        A list with one ``str`` per detection. Each string is ``str()`` of a
        dict with the keys ``"bbox"`` (points truncated to ``int`` pairs),
        ``"text"`` and ``"confidence"`` (as ``float``).

    NOTE(review): the entries are Python ``repr``-style strings, not dicts
    and not JSON documents -- confirm that this is what callers expect.
    """
    print('andara ayaa ')
    records = []
    for detection in results:
        corners, content, score = detection
        # Coerce every coordinate to a builtin int so that the record holds
        # plain Python numbers only.
        int_corners = [[int(pt[0]), int(pt[1])] for pt in corners]
        record = {
            "bbox": int_corners,
            "text": content,
            "confidence": float(score),
        }
        records.append(str(record))
        print('yaah pr')
    return records
async def ocr_image(image: np.ndarray):
    """
    Run OCR on an image without blocking the event loop.

    The blocking ``reader.readtext`` call is handed to the running loop's
    default executor; its detections are then passed through
    ``process_ocr_output``.

    Args:
        image: Image array forwarded unchanged to ``reader.readtext``.

    Returns:
        Whatever ``process_ocr_output`` returns for the detections.
    """
    # Inside a coroutine, get_running_loop() is the preferred way to obtain
    # the loop; get_event_loop() has more complex, policy-dependent behaviour.
    loop = asyncio.get_running_loop()
    # readtext is a blocking, CPU-heavy call, so it runs in the executor
    # (None selects the loop's default executor).
    detections = await loop.run_in_executor(None, reader.readtext, image)
    results = process_ocr_output(detections)
    print(results)
    return results
async def process_pdf_page(page):
    """
    Render a PDF page to an image, run OCR on it and package the result.

    Args:
        page: Page object exposing ``get_pixmap()`` and ``number``
            (presumably a PyMuPDF page -- confirm against the caller).

    Returns:
        A dict with the keys:
            ``"page_number"``: ``page.number + 1``.
            ``"ocr_details"``: output of ``process_ocr_output`` for the page.
            ``"raw_text"``: the recognised text fragments joined by spaces.
            ``"llm_analysis"``: a fixed placeholder string.
    """
    pix = page.get_pixmap()

    # View the pixmap's sample buffer as a (height, width, channels) array.
    img = np.frombuffer(pix.samples, dtype=np.uint8).reshape(
        pix.height, pix.width, pix.n
    )
    if pix.n == 4:
        # Four channels: keep only the first three (drops what the original
        # author treats as the alpha channel).
        img = img[:, :, :3]

    # Run the reader directly instead of going through ocr_image():
    # ocr_image() returns entries that were already post-processed by
    # process_ocr_output, so indexing them with [1] or feeding them to
    # process_ocr_output a second time does not operate on the
    # (bbox, text, conf) detections this function needs.
    loop = asyncio.get_running_loop()
    raw_results = await loop.run_in_executor(None, reader.readtext, img)

    # 1. Plain text for the LLM: the text field of each detection.
    full_text = " ".join(res[1] for res in raw_results)

    # 2. Per-detection details for the response.
    structured_ocr = process_ocr_output(raw_results)

    # Optional: If you want to call LLM here
    # llm_result = await call_llm(full_text)
    return {
        "page_number": page.number + 1,
        "ocr_details": structured_ocr,
        "raw_text": full_text,
        "llm_analysis": "llm_result_placeholder",
    }