"""Deshonnati AI worker: PDF layout detection + multi-language OCR.

FastAPI service that accepts a PDF upload, detects layout regions on each
page with a LayoutParser/Detectron2 PubLayNet model, and runs Tesseract OCR
(Marathi + English + Hindi) on Text/Title regions. Designed for Hugging Face
Spaces, where /tmp is the only writable location and port 7860 is required.
"""

import io
import json
import os
import urllib.request

import cv2
import numpy as np
import pytesseract
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from pdf2image import convert_from_bytes
from PIL import Image

import layoutparser as lp

app = FastAPI()


def download_model():
    """Download model files to /tmp (the only writable area on HF).

    Fetches the PubLayNet faster_rcnn_R_50_FPN_3x config and weights from the
    Hugging Face hub into /tmp/models if they are not already present, so the
    module-level model initialization below can load them from local paths.
    """
    print("Checking model files in /tmp...")
    base_dir = "/tmp/models"
    model_dir = os.path.join(base_dir, "faster_rcnn_R_50_FPN_3x")
    os.makedirs(model_dir, exist_ok=True)

    # NOTE: the remote file is named config.yml but is saved locally as
    # config.yaml to match the path the model loader expects.
    files = {
        os.path.join(model_dir, "config.yaml"): "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
        os.path.join(model_dir, "model_final.pth"): "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth"
    }
    for path, url in files.items():
        if not os.path.exists(path):
            print(f"Downloading {path} from {url}...")
            urllib.request.urlretrieve(url, path)
    print("Model files ready.")


# Ensure models are downloaded before initialization (module import time,
# so the model is ready before the first request arrives).
download_model()

# Initialize LayoutParser with paths pointing to /tmp.
model = lp.Detectron2LayoutModel(
    config_path="/tmp/models/faster_rcnn_R_50_FPN_3x/config.yaml",
    model_path="/tmp/models/faster_rcnn_R_50_FPN_3x/model_final.pth",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
)


@app.get("/")
def home():
    """Health-check endpoint confirming the worker is up."""
    return {"message": "Deshonnati AI Worker is running"}


@app.post("/process")
async def process_pdf(file: UploadFile = File(...)):
    """Detect layout and OCR every page of an uploaded PDF.

    Returns ``{"success": True, "data": [...]}`` where each entry holds the
    1-based page number and a list of OCR'd Text/Title articles with their
    bounding boxes. Raises 400 if the PDF cannot be rasterized and 500 for
    any other processing failure.
    """
    try:
        # 1. Read PDF bytes
        pdf_bytes = await file.read()

        # 2. Convert PDF pages to images
        images = convert_from_bytes(pdf_bytes, dpi=200)
        if not images:
            raise HTTPException(status_code=400, detail="Could not convert PDF to images")

        results = []
        for i, image in enumerate(images):
            # 3. Convert PIL image to CV2 format for LayoutParser
            open_cv_image = np.array(image)
            open_cv_image = open_cv_image[:, :, ::-1].copy()  # RGB to BGR

            # 4. Detect layout regions on the page
            layout = model.detect(open_cv_image)

            page_articles = []
            # 5. Process each detected block; only Text/Title carry prose
            for block in layout:
                if block.type in ['Text', 'Title']:
                    # Coordinates come back as NumPy scalars; cast to plain
                    # floats so the JSON response serializes cleanly.
                    x0, y0, x1, y1 = (float(c) for c in block.coordinates)

                    # Crop the region for better OCR accuracy
                    cropped_img = image.crop((x0, y0, x1, y1))

                    # 6. Run OCR (Marathi + English + Hindi)
                    text = pytesseract.image_to_string(cropped_img, lang='mar+eng+hin')

                    page_articles.append({
                        "id": f"art_{i}_{len(page_articles)}",
                        "type": block.type,
                        "bbox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
                        "text": text.strip()
                    })

            results.append({
                "page": i + 1,
                "articles": page_articles
            })

        return {"success": True, "data": results}

    except HTTPException:
        # Re-raise deliberate HTTP errors (e.g. the 400 above) unchanged;
        # without this clause the generic handler below would remap them
        # to a misleading 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e


if __name__ == "__main__":
    # HF Spaces requires port 7860
    uvicorn.run(app, host="0.0.0.0", port=7860)