# Hugging Face Space: dead031/ai-worker (status: Sleeping)
# File size: 3,815 Bytes

import os
import io
import json
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from pdf2image import convert_from_bytes
import layoutparser as lp
import numpy as np
import cv2
import pytesseract
from PIL import Image

# ASGI application object; the route decorators in this file register their
# handlers on it, and it is what uvicorn serves when the file is run directly.
app = FastAPI()

import urllib.request
import os

def download_model(base_dir="/tmp/models"):
    """Ensure the PubLayNet Faster R-CNN config and weights exist locally.

    Files live under ``<base_dir>/faster_rcnn_R_50_FPN_3x``. A file that is
    already present is left untouched; a missing one is fetched from the
    Hugging Face hub.

    Each download is written to a sibling ``.part`` file and renamed into
    place only once the transfer has finished. Writing straight to the final
    path would let an interrupted download leave a truncated file behind,
    which the existence check would then accept as complete on the next run.

    Args:
        base_dir: Directory that holds the model folder. Defaults to
            ``/tmp/models``, the location the rest of this file loads from.

    Raises:
        OSError: If the directory cannot be created or a download fails
            (``urllib.error.URLError`` is an ``OSError`` subclass).
    """
    print(f"Checking model files in {base_dir}...")
    model_dir = os.path.join(base_dir, "faster_rcnn_R_50_FPN_3x")
    os.makedirs(model_dir, exist_ok=True)

    # Local destination -> remote source. The remote config is named
    # "config.yml" but is stored locally as "config.yaml".
    files = {
        os.path.join(model_dir, "config.yaml"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
        os.path.join(model_dir, "model_final.pth"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth",
    }

    for path, url in files.items():
        if os.path.exists(path):
            continue

        print(f"Downloading {path} from {url}...")
        partial_path = path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Same-directory rename: the final path only ever appears with
            # the complete contents.
            os.replace(partial_path, path)
        finally:
            # After a successful rename the partial file is gone; on any
            # failure, remove the leftover so it cannot accumulate.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("Model files ready.")

# The model files must be on disk before the detector is constructed, so the
# download step runs first, at import time.
download_model()

# Locations of the downloaded files under /tmp.
_MODEL_CONFIG_PATH = "/tmp/models/faster_rcnn_R_50_FPN_3x/config.yaml"
_MODEL_WEIGHTS_PATH = "/tmp/models/faster_rcnn_R_50_FPN_3x/model_final.pth"

# Class index -> label name reported on each detected block.
_LABEL_MAP = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

# Detectron2 config override, passed as a flat [key, value] list.
_EXTRA_CONFIG = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5]

# Shared LayoutParser detector used by the request handlers.
model = lp.Detectron2LayoutModel(
    config_path=_MODEL_CONFIG_PATH,
    model_path=_MODEL_WEIGHTS_PATH,
    extra_config=_EXTRA_CONFIG,
    label_map=_LABEL_MAP,
)

@app.get("/")
def home():
    """Answer GET requests on the root path with a fixed status message."""
    status = {"message": "Deshonnati AI Worker is running"}
    return status

def _extract_page_articles(page_index, image):
    """Detect Text/Title blocks on one page image and OCR each of them.

    Args:
        page_index: Zero-based page number, used only to build article ids.
        image: Page image as returned by ``convert_from_bytes``
            (assumed to be an RGB PIL image -- TODO confirm for non-RGB PDFs).

    Returns:
        A list of dicts with keys ``id``, ``type``, ``bbox`` and ``text``,
        one per detected Text or Title block, in detection order.
    """
    # The detector is given a BGR array, so the channel axis of the RGB page
    # is reversed; .copy() makes the reversed view contiguous.
    bgr_page = np.array(image)[:, :, ::-1].copy()
    layout = model.detect(bgr_page)

    articles = []
    for block in layout:
        if block.type not in ("Text", "Title"):
            continue

        x0, y0, x1, y1 = block.coordinates

        # OCR only the block's region rather than the whole page.
        cropped_img = image.crop((x0, y0, x1, y1))
        text = pytesseract.image_to_string(cropped_img, lang='mar+eng+hin')

        articles.append({
            # The second index counts accepted blocks, not all detections.
            "id": f"art_{page_index}_{len(articles)}",
            "type": block.type,
            "bbox": {
                "x0": x0,
                "y0": y0,
                "x1": x1,
                "y1": y1
            },
            "text": text.strip()
        })
    return articles


@app.post("/process")
async def process_pdf(file: UploadFile = File(...)):
    """Convert an uploaded PDF to page images and OCR its text blocks.

    Every page of the PDF is rendered at 200 dpi, run through the layout
    detector, and each Text/Title block is OCR'd.

    Args:
        file: The uploaded PDF.

    Returns:
        ``{"success": True, "data": [...]}`` where ``data`` holds one entry
        per page: ``{"page": <1-based number>, "articles": [...]}``.

    Raises:
        HTTPException: 400 when the upload is empty or yields no page
            images; 500 (with the underlying error text as detail) for any
            other failure during conversion, detection or OCR.
    """
    # NOTE(review): detection and OCR are synchronous calls inside an async
    # handler; consider offloading them if concurrent requests matter.
    try:
        pdf_bytes = await file.read()
        if not pdf_bytes:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        images = convert_from_bytes(pdf_bytes, dpi=200)
        if not images:
            raise HTTPException(status_code=400, detail="Could not convert PDF to images")

        results = [
            {"page": i + 1, "articles": _extract_page_articles(i, image)}
            for i, image in enumerate(images)
        ]
    except HTTPException:
        # Deliberate client-error responses raised above must reach the
        # caller unchanged instead of being re-wrapped as a 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"success": True, "data": results}

if __name__ == "__main__":
    # Run directly: listen on all interfaces at 7860, the port HF Spaces
    # requires.
    uvicorn.run(app, port=7860, host="0.0.0.0")