File size: 3,815 Bytes
1e7ca15
 
 
 
 
 
 
 
 
 
 
 
 
 
c2f6371
 
 
 
d965f39
 
 
c2f6371
 
 
 
 
7b148a9
c2f6371
 
1a948a0
39315ff
c2f6371
 
 
 
 
 
 
 
 
d965f39
1e7ca15
d965f39
 
1e7ca15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import os
import io
import json
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from pdf2image import convert_from_bytes
import layoutparser as lp
import numpy as np
import cv2
import pytesseract
from PIL import Image

# ASGI application object; the @app.get / @app.post decorators further down
# register their handlers on it, and uvicorn serves it in the __main__ guard.
app = FastAPI()

import urllib.request
import os

def download_model(base_dir="/tmp/models"):
    """Ensure the layout model's config and weights exist on local disk.

    Files that are already present are left untouched. A missing file is
    first downloaded to a ``.part`` sibling and only renamed to its final
    name once the transfer has completed, so an interrupted download can
    never leave behind a truncated file that a later run would mistake for
    a complete one.

    Args:
        base_dir: Directory under which the ``faster_rcnn_R_50_FPN_3x``
            folder is created. Defaults to ``/tmp/models`` (/tmp being the
            only writable area on HF).

    Raises:
        OSError: If the directory or a file cannot be written, or if a
            download fails (``urllib.error.URLError`` is an ``OSError``).
    """
    print(f"Checking model files in {base_dir}...")
    model_dir = os.path.join(base_dir, "faster_rcnn_R_50_FPN_3x")
    os.makedirs(model_dir, exist_ok=True)

    # Local path -> remote URL. Note the remote config is named "config.yml"
    # while it is stored locally as "config.yaml".
    files = {
        os.path.join(model_dir, "config.yaml"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
        os.path.join(model_dir, "model_final.pth"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth",
    }

    for path, url in files.items():
        if os.path.exists(path):
            continue
        print(f"Downloading {path} from {url}...")
        partial_path = f"{path}.part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            # Rename only after a complete transfer; os.replace is atomic
            # when source and destination are on the same filesystem.
            os.replace(partial_path, path)
        finally:
            # After a successful rename nothing is left to remove; after a
            # failure this discards the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    print("Model files ready.")

# The config and weights have to be on disk before the detector is built,
# so fetch them at import time.
download_model()

# Layout detector loaded from the files stored under /tmp; it is shared by
# every request handled by this process.
model = lp.Detectron2LayoutModel(
    model_path="/tmp/models/faster_rcnn_R_50_FPN_3x/model_final.pth",
    config_path="/tmp/models/faster_rcnn_R_50_FPN_3x/config.yaml",
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
)

@app.get("/")
def home():
    """Return a fixed status message for requests to the root path."""
    status = {"message": "Deshonnati AI Worker is running"}
    return status

def _extract_page_articles(page_index, image):
    """Detect Text/Title blocks on one page image and OCR each of them.

    Args:
        page_index: Zero-based page number; used only to build article ids.
        image: PIL image of the page, as returned by ``convert_from_bytes``.

    Returns:
        A list of dicts with ``id``, ``type``, ``bbox`` and ``text`` keys,
        in the order the layout model returned the blocks.
    """
    # Convert the PIL image to a CV2-style array for LayoutParser
    # (channel order reversed: RGB to BGR).
    bgr_page = np.array(image)[:, :, ::-1].copy()
    layout = model.detect(bgr_page)

    articles = []
    for block in layout:
        # Lists, tables and figures are not OCR'd.
        if block.type not in ("Text", "Title"):
            continue

        x0, y0, x1, y1 = block.coordinates

        # OCR only the detected region rather than the whole page.
        cropped_img = image.crop((x0, y0, x1, y1))
        text = pytesseract.image_to_string(cropped_img, lang="mar+eng+hin")

        articles.append({
            # The running count of kept blocks gives per-page unique ids.
            "id": f"art_{page_index}_{len(articles)}",
            "type": block.type,
            "bbox": {"x0": x0, "y0": y0, "x1": x1, "y1": y1},
            "text": text.strip(),
        })
    return articles


@app.post("/process")
async def process_pdf(file: UploadFile = File(...)):
    """Run layout detection and OCR over every page of an uploaded PDF.

    Args:
        file: The uploaded PDF document.

    Returns:
        ``{"success": True, "data": [...]}`` where each entry of ``data`` is
        ``{"page": <1-based page number>, "articles": [...]}``.

    Raises:
        HTTPException: 400 if the upload is empty or yields no page images;
            500 for any other failure, with the error text as the detail.
    """
    # NOTE(review): detection and OCR are synchronous calls inside an async
    # handler, so they appear to run on the event loop - confirm whether
    # that is acceptable for the expected load.
    try:
        # 1. Read PDF bytes
        pdf_bytes = await file.read()
        if not pdf_bytes:
            raise HTTPException(status_code=400, detail="Uploaded file is empty")

        # 2. Convert every page of the PDF to an image
        images = convert_from_bytes(pdf_bytes, dpi=200)
        if not images:
            raise HTTPException(status_code=400, detail="Could not convert PDF to images")

        # 3. Detect layout and OCR each page
        results = [
            {"page": i + 1, "articles": _extract_page_articles(i, image)}
            for i, image in enumerate(images)
        ]

        return {"success": True, "data": results}

    except HTTPException:
        # Client errors raised above must keep their own status code rather
        # than being rewrapped as a 500 by the generic handler below.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

if __name__ == "__main__":
    # Listen on every interface; HF Spaces requires port 7860.
    server_options = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **server_options)