# ai-worker / app.py
import os
import io
import json
import uvicorn
from fastapi import FastAPI, UploadFile, File, HTTPException
from pdf2image import convert_from_bytes
import layoutparser as lp
import numpy as np
import cv2
import pytesseract
from PIL import Image
app = FastAPI()
import urllib.request
import os
def download_model():
    """Download the layout-model config and weights into /tmp.

    /tmp is the only writable area on Hugging Face Spaces, so the files
    are fetched there on first startup and reused on later restarts.

    Raises:
        Any exception from ``urllib.request.urlretrieve`` (network or
        HTTP failure) after cleaning up the partial download.
    """
    print("Checking model files in /tmp...")
    base_dir = "/tmp/models"
    model_dir = os.path.join(base_dir, "faster_rcnn_R_50_FPN_3x")
    os.makedirs(model_dir, exist_ok=True)
    files = {
        os.path.join(model_dir, "config.yaml"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/config.yml",
        os.path.join(model_dir, "model_final.pth"):
            "https://huggingface.co/layoutparser/detectron2/resolve/main/PubLayNet/faster_rcnn_R_50_FPN_3x/model_final.pth",
    }
    for path, url in files.items():
        if os.path.exists(path):
            continue
        print(f"Downloading {path} from {url}...")
        # Download to a temporary name and move it into place atomically.
        # Previously a failed/partial download left a truncated file that
        # os.path.exists() mistook for a complete model on the next start.
        tmp_path = path + ".part"
        try:
            urllib.request.urlretrieve(url, tmp_path)
            os.replace(tmp_path, path)
        except Exception:
            # Remove the partial file before propagating the error.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
    print("Model files ready.")
# Ensure models are downloaded before initialization
download_model()
# Initialize LayoutParser with paths pointing to /tmp.
# NOTE(review): label_map appears to follow the PubLayNet class indices
# (Text/Title/List/Table/Figure) — confirm against the checkpoint's
# training config. The 0.5 score threshold discards low-confidence
# detections at inference time.
model = lp.Detectron2LayoutModel(
    config_path="/tmp/models/faster_rcnn_R_50_FPN_3x/config.yaml",
    model_path="/tmp/models/faster_rcnn_R_50_FPN_3x/model_final.pth",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}
)
@app.get("/")
def home():
    """Health-check endpoint confirming the worker is alive."""
    status_payload = {"message": "Deshonnati AI Worker is running"}
    return status_payload
@app.post("/process")
async def process_pdf(file: UploadFile = File(...)):
    """Convert an uploaded PDF into per-page, OCR'd layout articles.

    Pipeline: PDF bytes -> page images (200 DPI) -> layout detection ->
    per-block OCR for Text/Title regions.

    Returns:
        {"success": True, "data": [{"page": n, "articles": [...]}, ...]}

    Raises:
        HTTPException 400: the PDF could not be rasterized to images.
        HTTPException 500: any other processing failure.
    """
    try:
        # 1. Read raw PDF bytes from the upload.
        pdf_bytes = await file.read()
        # 2. Rasterize every page.
        images = convert_from_bytes(pdf_bytes, dpi=200)
        if not images:
            raise HTTPException(status_code=400, detail="Could not convert PDF to images")
        results = []
        for i, image in enumerate(images):
            # 3. PIL image (RGB) -> OpenCV-style array (BGR) for LayoutParser.
            open_cv_image = np.array(image)
            open_cv_image = open_cv_image[:, :, ::-1].copy()
            # 4. Detect layout blocks on the page.
            layout = model.detect(open_cv_image)
            page_articles = []
            # 5. OCR only the text-bearing block types.
            for block in layout:
                if block.type in ('Text', 'Title'):
                    x0, y0, x1, y1 = block.coordinates
                    # Crop to the detected region for better OCR accuracy.
                    cropped_img = image.crop((x0, y0, x1, y1))
                    # 6. Multi-language OCR (Marathi + English + Hindi).
                    text = pytesseract.image_to_string(cropped_img, lang='mar+eng+hin')
                    page_articles.append({
                        "id": f"art_{i}_{len(page_articles)}",
                        "type": block.type,
                        "bbox": {
                            "x0": x0,
                            "y0": y0,
                            "x1": x1,
                            "y1": y1
                        },
                        "text": text.strip()
                    })
            results.append({
                "page": i + 1,
                "articles": page_articles
            })
        return {"success": True, "data": results}
    except HTTPException:
        # Bug fix: the generic handler below used to catch the deliberate
        # 400 raised above and rewrap it as a 500 — re-raise it unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
if __name__ == "__main__":
    # Hugging Face Spaces routes traffic to port 7860, so the server
    # must listen there on all interfaces.
    uvicorn.run(app, host="0.0.0.0", port=7860)