Spaces:

triflix
/

sortitout

Sleeping

App Files Files Community

sortitout / app.py

triflix

Update app.py

6894202 verified about 1 month ago

raw

history blame contribute delete

4.1 kB

	import os
	import uuid
	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.responses import JSONResponse
	from typing import List
	import fitz

	# -------------------------------------------------------------------
	# FORCE PADDLEX / PADDLEOCR CACHE DIRECTORIES TO WRITABLE LOCATIONS
	# -------------------------------------------------------------------
	# Both variables are exported first and the directories created afterwards;
	# this has to run before paddleocr is imported further down.
	_CACHE_DIRS = {
	    "PADDLE_HOME": "/app/paddle_home",
	    "XDG_CACHE_HOME": "/app/xdg_cache",
	}
	os.environ.update(_CACHE_DIRS)
	for _cache_dir in _CACHE_DIRS.values():
	    os.makedirs(_cache_dir, exist_ok=True)

	# now safe to import paddlex/paddleocr
	from paddleocr import PaddleOCR

	# -------------------------------------------------------------------
	# PDF → IMAGE
	# -------------------------------------------------------------------
	def pdf_to_images(pdf_path: str, max_pages: int \| None = 3) -> List[str]:
	if not os.path.exists(pdf_path):
	raise FileNotFoundError(pdf_path)

	doc = fitz.open(pdf_path)
	page_count = len(doc)

	limit = page_count if max_pages is None else min(max_pages, page_count)
	output_paths: List[str] = []

	out_dir = "/app/pdf_images"
	os.makedirs(out_dir, exist_ok=True)

	for i in range(limit):
	page = doc.load_page(i)
	pix = page.get_pixmap(dpi=220)
	img_name = f"{uuid.uuid4()}.jpg"
	img_path = os.path.join(out_dir, img_name)
	pix.save(img_path)
	output_paths.append(img_path)

	return output_paths


	# -------------------------------------------------------------------
	# OCR ENGINE
	# -------------------------------------------------------------------
	# Single module-level OCR engine, built once at import time and shared by
	# every call that runs recognition.
	# NOTE(review): lang="mr" together with the devanagari recognition model
	# presumably targets Marathi text — confirm against the PaddleOCR docs.
	ocr_engine = PaddleOCR(
	lang="mr",
	text_recognition_model_name="devanagari_PP-OCRv5_mobile_rec",
	# Document-orientation classification, unwarping and text-line
	# orientation are all explicitly disabled.
	use_doc_orientation_classify=False,
	use_doc_unwarping=False,
	use_textline_orientation=False
	)


	def extract_text(image_path: str):
	    """Run the shared OCR engine on one image and flatten its output.

	    Returns a list of ``{"text": ..., "confidence": ...}`` dicts, one per
	    recognized text entry, in the order the engine reports them. The
	    confidence is converted to a plain ``float``.
	    """
	    predictions = ocr_engine.predict(input=image_path)
	    return [
	        {"text": text, "confidence": float(score)}
	        for page in predictions
	        for text, score in zip(page["rec_texts"], page["rec_scores"])
	    ]


	# -------------------------------------------------------------------
	# FASTAPI
	# -------------------------------------------------------------------
	# Application object; the route below is registered on it via its decorator.
	app = FastAPI()
	# Directory where uploaded files are written before being processed.
	UPLOAD_DIR = "/app/uploads"
	# Created at import time so request handlers can write into it directly.
	os.makedirs(UPLOAD_DIR, exist_ok=True)


	# Upload suffixes the endpoint accepts (compared against the lower-cased name).
	_SUPPORTED_SUFFIXES = (".pdf", ".jpg", ".jpeg", ".png")


	def _remove_quietly(path: str) -> None:
	    """Delete *path* on a best-effort basis; cleanup must not mask the response."""
	    try:
	        os.remove(path)
	    except OSError:
	        # Already gone or not removable: nothing useful to do in a request.
	        pass


	@app.post("/ocr")
	async def ocr_endpoint(files: List[UploadFile] = File(...), max_pages: int | None = 3):
	    """OCR up to 15 uploaded PDFs/images and return the recognized text.

	    Args:
	        files: Uploaded files; each name must end in .pdf, .jpg, .jpeg or .png
	            (case-insensitive).
	        max_pages: Per-PDF page limit forwarded to ``pdf_to_images``;
	            ``None`` processes every page. Ignored for image uploads.

	    Returns:
	        A JSON body ``{"files": [...]}`` where each entry carries ``file_id``
	        (``file_1``, ``file_2``, ...), the lower-cased ``filename`` and a
	        ``pages`` list of ``{"page_index", "results"}`` records.

	    Raises:
	        HTTPException: 400 when more than 15 files are sent or when any file
	            has a missing or unsupported name.
	    """
	    if len(files) > 15:
	        raise HTTPException(status_code=400, detail="Maximum 15 files allowed.")

	    # Validate every upload up front so a bad file is rejected before
	    # anything is written to disk or any OCR time is spent.
	    filenames = []
	    for file in files:
	        # UploadFile.filename may be None when the client omits it.
	        filename = (file.filename or "").lower()
	        if not filename.endswith(_SUPPORTED_SUFFIXES):
	            raise HTTPException(status_code=400, detail=f"Unsupported type: {filename}")
	        filenames.append(filename)

	    structured_output = {"files": []}

	    for index, (file, filename) in enumerate(zip(files, filenames), start=1):
	        # Safe to reuse in a path: validation above restricts it to the
	        # allow-listed suffixes.
	        ext = filename.rsplit(".", 1)[-1]
	        temp_path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}.{ext}")

	        file_record = {
	            "file_id": f"file_{index}",
	            "filename": filename,
	            "pages": []
	        }

	        # Every scratch file created for this upload, removed in `finally`
	        # so repeated requests do not fill the disk.
	        scratch_paths = [temp_path]
	        try:
	            with open(temp_path, "wb") as f:
	                f.write(await file.read())

	            # NOTE(review): the OCR calls below are synchronous and run
	            # inside this coroutine; consider a worker thread if that
	            # becomes a latency problem.
	            if ext == "pdf":
	                img_paths = pdf_to_images(temp_path, max_pages=max_pages)
	                scratch_paths.extend(img_paths)

	                for page_idx, img_path in enumerate(img_paths):
	                    file_record["pages"].append({
	                        "page_index": page_idx,
	                        "results": extract_text(img_path)
	                    })
	            else:
	                # A single image is reported as one page with index 0.
	                file_record["pages"].append({
	                    "page_index": 0,
	                    "results": extract_text(temp_path)
	                })
	        finally:
	            for path in scratch_paths:
	                _remove_quietly(path)

	        structured_output["files"].append(file_record)

	    return JSONResponse(structured_output)