Spaces:

sharshar1
/

OCR

Sleeping

App Files Files Community

OCR / main.py

anwer-1

Upload main.py

c6ef147 verified 4 months ago

raw

history blame

6.51 kB

	import logging
	import os
	import re
	from io import BytesIO
	from typing import List, Dict

	import cv2
	import numpy as np
	import uvicorn
	from fastapi import FastAPI, File, UploadFile, HTTPException, Query
	from fastapi.middleware.cors import CORSMiddleware
	from PIL import Image

	# Optional PDF support: pdf2image is only needed by the /ocr-pdf endpoint.
	# Any ordinary failure while importing it disables PDF support instead of
	# preventing start-up; KeyboardInterrupt / SystemExit are deliberately not
	# swallowed (a bare ``except:`` would hide them).
	try:
	    from pdf2image import convert_from_bytes
	    PDF_AVAILABLE = True
	except Exception:
	    # Keep the name bound so the module namespace is the same either way.
	    convert_from_bytes = None
	    PDF_AVAILABLE = False

	# Lazily-loaded PaddleX model handles; get_models() fills them in on first use.
	paddle_detector = None
	paddle_recognizer = None

	app = FastAPI(title="OCR Scan Vision API", version="1.0.0")

	# CORS: the API is open to every origin, method and header.
	# Credentials stay disabled because the FastAPI CORS documentation states that
	# wildcard origins/methods/headers must not be combined with
	# allow_credentials=True; nothing in this file reads cookies or auth headers.
	# NOTE(review): if a browser client really needs credentialed requests, list
	# its origin(s) explicitly in allow_origins and re-enable allow_credentials.
	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=["*"],
	    allow_credentials=False,
	    allow_methods=["*"],
	    allow_headers=["*"],
	)

	# -------------------- Arabic text clean-up --------------------
	# Month names (both hamza and plain-alef spellings). OCR frequently glues them
	# to the day / year digits of a date, e.g. "15يناير2024".
	_MONTH_ALTERNATION = "|".join([
	    "يناير", "فبراير", "مارس", "ابريل", "أبريل", "مايو", "يونيو",
	    "يوليو", "اغسطس", "أغسطس", "سبتمبر", "اكتوبر", "أكتوبر",
	    "نوفمبر", "ديسمبر",
	])
	# Compiled once at import time; both only fire when a digit touches the month,
	# so ordinary words that merely contain a month name are left alone.
	_DIGIT_THEN_MONTH = re.compile(rf"(?<=\d)({_MONTH_ALTERNATION})")
	_MONTH_THEN_DIGIT = re.compile(rf"({_MONTH_ALTERNATION})(?=\d)")


	def clean_arabic_text(text: str) -> str:
	    """Normalise raw OCR text for Arabic documents.

	    Steps, in order:
	      1. ``:``, ``-``, ``_`` and ``/`` become spaces.
	      2. Arabic diacritics (U+064B-U+065F) are removed.
	      3. Every character that is not in the Arabic block (U+0600-U+06FF),
	         an ASCII digit or whitespace is removed.
	      4. A space is inserted between a month name and a digit glued to it.
	      5. Runs of whitespace collapse to one space and the ends are stripped.

	    Args:
	        text: Raw recognised text; ``None`` or an empty string is accepted.

	    Returns:
	        The cleaned text, or ``""`` when *text* is empty/falsy.
	    """
	    if not text:
	        return ""

	    # 1) Separators that usually sit between fields become plain spaces.
	    text = re.sub(r"[:\-_/]", " ", text)

	    # 2) Strip diacritics (tashkeel).
	    text = re.sub(r"[\u064B-\u065F]", "", text)

	    # 3) Drop anything that is not Arabic, an ASCII digit or whitespace.
	    text = re.sub(r"[^\u0600-\u06FF0-9\s]", "", text)

	    # NOTE: no generic "glued words" splitting is attempted. A regex cannot
	    # know where one Arabic word ends; the previous pattern
	    # ([\u0600-\u06FF]{2,})([\u0600-\u06FF]{2,}) split *every* word of four or
	    # more letters before its last two letters (e.g. "محمد" -> "مح مد").

	    # 4) Separate month names from adjacent day / year digits.
	    text = _DIGIT_THEN_MONTH.sub(r" \1", text)
	    text = _MONTH_THEN_DIGIT.sub(r"\1 ", text)

	    # 5) Normalise whitespace.
	    text = re.sub(r"\s+", " ", text)

	    return text.strip()


	def get_models():
	    """Return the ``(detector, recognizer)`` pair, creating both on first use.

	    The models are stored in the module-level ``paddle_detector`` and
	    ``paddle_recognizer`` globals; when both are already set they are returned
	    as-is. Otherwise ``paddlex.create_model`` is imported and called for each.

	    Raises:
	        HTTPException: status 500 if importing paddlex or creating either
	            model raises.
	    """
	    global paddle_detector, paddle_recognizer

	    # Fast path: both handles were populated by an earlier call.
	    if paddle_detector is not None and paddle_recognizer is not None:
	        return paddle_detector, paddle_recognizer

	    try:
	        # Imported here rather than at module level, so paddlex is only
	        # imported when models are first requested.
	        from paddlex import create_model

	        print("Loading PaddleX OCR models...")
	        paddle_detector = create_model("PP-OCRv5_server_det")
	        paddle_recognizer = create_model("arabic_PP-OCRv5_mobile_rec")
	        print("Models loaded.")
	    except Exception as exc:
	        raise HTTPException(
	            status_code=500,
	            detail=f"OCR models failed to load: {str(exc)}",
	        )

	    return paddle_detector, paddle_recognizer


	def process_image(img: np.ndarray, detector, recognizer, min_conf: float,
	                  line_tol: float = 0.5) -> List[Dict]:
	    """Detect text regions in *img*, recognise each and return them in reading order.

	    Args:
	        img: Image array. Its first two dimensions are taken as (height, width)
	            and crops are made with ``img[y1:y2, x1:x2]``.
	        detector: Object whose ``predict(img)`` is iterated; each result's
	            ``"dt_polys"`` entry is read as a sequence of polygons.
	        recognizer: Object whose ``predict(roi)`` returns an iterable; the
	            first item's ``"rec_text"`` and ``"rec_score"`` are used.
	        min_conf: Regions scoring below this value are dropped.
	        line_tol: Two boxes are treated as the same text line when their
	            vertical centres differ by at most ``line_tol`` times the smaller
	            box height.

	    Returns:
	        A list of dicts with ``box_id`` (1-based detection index), ``text``,
	        ``confidence`` (rounded to 4 places) and ``bbox`` (``[x1, y1, x2, y2]``),
	        ordered top-to-bottom and right-to-left within a line.
	    """
	    h_img, w_img = img.shape[:2]

	    # 1) Detection: turn every polygon into an axis-aligned box clipped to the
	    #    image, keeping only boxes with positive area.
	    regions = []
	    for result in detector.predict(img):
	        for box in result.get("dt_polys", []):
	            pts = np.array(box, dtype=np.int32)
	            x, y, w, h = cv2.boundingRect(pts)

	            x1, y1 = max(x, 0), max(y, 0)
	            x2, y2 = min(x + w, w_img), min(y + h, h_img)

	            if x2 > x1 and y2 > y1:
	                regions.append(([x1, y1, x2, y2], img[y1:y2, x1:x2]))

	    # 2) Recognition: best-effort per region. A failure skips that region only,
	    #    but is logged instead of being silently discarded.
	    ocr_results = []
	    for box_id, (bbox, roi) in enumerate(regions, start=1):
	        try:
	            # iter() makes this work whether predict() returns a generator
	            # or a plain list.
	            rec = next(iter(recognizer.predict(roi)), None)
	            if rec is None:
	                continue
	            score = float(rec.get("rec_score", 0.0))
	            text = clean_arabic_text(rec.get("rec_text", ""))
	        except Exception:
	            logging.getLogger(__name__).warning(
	                "Text recognition failed for box %d", box_id, exc_info=True
	            )
	            continue

	        if text and score >= min_conf:
	            ocr_results.append({
	                "box_id": box_id,
	                "text": text,
	                "confidence": round(score, 4),
	                "bbox": bbox,
	            })

	    # 3) Arabic reading order: top -> bottom, right -> left.
	    return _sort_reading_order(ocr_results, line_tol)


	def _sort_reading_order(items, line_tol):
	    """Order *items* top-to-bottom, and right-to-left inside each text line.

	    Sorting on the exact top-Y pixel would put a left-hand box first whenever
	    it sits even one pixel higher than its right-hand neighbour, so boxes are
	    first grouped into lines by vertical-centre proximity.
	    """
	    def centre_y(item):
	        return (item["bbox"][1] + item["bbox"][3]) / 2.0

	    def height(item):
	        return item["bbox"][3] - item["bbox"][1]

	    def right_to_left(line):
	        # Rightmost box first; equal left edges fall back to the higher box.
	        return sorted(line, key=lambda it: (-it["bbox"][0], it["bbox"][1]))

	    ordered = []
	    line = []
	    for item in sorted(items, key=centre_y):
	        if line:
	            # Compare against the first box of the line so that a long
	            # chain of small offsets cannot merge separate lines.
	            anchor = line[0]
	            tolerance = line_tol * min(height(anchor), height(item))
	            if abs(centre_y(item) - centre_y(anchor)) > tolerance:
	                ordered.extend(right_to_left(line))
	                line = []
	        line.append(item)
	    ordered.extend(right_to_left(line))
	    return ordered


	@app.get("/")
	def root():
	    # Service banner: API name, a fixed "ok" status and whether the optional
	    # PDF dependency was importable at start-up. (Kept as a comment rather
	    # than a docstring so the generated OpenAPI description is unchanged.)
	    banner = {"name": "OCR Scan Vision API", "status": "ok"}
	    banner["pdf_support"] = PDF_AVAILABLE
	    return banner


	@app.get("/health")
	def health():
	    # Health endpoint: always reports a fixed "healthy" status; it performs
	    # no checks of its own.
	    return dict(status="healthy")


	@app.post("/ocr")
	async def ocr_image(
	    file: UploadFile = File(...),
	    min_conf: float = Query(default=0.0, ge=0.0, le=1.0),
	):
	    # Run OCR on one uploaded image.
	    #   file:     the uploaded image; it is decoded with PIL and converted to
	    #             a BGR array before processing.
	    #   min_conf: minimum recognition score (0.0-1.0) for a box to be kept.
	    # Responds with the recognised boxes ("items"), their texts joined by
	    # newlines ("text") and the box count ("total_boxes").
	    # Raises HTTP 400 when the upload cannot be read or decoded as an image.
	    try:
	        contents = await file.read()
	        pil_img = Image.open(BytesIO(contents)).convert("RGB")
	        img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
	    except Exception as exc:
	        # ``Exception`` rather than a bare ``except:`` so that task
	        # cancellation and KeyboardInterrupt are not reported as a bad upload.
	        raise HTTPException(status_code=400, detail="Invalid image file") from exc

	    # NOTE(review): both calls below are synchronous and run inside this
	    # async handler for the whole duration of the OCR.
	    detector, recognizer = get_models()
	    ocr_results = process_image(img, detector, recognizer, min_conf)

	    full_text = "\n".join(r["text"] for r in ocr_results)

	    return {
	        "items": ocr_results,
	        "text": full_text,
	        "total_boxes": len(ocr_results),
	    }


	@app.post("/ocr-pdf")
	async def ocr_pdf(
	    file: UploadFile = File(...),
	    dpi: int = Query(default=300, ge=72, le=600),
	    min_conf: float = Query(default=0.0, ge=0.0, le=1.0),
	):
	    # Run OCR on every page of an uploaded PDF.
	    #   dpi:      resolution passed to pdf2image's convert_from_bytes.
	    #   min_conf: minimum recognition score for a box to be kept.
	    # Each returned item carries a 1-based "page" number; "text" holds one
	    # "--- Page N ---" section per page that produced any text.
	    # Raises HTTP 500 when pdf2image is unavailable and HTTP 400 when the
	    # upload cannot be read or converted.
	    if not PDF_AVAILABLE:
	        raise HTTPException(status_code=500, detail="PDF support not available")

	    try:
	        pdf_bytes = await file.read()
	        page_images = convert_from_bytes(pdf_bytes, dpi=dpi)
	    except Exception as e:
	        raise HTTPException(status_code=400, detail=f"Invalid PDF file: {e}")

	    detector, recognizer = get_models()

	    items = []
	    sections = []

	    for page_num, page_image in enumerate(page_images, 1):
	        rgb = np.array(page_image.convert("RGB"))
	        found = process_image(
	            cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR), detector, recognizer, min_conf
	        )

	        # Tag each box with its page and collect the page's lines in one pass.
	        lines = []
	        for entry in found:
	            entry["page"] = page_num
	            lines.append(entry["text"])
	        items += found

	        page_text = "\n".join(lines)
	        if page_text:
	            sections.append(f"--- Page {page_num} ---\n{page_text}")

	    return {
	        "pages": len(page_images),
	        "items": items,
	        "text": "\n\n".join(sections),
	        "total_boxes": len(items),
	    }


	if __name__ == "__main__":
	    # Script entry point: serve the app on all interfaces, on the port given
	    # by the PORT environment variable (7860 when it is not set).
	    uvicorn.run(
	        "main:app",
	        host="0.0.0.0",
	        port=int(os.getenv("PORT", "7860")),
	    )