OCR / main.py
anwer-1's picture
Upload main.py
c6ef147 verified
raw
history blame
6.51 kB
from fastapi import FastAPI, File, UploadFile, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from typing import List, Dict
from io import BytesIO
from PIL import Image
import uvicorn
import os
import numpy as np
import cv2
import re
# PDF support is optional: it is switched on only when pdf2image imports.
try:
    from pdf2image import convert_from_bytes
    PDF_AVAILABLE = True
except Exception:
    # Any ordinary import failure (missing or broken install) disables PDF
    # support. KeyboardInterrupt / SystemExit are deliberately NOT caught,
    # which the previous bare ``except:`` did.
    PDF_AVAILABLE = False
# OCR models: both start unset and are filled in by get_models().
paddle_detector = paddle_recognizer = None

app = FastAPI(version="1.0.0", title="OCR Scan Vision API")

# Cross-origin access is fully open: any origin, method and header.
# NOTE(review): wildcard origins are combined with allow_credentials=True;
# the FastAPI CORS documentation asks for explicit origins when credentials
# are allowed -- confirm whether credentialed requests are really needed.
app.add_middleware(
    CORSMiddleware,
    allow_credentials=True,
    allow_origins=["*"],
    allow_headers=["*"],
    allow_methods=["*"],
)
# -------------------- Arabic text clean-up --------------------
# Month names (both hamza and bare-alef spellings) that tend to come out of
# OCR glued to the previous token, e.g. in contract dates.
_ARABIC_MONTHS = (
    "يناير", "فبراير", "مارس", "ابريل", "أبريل", "مايو", "يونيو",
    "يوليو", "اغسطس", "أغسطس", "سبتمبر", "اكتوبر", "أكتوبر",
    "نوفمبر", "ديسمبر",
)
_SEPARATOR_RE = re.compile(r"[:\-_/]")
_DIACRITIC_RE = re.compile(r"[\u064B-\u065F]")
_NON_ARABIC_RE = re.compile(r"[^\u0600-\u06FF0-9\s]")
# A month name directly preceded by any non-space character (letter or digit).
_GLUED_MONTH_RE = re.compile(
    r"(?<=\S)(" + "|".join(re.escape(m) for m in _ARABIC_MONTHS) + r")"
)
_WHITESPACE_RE = re.compile(r"\s+")


def clean_arabic_text(text: str) -> str:
    """Normalise a recognised Arabic text fragment.

    The clean-up runs in this order:

    1. ``:``, ``-``, ``_`` and ``/`` become spaces.
    2. Arabic diacritics (U+064B-U+065F) are removed.
    3. Every character that is not in the Arabic block (U+0600-U+06FF), an
       ASCII digit or whitespace is removed.
    4. A space is inserted before a month name that is glued to the
       preceding character, e.g. ``15يناير`` -> ``15 يناير``.
    5. Whitespace runs collapse to one space and the result is stripped.

    Args:
        text: Raw recognised text; ``None`` or an empty string is accepted.

    Returns:
        The cleaned text, or ``""`` when *text* is empty.
    """
    if not text:
        return ""

    text = _SEPARATOR_RE.sub(" ", text)
    text = _DIACRITIC_RE.sub("", text)
    text = _NON_ARABIC_RE.sub("", text)

    # No generic "split glued words" pass: a regex cannot know where one
    # Arabic word ends and the next begins. The earlier rule that split any
    # run of 4+ Arabic characters tore the last two letters off ordinary
    # words (and off Arabic-Indic numbers), so it has been removed.

    # NOTE(review): a month name that appears inside a longer word is also
    # split off here, exactly as the original per-month rule did.
    text = _GLUED_MONTH_RE.sub(r" \1", text)

    return _WHITESPACE_RE.sub(" ", text).strip()
def get_models():
    """Return the ``(detector, recognizer)`` pair, loading it on first use.

    The two PaddleX models are kept in the module-level globals
    ``paddle_detector`` and ``paddle_recognizer``. They are (re)created
    whenever either global is still ``None``.

    Raises:
        HTTPException: status 500 when importing PaddleX or creating either
            model fails.
    """
    global paddle_detector, paddle_recognizer

    both_ready = paddle_detector is not None and paddle_recognizer is not None
    if both_ready:
        return paddle_detector, paddle_recognizer

    try:
        # Imported here so the import only happens when models are needed.
        from paddlex import create_model

        print("Loading PaddleX OCR models...")
        paddle_detector = create_model("PP-OCRv5_server_det")
        paddle_recognizer = create_model("arabic_PP-OCRv5_mobile_rec")
        print("Models loaded.")
    except Exception as e:
        failure = f"OCR models failed to load: {str(e)}"
        raise HTTPException(status_code=500, detail=failure)

    return paddle_detector, paddle_recognizer
def process_image(img: np.ndarray, detector, recognizer, min_conf: float) -> List[Dict]:
    """Detect text regions in *img*, recognise each one and order the hits.

    Args:
        img: Image array; only ``img.shape[:2]`` (height, width) and slicing
            are used here.
        detector: Object whose ``predict(img)`` yields results exposing
            ``get("dt_polys", [])`` (one polygon per detected region).
        recognizer: Object whose ``predict(roi)`` yields results exposing
            ``get("rec_text")`` and ``get("rec_score")``.
        min_conf: Minimum recognition score for a region to be kept.

    Returns:
        A list of dicts with ``box_id`` (1-based detection index), ``text``
        (cleaned), ``confidence`` (rounded to 4 places) and ``bbox``
        (``[x1, y1, x2, y2]`` clipped to the image). Sorted by ``y1``
        ascending, then by ``x1`` descending (right-to-left).
    """
    h_img, w_img = img.shape[:2]

    # 1) Text detection: turn each polygon into a clipped, axis-aligned crop.
    all_rois = []
    all_bboxes = []
    for result in detector.predict(img):
        for box in result.get("dt_polys", []):
            pts = np.array(box, dtype=np.int32)
            x, y, w, h = cv2.boundingRect(pts)
            x1, y1 = max(x, 0), max(y, 0)
            x2, y2 = min(x + w, w_img), min(y + h, h_img)
            if x2 <= x1 or y2 <= y1:
                continue  # box lies completely outside the image
            roi = img[y1:y2, x1:x2]
            if roi.size > 0:
                all_rois.append(roi)
                all_bboxes.append([x1, y1, x2, y2])

    # 2) Text recognition, one crop at a time.
    ocr_results = []
    for box_id, (roi, bbox) in enumerate(zip(all_rois, all_bboxes), start=1):
        try:
            # iter() lets this work whether predict() returns a generator or
            # a plain sequence.
            rec = next(iter(recognizer.predict(roi)))
            raw_text = rec.get("rec_text", "")
            score = float(rec.get("rec_score", 0.0))
            text = clean_arabic_text(raw_text)
        except Exception as exc:
            # Best effort: one unreadable region must not fail the page.
            # Only ordinary errors are absorbed; KeyboardInterrupt and
            # SystemExit now propagate (the old bare ``except:`` ate them).
            print(f"Recognition failed for box {box_id}: {exc}")
            continue

        if score >= min_conf and text:
            ocr_results.append({
                "box_id": box_id,
                "text": text,
                "confidence": round(score, 4),
                "bbox": bbox,
            })

    # Arabic reading order: top -> bottom, then right -> left.
    # NOTE(review): the primary key is the exact y1, so boxes on the same
    # visual line with slightly different y1 are ordered by y1, not by x.
    ocr_results.sort(key=lambda item: (item["bbox"][1], -item["bbox"][0]))
    return ocr_results
@app.get("/")
def root():
    """Describe the service and report whether PDF input is available."""
    service_info = {
        "name": "OCR Scan Vision API",
        "status": "ok",
        "pdf_support": PDF_AVAILABLE,
    }
    return service_info
@app.get("/health")
def health():
    """Health probe: always answers with a fixed "healthy" status."""
    return dict(status="healthy")
@app.post("/ocr")
async def ocr_image(
    file: UploadFile = File(...),
    min_conf: float = Query(default=0.0, ge=0.0, le=1.0),
):
    """Run OCR on one uploaded image.

    Args:
        file: Uploaded image; it is opened with PIL and converted to RGB.
        min_conf: Minimum recognition score (0.0-1.0) for a box to be kept.

    Returns:
        A dict with ``items`` (per-box results from ``process_image``),
        ``text`` (box texts joined by newlines) and ``total_boxes``.

    Raises:
        HTTPException: status 400 when the upload cannot be read or decoded
            as an image; status 500 (from ``get_models``) when the OCR
            models fail to load.
    """
    try:
        contents = await file.read()
        pil_img = Image.open(BytesIO(contents)).convert("RGB")
        # The array is converted from RGB to BGR channel order before OCR.
        img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    except Exception as exc:
        # ``except Exception`` rather than a bare ``except:`` so that task
        # cancellation and interpreter-exit signals are not misreported as
        # an invalid image.
        raise HTTPException(status_code=400, detail="Invalid image file") from exc

    detector, recognizer = get_models()
    ocr_results = process_image(img, detector, recognizer, min_conf)
    full_text = "\n".join(r["text"] for r in ocr_results)

    return {
        "items": ocr_results,
        "text": full_text,
        "total_boxes": len(ocr_results),
    }
@app.post("/ocr-pdf")
async def ocr_pdf(
    file: UploadFile = File(...),
    dpi: int = Query(default=300, ge=72, le=600),
    min_conf: float = Query(default=0.0, ge=0.0, le=1.0),
):
    """Run OCR on every page of an uploaded PDF.

    Args:
        file: Uploaded PDF document.
        dpi: Resolution passed to ``convert_from_bytes`` (72-600).
        min_conf: Minimum recognition score (0.0-1.0) for a box to be kept.

    Returns:
        A dict with ``pages`` (page count), ``items`` (all boxes, each tagged
        with its 1-based ``page``), ``text`` (per-page sections, each under a
        ``--- Page N ---`` header, separated by blank lines) and
        ``total_boxes``.

    Raises:
        HTTPException: status 500 when PDF support is unavailable; status
            400 when the upload cannot be read or converted.
    """
    if not PDF_AVAILABLE:
        raise HTTPException(status_code=500, detail="PDF support not available")

    try:
        contents = await file.read()
        pages = convert_from_bytes(contents, dpi=dpi)
    except Exception as e:
        raise HTTPException(status_code=400, detail=f"Invalid PDF file: {e}")

    detector, recognizer = get_models()

    collected_items = []
    text_sections = []
    for page_num, pil_img in enumerate(pages, start=1):
        rgb_pixels = np.array(pil_img.convert("RGB"))
        img = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)

        page_results = process_image(img, detector, recognizer, min_conf)
        for item in page_results:
            item["page"] = page_num
        collected_items += page_results

        # Pages that produced no text get no section in the combined text.
        page_text = "\n".join(r["text"] for r in page_results)
        if page_text:
            text_sections.append(f"--- Page {page_num} ---\n{page_text}")

    response = {
        "pages": len(pages),
        "items": collected_items,
        "text": "\n\n".join(text_sections),
        "total_boxes": len(collected_items),
    }
    return response
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    uvicorn.run(
        "main:app",
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
    )