Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, File, UploadFile, HTTPException, Query | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from typing import List, Dict | |
| from io import BytesIO | |
| from PIL import Image | |
| import uvicorn | |
| import os | |
| import numpy as np | |
| import cv2 | |
| import re | |
| # PDF support | |
| try: | |
| from pdf2image import convert_from_bytes | |
| PDF_AVAILABLE = True | |
| except: | |
| PDF_AVAILABLE = False | |
| # Models | |
| paddle_detector = None | |
| paddle_recognizer = None | |
| app = FastAPI(title="OCR Scan Vision API", version="1.0.0") | |
| app.add_middleware( | |
| CORSMiddleware, | |
| allow_origins=["*"], | |
| allow_credentials=True, | |
| allow_methods=["*"], | |
| allow_headers=["*"], | |
| ) | |
| # -------------------- تنظيف النص العربي -------------------- | |
| def clean_arabic_text(text: str) -> str: | |
| if not text: | |
| return "" | |
| # 1️⃣ تحويل الرموز المهمة لمسافات | |
| text = re.sub(r"[:\-_/]", " ", text) | |
| # 2️⃣ إزالة التشكيل | |
| text = re.sub(r"[\u064B-\u065F]", "", text) | |
| # 3️⃣ إزالة أي رموز غير عربي / أرقام / مسافة | |
| text = re.sub(r"[^\u0600-\u06FF0-9\s]", "", text) | |
| # 4️⃣ حل مشكلة الكلمات اللاصقة (عربي + عربي) | |
| text = re.sub(r"([\u0600-\u06FF]{2,})([\u0600-\u06FF]{2,})", r"\1 \2", text) | |
| # 5️⃣ إصلاح أشهر السنة (شائع في العقود) | |
| months = [ | |
| "يناير","فبراير","مارس","ابريل","أبريل","مايو","يونيو", | |
| "يوليو","اغسطس","أغسطس","سبتمبر","اكتوبر","أكتوبر", | |
| "نوفمبر","ديسمبر" | |
| ] | |
| for m in months: | |
| text = re.sub(rf"(\D)({m})", r"\1 \2", text) | |
| # 6️⃣ ضبط المسافات | |
| text = re.sub(r"\s+", " ", text) | |
| return text.strip() | |
| def get_models(): | |
| global paddle_detector, paddle_recognizer | |
| if paddle_detector is None or paddle_recognizer is None: | |
| try: | |
| from paddlex import create_model | |
| print("Loading PaddleX OCR models...") | |
| paddle_detector = create_model("PP-OCRv5_server_det") | |
| paddle_recognizer = create_model("arabic_PP-OCRv5_mobile_rec") | |
| print("Models loaded.") | |
| except Exception as e: | |
| raise HTTPException( | |
| status_code=500, | |
| detail=f"OCR models failed to load: {str(e)}" | |
| ) | |
| return paddle_detector, paddle_recognizer | |
| def process_image(img: np.ndarray, detector, recognizer, min_conf: float) -> List[Dict]: | |
| h_img, w_img = img.shape[:2] | |
| # 1️⃣ كشف النصوص | |
| results = detector.predict(img) | |
| all_rois = [] | |
| all_bboxes = [] | |
| for result in results: | |
| boxes = result.get("dt_polys", []) | |
| for box in boxes: | |
| pts = np.array(box, dtype=np.int32) | |
| x, y, w, h = cv2.boundingRect(pts) | |
| x1 = max(x, 0) | |
| y1 = max(y, 0) | |
| x2 = min(x + w, w_img) | |
| y2 = min(y + h, h_img) | |
| if x2 > x1 and y2 > y1: | |
| roi = img[y1:y2, x1:x2] | |
| if roi.size > 0: | |
| all_rois.append(roi) | |
| all_bboxes.append([x1, y1, x2, y2]) | |
| # 2️⃣ التعرف على النصوص | |
| ocr_results = [] | |
| for i, roi in enumerate(all_rois): | |
| try: | |
| rec_gen = recognizer.predict(roi) | |
| rec = next(rec_gen) | |
| raw_text = rec.get("rec_text", "") | |
| score = float(rec.get("rec_score", 0.0)) | |
| text = clean_arabic_text(raw_text) | |
| except: | |
| text = "" | |
| score = 0.0 | |
| if score >= min_conf and text: | |
| ocr_results.append({ | |
| "box_id": i + 1, | |
| "text": text, | |
| "confidence": round(score, 4), | |
| "bbox": all_bboxes[i] | |
| }) | |
| # ✅ ترتيب عربي: فوق → تحت ، يمين → شمال | |
| ocr_results.sort( | |
| key=lambda x: ( | |
| x["bbox"][1], # Y | |
| -x["bbox"][0] # X (RTL) | |
| ) | |
| ) | |
| return ocr_results | |
| def root(): | |
| return {"name": "OCR Scan Vision API", "status": "ok", "pdf_support": PDF_AVAILABLE} | |
| def health(): | |
| return {"status": "healthy"} | |
| async def ocr_image( | |
| file: UploadFile = File(...), | |
| min_conf: float = Query(default=0.0, ge=0.0, le=1.0), | |
| ): | |
| try: | |
| contents = await file.read() | |
| pil_img = Image.open(BytesIO(contents)).convert("RGB") | |
| img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) | |
| except: | |
| raise HTTPException(status_code=400, detail="Invalid image file") | |
| detector, recognizer = get_models() | |
| ocr_results = process_image(img, detector, recognizer, min_conf) | |
| full_text = "\n".join([r["text"] for r in ocr_results]) | |
| return { | |
| "items": ocr_results, | |
| "text": full_text, | |
| "total_boxes": len(ocr_results) | |
| } | |
| async def ocr_pdf( | |
| file: UploadFile = File(...), | |
| dpi: int = Query(default=300, ge=72, le=600), | |
| min_conf: float = Query(default=0.0, ge=0.0, le=1.0), | |
| ): | |
| if not PDF_AVAILABLE: | |
| raise HTTPException(status_code=500, detail="PDF support not available") | |
| try: | |
| contents = await file.read() | |
| pages = convert_from_bytes(contents, dpi=dpi) | |
| except Exception as e: | |
| raise HTTPException(status_code=400, detail=f"Invalid PDF file: {e}") | |
| detector, recognizer = get_models() | |
| all_results = [] | |
| all_text = [] | |
| for page_num, pil_img in enumerate(pages, start=1): | |
| img = cv2.cvtColor(np.array(pil_img.convert("RGB")), cv2.COLOR_RGB2BGR) | |
| page_results = process_image(img, detector, recognizer, min_conf) | |
| for item in page_results: | |
| item["page"] = page_num | |
| all_results.extend(page_results) | |
| page_text = "\n".join([r["text"] for r in page_results]) | |
| if page_text: | |
| all_text.append(f"--- Page {page_num} ---\n{page_text}") | |
| return { | |
| "pages": len(pages), | |
| "items": all_results, | |
| "text": "\n\n".join(all_text), | |
| "total_boxes": len(all_results) | |
| } | |
| if __name__ == "__main__": | |
| port = int(os.environ.get("PORT", 7860)) | |
| uvicorn.run("main:app", host="0.0.0.0", port=port) | |