Spaces:

sharshar1
/

OCR

Sleeping

App Files Files Community

Upload main.py

by anwer-1 - opened Dec 18, 2025

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

+23

-20

Files changed (1) hide show

main.py +23 -20

main.py CHANGED Viewed

@@ -30,25 +30,34 @@ app.add_middleware(
     allow_headers=["*"],
 )
-@app.on_event("startup")
-async def startup_event():
-    print("Server started. OCR models will be loaded lazily on first request.")
 # -------------------- تنظيف النص العربي --------------------
 def clean_arabic_text(text: str) -> str:
     if not text:
         return ""
-    # إزالة أي شيء غير عربي أو أرقام
-    text = re.sub(r"[^\u0600-\u06FF0-9]", "", text)
-    # إزالة التشكيل
     text = re.sub(r"[\u064B-\u065F]", "", text)
-    # إزالة أي مسافات
-    text = re.sub(r"\s+", "", text)
     return text.strip()
@@ -72,16 +81,10 @@ def get_models():
     return paddle_detector, paddle_recognizer
-def process_image(
-    img: np.ndarray,
-    detector,
-    recognizer,
-    min_conf: float
-) -> List[Dict]:
     h_img, w_img = img.shape[:2]
-    # 1️⃣ Detect text
     results = detector.predict(img)
     all_rois = []
@@ -104,7 +107,7 @@ def process_image(
                     all_rois.append(roi)
                     all_bboxes.append([x1, y1, x2, y2])
-    # 2️⃣ Recognize text
     ocr_results = []
     for i, roi in enumerate(all_rois):

     allow_headers=["*"],
 )
 # -------------------- تنظيف النص العربي --------------------
 def clean_arabic_text(text: str) -> str:
     """Normalize OCR output down to Arabic characters, digits and single spaces.

     Steps, in order:
       1. Turn the separators ``: - _ /`` into spaces so the tokens around
          them stay apart.
       2. Strip characters in U+064B..U+065F (Arabic diacritic marks).
       3. Drop everything that is not in the Arabic block (U+0600..U+06FF),
          an ASCII digit, or whitespace.
       4. Insert a space in front of a month name that is glued to a
          preceding non-digit, non-space character.
       5. Collapse whitespace runs to one space and trim both ends.

     Args:
         text: Raw recognized text; may be empty or ``None``.

     Returns:
         The cleaned string, or ``""`` when ``text`` is falsy.
     """
     if not text:
         return ""

     # 1. Separators become spaces instead of being deleted.
     text = re.sub(r"[:\-_/]", " ", text)
     # 2. Remove diacritics.
     text = re.sub(r"[\u064B-\u065F]", "", text)
     # 3. Remove anything that is not Arabic / ASCII digit / whitespace.
     text = re.sub(r"[^\u0600-\u06FF0-9\s]", "", text)

     # NOTE: an earlier step tried to separate "glued" words with
     # ([\u0600-\u06FF]{2,})([\u0600-\u06FF]{2,}) -> r"\1 \2". That pattern
     # matches every run of four or more Arabic characters and, because the
     # first group is greedy, puts a space before the last two letters of
     # each word. It has been removed: a regex cannot tell where two glued
     # words divide.

     # 4. Month names (both spellings with and without hamza) that are stuck
     #    to the previous word get a separating space. The lookbehind leaves
     #    the preceding character in place, and digits are excluded so a
     #    digit-prefixed month is left untouched, as before.
     months = (
         "يناير", "فبراير", "مارس", "ابريل", "أبريل", "مايو", "يونيو",
         "يوليو", "اغسطس", "أغسطس", "سبتمبر", "اكتوبر", "أكتوبر",
         "نوفمبر", "ديسمبر",
     )
     text = re.sub(rf"(?<=[^\d\s])({'|'.join(months)})", r" \1", text)

     # 5. Normalize whitespace.
     text = re.sub(r"\s+", " ", text)
     return text.strip()
     return paddle_detector, paddle_recognizer
+def process_image(img: np.ndarray, detector, recognizer, min_conf: float) -> List[Dict]:
     h_img, w_img = img.shape[:2]
+    # 1️⃣ كشف النصوص
     results = detector.predict(img)
     all_rois = []
                     all_rois.append(roi)
                     all_bboxes.append([x1, y1, x2, y2])
+    # 2️⃣ التعرف على النصوص
     ocr_results = []
     for i, roi in enumerate(all_rois):