Spaces:
Running
Running
Upload main.py
#5
by
anwer-1 - opened
main.py
CHANGED
|
@@ -30,25 +30,34 @@ app.add_middleware(
|
|
| 30 |
allow_headers=["*"],
|
| 31 |
)
|
| 32 |
|
| 33 |
-
|
| 34 |
-
@app.on_event("startup")
|
| 35 |
-
async def startup_event():
|
| 36 |
-
print("Server started. OCR models will be loaded lazily on first request.")
|
| 37 |
-
|
| 38 |
-
|
| 39 |
# -------------------- تنظيف النص العربي --------------------
|
| 40 |
def clean_arabic_text(text: str) -> str:
|
| 41 |
if not text:
|
| 42 |
return ""
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
text = re.sub(r"[
|
| 46 |
|
| 47 |
-
# إزالة التشكيل
|
| 48 |
text = re.sub(r"[\u064B-\u065F]", "", text)
|
| 49 |
|
| 50 |
-
# إزالة أي مساف
|
| 51 |
-
text = re.sub(r"\s
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
return text.strip()
|
| 54 |
|
|
@@ -72,16 +81,10 @@ def get_models():
|
|
| 72 |
return paddle_detector, paddle_recognizer
|
| 73 |
|
| 74 |
|
| 75 |
-
def process_image(
|
| 76 |
-
img: np.ndarray,
|
| 77 |
-
detector,
|
| 78 |
-
recognizer,
|
| 79 |
-
min_conf: float
|
| 80 |
-
) -> List[Dict]:
|
| 81 |
-
|
| 82 |
h_img, w_img = img.shape[:2]
|
| 83 |
|
| 84 |
-
# 1️⃣
|
| 85 |
results = detector.predict(img)
|
| 86 |
|
| 87 |
all_rois = []
|
|
@@ -104,7 +107,7 @@ def process_image(
|
|
| 104 |
all_rois.append(roi)
|
| 105 |
all_bboxes.append([x1, y1, x2, y2])
|
| 106 |
|
| 107 |
-
# 2️⃣
|
| 108 |
ocr_results = []
|
| 109 |
|
| 110 |
for i, roi in enumerate(all_rois):
|
|
|
|
| 30 |
allow_headers=["*"],
|
| 31 |
)
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
# -------------------- تنظيف النص العربي --------------------
|
| 34 |
def clean_arabic_text(text: str) -> str:
    """Normalize a piece of recognized text that is expected to be Arabic.

    The clean-up runs in this order:

    1. ``:``, ``-``, ``_`` and ``/`` are turned into spaces so the tokens
       they separate (dates, labelled fields) do not get glued together.
    2. Code points U+064B-U+065F (diacritics) are removed.
    3. Every character that is not in U+0600-U+06FF, an ASCII digit or
       whitespace is dropped.
    4. A space is inserted before a month name that directly follows a
       non-digit character.
    5. Whitespace runs are collapsed to one space and the ends are stripped.

    Args:
        text: Raw text; any falsy value (``""``, ``None``) is accepted.

    Returns:
        The cleaned text, or ``""`` when ``text`` is falsy.
    """
    if not text:
        return ""

    # 1) Turn meaningful separators into spaces before symbols are stripped.
    text = re.sub(r"[:\-_/]", " ", text)

    # 2) Remove diacritics.
    text = re.sub(r"[\u064B-\u065F]", "", text)

    # 3) Drop anything that is not Arabic-block, an ASCII digit or whitespace.
    text = re.sub(r"[^\u0600-\u06FF0-9\s]", "", text)

    # Glued Arabic words are intentionally left alone: a character-class
    # regex cannot locate word boundaries, and splitting runs of letters
    # cuts ordinary words (and the month names handled below) in two.

    # 4) Detach month names (common in contracts) from a preceding
    #    non-digit character.
    months = (
        "يناير", "فبراير", "مارس", "ابريل", "أبريل", "مايو", "يونيو",
        "يوليو", "اغسطس", "أغسطس", "سبتمبر", "اكتوبر", "أكتوبر",
        "نوفمبر", "ديسمبر",
    )
    for month in months:
        text = re.sub(rf"(\D)({month})", r"\1 \2", text)

    # 5) Normalize spacing.
    text = re.sub(r"\s+", " ", text)

    return text.strip()
|
| 63 |
|
|
|
|
| 81 |
return paddle_detector, paddle_recognizer
|
| 82 |
|
| 83 |
|
| 84 |
+
def process_image(img: np.ndarray, detector, recognizer, min_conf: float) -> List[Dict]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
h_img, w_img = img.shape[:2]
|
| 86 |
|
| 87 |
+
# 1️⃣ كشف النصوص
|
| 88 |
results = detector.predict(img)
|
| 89 |
|
| 90 |
all_rois = []
|
|
|
|
| 107 |
all_rois.append(roi)
|
| 108 |
all_bboxes.append([x1, y1, x2, y2])
|
| 109 |
|
| 110 |
+
# 2️⃣ التعرف على النصوص
|
| 111 |
ocr_results = []
|
| 112 |
|
| 113 |
for i, roi in enumerate(all_rois):
|