Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,432 +1,464 @@
|
|
| 1 |
-
# app.py
|
| 2 |
-
# -*- coding: utf-8 -*-
|
| 3 |
-
|
| 4 |
-
import os
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
import
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
import
|
| 12 |
-
|
| 13 |
-
import
|
| 14 |
-
from
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
import
|
| 19 |
-
import
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
return
|
| 70 |
-
|
| 71 |
-
def
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
"""
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
for
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
)
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
if
|
| 305 |
-
return
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
"
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
"
|
| 337 |
-
|
| 338 |
-
"
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
|
| 344 |
-
|
| 345 |
-
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
|
| 349 |
-
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
|
| 355 |
-
|
| 356 |
-
|
| 357 |
-
|
| 358 |
-
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
| 370 |
-
|
| 371 |
-
|
| 372 |
-
|
| 373 |
-
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
logs.append(f"
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
|
| 409 |
-
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
|
| 4 |
+
import os
|
| 5 |
+
import io
|
| 6 |
+
import json
|
| 7 |
+
import uuid
|
| 8 |
+
import random
|
| 9 |
+
import tempfile
|
| 10 |
+
import shutil
|
| 11 |
+
import unicodedata
|
| 12 |
+
from dataclasses import dataclass
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import List, Tuple
|
| 15 |
+
|
| 16 |
+
import pandas as pd
|
| 17 |
+
from PIL import Image
|
| 18 |
+
from pypdf import PdfReader
|
| 19 |
+
import fitz # PyMuPDF
|
| 20 |
+
import regex as re2
|
| 21 |
+
import yake
|
| 22 |
+
from tqdm import tqdm
|
| 23 |
+
|
| 24 |
+
# ملاحظة: سنستورد torch/transformers داخل الدوال (تحميل كسول) لسرعة الإقلاع.
|
| 25 |
+
|
| 26 |
+
# =========================
# General settings
# =========================
random.seed(42)  # deterministic shuffling of choices/distractors across runs
DEFAULT_LANG = "ar"                                   # default content language (Arabic)
DEFAULT_NUM_QUESTIONS = 8                             # default number of MCQs to generate
DEFAULT_TROCR_MODEL = "microsoft/trocr-base-printed"  # faster than large
DEFAULT_TROCR_ZOOM = 2.8                              # default PDF->image render zoom for OCR

# Simple cache for the OCR pipeline, keyed by model id
_OCR_PIPE = {}
|
| 37 |
+
def _get_ocr_pipeline(model_id: str):
    """Lazily build and cache a TrOCR image-to-text pipeline for *model_id*."""
    # Deferred imports keep app start-up fast when OCR is never needed.
    from transformers import pipeline
    import torch

    device = 0 if torch.cuda.is_available() else -1  # GPU 0 if available, else CPU
    cached = _OCR_PIPE.get(model_id)
    if cached is None:
        cached = pipeline("image-to-text", model=model_id, device=device)
        _OCR_PIPE[model_id] = cached
    return cached
|
| 45 |
+
|
| 46 |
+
# =========================
|
| 47 |
+
# 2) استخراج النص من PDF
|
| 48 |
+
# =========================
|
| 49 |
+
def extract_text_with_pypdf(pdf_path: str) -> str:
    """Return the embedded text layer of *pdf_path*, pages joined by newlines."""

    def _page_text(page) -> str:
        # One corrupt page must not abort the whole extraction.
        try:
            return page.extract_text() or ""
        except Exception:
            return ""

    pages = PdfReader(pdf_path).pages
    return "\n".join(_page_text(page) for page in pages).strip()
|
| 59 |
+
|
| 60 |
+
def pdf_pages_to_images(pdf_path: str, zoom: float = 2.5) -> List[Image.Image]:
    """Rasterize every page of *pdf_path* into an RGB PIL image at the given zoom."""
    document = fitz.open(pdf_path)
    scale = fitz.Matrix(zoom, zoom)  # uniform x/y scaling factor
    rendered = []
    for page in document:
        bitmap = page.get_pixmap(matrix=scale, alpha=False)
        rendered.append(Image.frombytes("RGB", (bitmap.width, bitmap.height), bitmap.samples))
    document.close()
    return rendered
|
| 70 |
+
|
| 71 |
+
def extract_text_with_ocr(pdf_path: str, model_id: str, zoom: float = 2.5, disable_tqdm: bool = True) -> str:
    """OCR every page of *pdf_path* with TrOCR; join results with page markers."""
    pipe = _get_ocr_pipeline(model_id)
    pages = pdf_pages_to_images(pdf_path, zoom=zoom)
    progress = tqdm(pages, desc="TrOCR OCR", unit="p", disable=disable_tqdm)
    chunks = []
    for page_no, image in enumerate(progress, start=1):
        try:
            result = pipe(image)
            text = result[0]["generated_text"].strip() if result and "generated_text" in result[0] else ""
        except Exception:
            # An OCR failure on one page degrades to an empty page, not a crash.
            text = ""
        chunks.append(f"--- [Page {page_no}] ---\n{text}")
    return "\n\n".join(chunks).strip()
|
| 84 |
+
|
| 85 |
+
def is_extraction_good(text: str, min_chars: int = 250, min_alpha_ratio: float = 0.15) -> bool:
    """Heuristic check: is *text* long enough and alphanumeric-dense enough to trust?"""
    total = len(text)
    if total < min_chars:
        return False
    alnum_count = sum(1 for ch in text if ch.isalnum())
    # max(1, ...) guards the division for pathological empty input.
    return alnum_count / max(1, total) >= min_alpha_ratio
|
| 91 |
+
|
| 92 |
+
def save_text(text: str, out_path: str) -> None:
    """Write *text* to *out_path* as UTF-8, creating parent directories as needed."""
    parent = os.path.dirname(out_path) or "."
    os.makedirs(parent, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as handle:
        handle.write(text)
|
| 96 |
+
|
| 97 |
+
def pdf_to_txt(pdf_path: str, out_txt_path: str = None,
               ocr_model: str = DEFAULT_TROCR_MODEL,
               ocr_zoom: float = DEFAULT_TROCR_ZOOM) -> Tuple[str, str, str]:
    """Extract text from *pdf_path*: embedded layer first, OCR fallback.

    Returns (extracted_text, path_of_saved_txt, method_label). The text is
    also written to *out_txt_path* (defaults to the PDF path with .txt) with
    a header line recording the extraction method.
    """
    assert os.path.isfile(pdf_path), f"File not found: {pdf_path}"

    embedded_text = extract_text_with_pypdf(pdf_path)
    if is_extraction_good(embedded_text):
        final_text, method = embedded_text, "embedded (pypdf)"
    elif not ocr_model:
        # Trial mode without OCR: keep the (weak) embedded text.
        final_text, method = embedded_text, "embedded (pypdf: weak)"
    else:
        final_text = extract_text_with_ocr(pdf_path, model_id=ocr_model, zoom=ocr_zoom)
        method = "OCR (Hugging Face TrOCR)"

    if out_txt_path is None:
        out_txt_path = os.path.splitext(pdf_path)[0] + ".txt"

    header = f"[[ Extraction method: {method} ]]\n\n"
    save_text(header + final_text, out_txt_path)
    return final_text, out_txt_path, method
|
| 122 |
+
|
| 123 |
+
# =========================
|
| 124 |
+
# 3) تطبيع/تصحيح عربي
|
| 125 |
+
# =========================
|
| 126 |
+
def strip_page_headers(text: str) -> str:
    """Drop page-marker lines: OCR page markers, bare "Page N" headers, rule lines."""
    header_patterns = (
        r"^\s*--- \[Page \d+\] ---\s*$",      # markers inserted by extract_text_with_ocr
        r"^\s*(Page\s*\d+|صفحة\s*\d+)\s*$",   # bare page-number headers (EN/AR)
        r"^\s*[-–—_*]{3,}\s*$",               # horizontal rules / separators
    )
    kept = [
        line
        for line in text.splitlines()
        if not any(re2.match(pattern, line) for pattern in header_patterns)
    ]
    return "\n".join(kept)
|
| 138 |
+
|
| 139 |
+
# Character class matching Arabic diacritic marks (tanween, shadda, sukun, ...).
AR_DIAC = r"[ًٌٍَُِّْ]"

def normalize_arabic(text: str) -> str:
    """Canonicalize Arabic text: NFKC, strip tatweel/diacritics, unify alef/ya, squeeze spaces."""
    normalized = unicodedata.normalize("NFKC", text)
    substitutions = (
        (r"[ـ]", ""),       # tatweel (kashida) elongation
        (AR_DIAC, ""),      # diacritic marks
        (r"[إأآا]", "ا"),   # alef variants -> bare alef
        (r"[يى]", "ي"),     # alef maqsura -> ya
        (r"\s+", " "),      # collapse whitespace runs
    )
    for pattern, replacement in substitutions:
        normalized = re2.sub(pattern, replacement, normalized)
    return normalized.strip()
|
| 148 |
+
|
| 149 |
+
def arabic_ocr_fixes(text: str) -> str:
    """Apply hand-collected substring corrections for common Arabic OCR mistakes."""
    # Order matters: substitutions are applied sequentially, left to right.
    corrections = (
        (" الصطناعي", " الاصطناعي"),
        ("صطناعي", "اصطناعي"),
        ("التعل م", "التعلم"),
        ("الذكاء الاصطناعيي", "الذكاء الاصطناعي"),
        ("ذكاء صطناعي", "ذكاء اصطناعي"),
        ("الذكاء الاصطناعي.", "الذكاء الاصطناعي."),
        ("التعليم ", "التعليم "),
        (" مع غني", " غني"),
        ("مع غني ", " غني "),
        (" غير المشبعة", " غيرُ المشبعة"),
    )
    for wrong, right in corrections:
        text = text.replace(wrong, right)
    return text
|
| 165 |
+
|
| 166 |
+
def postprocess_text(raw_text: str, lang: str = "ar") -> str:
    """Clean extracted text: drop headers, squeeze blank runs, strip citations; Arabic-normalize when lang == 'ar'."""
    cleaned = strip_page_headers(raw_text)
    cleaned = cleaned.replace("\r", "\n")
    # Collapse 3+ consecutive newlines down to a single blank line.
    cleaned = re2.sub(r"\n{3,}", "\n\n", cleaned)
    # Remove citation-like fragments: "12 [ref]" / "3 (note)" and bare "[7]".
    cleaned = re2.sub(r"\d+\s*[\[\(][^\]\)]*[\]\)]", " ", cleaned)
    cleaned = re2.sub(r"\[\d+\]", " ", cleaned)
    if lang == "ar":
        cleaned = arabic_ocr_fixes(normalize_arabic(cleaned))
    return cleaned
|
| 176 |
+
|
| 177 |
+
# =========================
|
| 178 |
+
# 4) YAKE + تقسيم الجمل
|
| 179 |
+
# =========================
|
| 180 |
+
# Sentence boundary: split after '.', '!', Arabic '؟' or Latin '?' followed by whitespace.
SENT_SPLIT = re2.compile(r"(?<=[\.!؟\?])\s+")
|
| 181 |
+
# Arabic stop-words used to filter keyword candidates and distractors.
# NOTE: the raw list deliberately contains duplicates; set() deduplicates.
# Fix: the last group used to be written as comma-separated *quoted* tokens
# ("وهنا","اليه",...); str.split() only splits on whitespace, so the whole
# run became one giant unusable "stopword" and none of those words were
# filtered. They are now plain space-separated words.
AR_STOP = set("""
في على من إلى عن مع لدى ذلك هذه هذا الذين التي الذي اللواتي اللواتيا أو أم إن أن كان تكون كانوا كانت كنت كنا كانا كانتِ ثم قد لقد ربما بل لكن لكنَّ إلا سوى حتى حيث كما لما لماّ لماَّ لماً ما ماذا لماذا متى أين كيف أي أيّ أيُّ هناك هنا هناكَ تلك ذلكم ذلكن أولئك هؤلاء هما هن هم أنتِ أنتَ أنتما أنتن أنتم أنا نحن هي هو هنَّ همَّ
و أو كما بين بسبب بدون خلال عبر لدى لدىً حتى حيث ضمن عبره عليها عليه عليهم علي على إلي إليك إليه إليها لديك لديكِ لديه لديها لكم لكنكما لكنكن ولكن
هذا هذه ذلك تلك هؤلاء أولئك كل بعض أي أيّ أيًا أحد شيء شيئًا أشياء
وهنا اليه الي له لها لدي لديه لديها لنا عنده عندها مع عبر ضمن حسب حيث كما قد بل لكن إذ اذ اذا إن أن أيضا فإن فانه فإنه انه إنه مثلا مثلاً مثلاَ
""".split())
|
| 187 |
+
|
| 188 |
+
def top_keywords_yake(text: str, max_k: int = 120, lan: str = 'ar') -> List[str]:
    """Extract up to *max_k* unigram keywords with YAKE, deduplicated and filtered."""
    extractor = yake.KeywordExtractor(lan=lan, n=1, top=max_k)
    ranked = [kw for kw, _score in extractor.extract_keywords(text)]

    def _acceptable(word: str) -> bool:
        # Reject stop-words, too-short tokens, and pure punctuation/symbols.
        if len(word) < 3:
            return False
        if lan == "ar" and word in AR_STOP:
            return False
        return not re2.match(r"^[\p{P}\p{S}]+$", word)

    seen = set()
    keywords = []
    for raw in ranked:
        word = raw.strip()
        if word and word not in seen and _acceptable(word):
            seen.add(word)
            keywords.append(word)
    return keywords
|
| 205 |
+
|
| 206 |
+
# =========================
|
| 207 |
+
# 5) مُولِّد MCQ
|
| 208 |
+
# =========================
|
| 209 |
+
@dataclass
class MCQ:
    # A single multiple-choice (cloze) question generated from the source text.
    id: str                 # short unique id (first 8 chars of a uuid4)
    question: str           # sentence with the keyword blanked out ("_____")
    choices: List[str]      # answer options (distractors + correct, shuffled)
    answer_index: int       # index of the correct choice within `choices`
    explanation: str        # excerpt of the source sentence the item came from
|
| 216 |
+
|
| 217 |
+
def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences, keeping only those at least 25 characters long."""
    fragments = (piece.strip() for piece in SENT_SPLIT.split(text))
    return [sentence for sentence in fragments if sentence and len(sentence) >= 25]
|
| 220 |
+
|
| 221 |
+
def build_distractors(correct: str, pool: List[str], k: int = 3) -> List[str]:
    """Pick *k* plausible wrong answers for *correct* out of *pool* keywords.

    Falls back to dash placeholders when the pool runs short.
    """
    target = correct.strip()
    candidates = []
    for word in pool:
        if not word:
            continue
        trimmed = word.strip()
        # Skip the correct answer itself, tiny tokens, and stop-words.
        if trimmed == target or len(trimmed) < 3 or trimmed in AR_STOP:
            continue
        candidates.append(trimmed)

    random.shuffle(candidates)
    distractors = candidates[:k]

    # Pad with placeholder dashes when fewer than k candidates survived.
    fillers = ["—", "-", "—-"]
    while len(distractors) < k:
        distractors.append(random.choice(fillers))
    return distractors
|
| 246 |
+
|
| 247 |
+
def make_mcqs_from_text(text: str, n: int = 8, lang: str = 'ar') -> List[MCQ]:
    # Generate up to `n` cloze (fill-in-the-blank) MCQs from `text`.
    # Raises ValueError when no usable sentences exist, RuntimeError when no
    # question could be built at all.
    sentences = split_sentences(text)
    if not sentences:
        raise ValueError("النص قصير جدًا أو غير صالح لتوليد أسئلة.")

    keywords = top_keywords_yake(text, max_k=160, lan=lang)
    if not keywords:
        # Fallback: plain frequency ranking over word tokens when YAKE yields nothing.
        toks = re2.findall(r"[\p{L}\p{N}_]+", text)
        toks = [t for t in toks if not (lang == "ar" and t in AR_STOP)]
        freq = {}
        for t in toks:
            freq[t] = freq.get(t, 0) + 1
        keywords = [w for w, c in sorted(freq.items(), key=lambda x: -x[1])][:80]

    # Map each keyword to the first sentence containing it as a whole word
    # (the \p{L} lookarounds prevent matches inside larger words).
    sent_for_kw = {}
    for s in sentences:
        for kw in keywords:
            if re2.search(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", s) and kw not in sent_for_kw:
                sent_for_kw[kw] = s

    items: List[MCQ] = []
    used_sents = set()
    pool_iter = [kw for kw in keywords if kw in sent_for_kw]

    for kw in pool_iter:
        if len(items) >= n:
            break
        s = sent_for_kw[kw]
        if s in used_sents:
            # One question per sentence, to avoid near-duplicate items.
            continue
        # Blank out only the first whole-word occurrence of the keyword.
        blanked = re2.sub(rf"(?<!\p{{L}}){re2.escape(kw)}(?!\p{{L}})", "_____", s, count=1)
        correct = kw
        distractors = build_distractors(correct, [x for x in keywords if x != kw], k=3)
        choices = distractors + [correct]
        random.shuffle(choices)
        ans_idx = choices.index(correct)
        # Explanation quotes (at most) the first 220 chars of the source sentence.
        exp = f"مقتبس من الجملة: {s[:220]}" + ("..." if len(s) > 220 else "")
        items.append(MCQ(
            id=str(uuid.uuid4())[:8],
            question=blanked,
            choices=choices,
            answer_index=ans_idx,
            explanation=exp
        ))
        used_sents.add(s)

    if not items:
        raise RuntimeError("تعذر توليد أسئلة من النص. جرّب نصاً أطول أو مختلفاً.")
    return items
|
| 296 |
+
|
| 297 |
+
# =========================
|
| 298 |
+
# 6) بناء JSON للإخراج
|
| 299 |
+
# =========================
|
| 300 |
+
# Punctuation trimmed from the edges of questions/choices (Arabic + Latin forms).
AR_PUNCT = "،؛؟"
EN_PUNCT = ",;?"
|
| 302 |
+
|
| 303 |
+
def normalize_punct(s: str) -> str:
    """Convert Latin punctuation to its Arabic form and trim edge punctuation/whitespace."""
    if not s:
        return ""
    for latin, arabic in ((",", "،"), (";", "؛"), ("?", "؟")):
        s = s.replace(latin, arabic)
    return s.strip().strip(AR_PUNCT + EN_PUNCT).strip()
|
| 308 |
+
|
| 309 |
+
def is_bad_choice(txt: str) -> bool:
    """Return True when *txt* is unusable as an answer option."""
    if not txt:
        return True
    candidate = txt.strip()
    # Frequent OCR/keyword noise that slips through the stop-word list.
    noise_words = {"وهنا", "اليه", "الي", "ليبق", "لان", "لانها", "لانّه", "ذلك", "هذا", "هذه"}
    return (
        candidate in noise_words
        or (len(candidate) > 18 and " " not in candidate)  # implausibly long single token
        or len(candidate) < 2
        or candidate in AR_STOP
        or bool(re2.match(r"^[\p{P}\p{S}]+$", candidate))  # pure punctuation/symbols
    )
|
| 325 |
+
|
| 326 |
+
def build_json_records(items: List[MCQ], lang: str, source_pdf: str, method: str):
    """Serialize MCQ items into JSON-ready dicts with A-D labelled options.

    Each record carries the question, its options (exactly one flagged
    ``is_correct``), an explanation, and provenance metadata.
    """
    json_data = []
    letters = ["A", "B", "C", "D"]
    for it in items:
        opts = []
        seen = set()
        for idx, lbl in enumerate(letters):
            raw = it.choices[idx] if idx < len(it.choices) else ""
            txt = normalize_punct(raw)
            if is_bad_choice(txt):
                txt = "—"  # placeholder for filtered-out noise options
            # Disambiguate visually-identical options by padding with spaces.
            # Fixed: this must be a `while`, not an `if` — with 3+ duplicates
            # a single pad still collides with the previously padded option.
            while txt in seen:
                txt += " "
            seen.add(txt)
            opts.append({
                "id": lbl,
                "text": txt,
                "is_correct": (it.answer_index == idx)
            })
        q_clean = normalize_punct(it.question)
        exp_clean = normalize_punct(it.explanation)
        record = {
            "id": it.id,
            "question": q_clean,
            "options": opts,
            "explanation": exp_clean,
            "meta": {
                "lang": lang,
                "normalized": True,
                "source_pdf": source_pdf,
                "extraction_method": method
            }
        }
        json_data.append(record)
    return json_data
|
| 361 |
+
|
| 362 |
+
# =========================
|
| 363 |
+
# 7) الدالة الرئيسية (تتعامل مع Filepath من Gradio)
|
| 364 |
+
# =========================
|
| 365 |
+
def process_pdf(pdf_file_path,
                num_questions=DEFAULT_NUM_QUESTIONS,
                lang=DEFAULT_LANG,
                trocr_model=DEFAULT_TROCR_MODEL,
                trocr_zoom=DEFAULT_TROCR_ZOOM):
    # Gradio entry point: PDF filepath -> (json_records, json_file_path, log_text).
    # On any failure it returns ({}, None, logs) instead of raising, so the UI
    # always receives something to display.
    logs = []
    try:
        if not pdf_file_path:
            return {}, None, "يرجى رفع ملف PDF أولاً."

        # pdf_file_path may be a str or a NamedString -> treat it as a path.
        src_path = str(pdf_file_path)
        # Derive a reasonable file name.
        name_guess = getattr(pdf_file_path, "name", "") if hasattr(pdf_file_path, "name") else ""
        filename = Path(name_guess).name or Path(src_path).name or "input.pdf"
        if not Path(filename).suffix:
            filename += ".pdf"

        # Work in a fresh temp dir so outputs from different runs never collide.
        workdir = tempfile.mkdtemp(prefix="mcq_")
        pdf_path = os.path.join(workdir, filename)
        shutil.copy(src_path, pdf_path)
        logs.append(f"تم نسخ الملف إلى: {pdf_path}")

        # 1) Extract the text (embedded layer, OCR fallback).
        raw_text, out_txt_path, method = pdf_to_txt(
            pdf_path=pdf_path,
            ocr_model=trocr_model,
            ocr_zoom=float(trocr_zoom)
        )
        logs.append(f"طريقة الاستخراج: {method}")

        # 2) Clean / normalize.
        cleaned_text = postprocess_text(raw_text, lang=lang)
        save_text(cleaned_text, os.path.join(workdir, "cleaned.txt"))
        logs.append("تم تنظيف/تطبيع النص.")

        # 3) Generate the questions.
        items = make_mcqs_from_text(cleaned_text, n=int(num_questions), lang=lang)
        logs.append(f"تم توليد {len(items)} سؤالاً.")

        # 4) Build the JSON records.
        json_records = build_json_records(items, lang=lang, source_pdf=Path(filename).name, method=method)
        json_str = json.dumps(json_records, ensure_ascii=False, indent=2)

        # 5) Save a JSON file for download.
        json_path = os.path.join(workdir, "mcqs.json")
        with open(json_path, "w", encoding="utf-8") as fj:
            fj.write(json_str)
        logs.append("تم إنشاء ملف mcqs.json.")

        return json_records, json_path, "\n".join(logs)

    except Exception as e:
        # Boundary handler: surface the error in the log pane rather than crash the UI.
        logs.append(f"خطأ: {e}")
        return {}, None, "\n".join(logs)
|
| 420 |
+
|
| 421 |
+
# =========================
# 8) Gradio UI (v5)
# =========================
import gradio as gr

with gr.Blocks(title="PDF → MCQ JSON (Arabic YAKE / TrOCR)") as demo:
    gr.Markdown("## تحويل PDF إلى أسئلة اختيار من متعدد وإرجاع JSON جاهز للواجهة")

    with gr.Row():
        inp_pdf = gr.File(
            label="ارفع PDF",
            file_count="single",
            file_types=[".pdf"],
            type="filepath",  # important: yields the file path, not bytes
        )
        with gr.Column():
            num_q = gr.Slider(4, 20, value=DEFAULT_NUM_QUESTIONS, step=1, label="عدد الأسئلة")
            trocr_zoom = gr.Slider(2.0, 3.5, value=DEFAULT_TROCR_ZOOM, step=0.1, label="دقة تحويل PDF لصور (Zoom)")
            trocr_model = gr.Dropdown(
                choices=[
                    "microsoft/trocr-base-printed",
                    "microsoft/trocr-large-printed",
                    "microsoft/trocr-base-handwritten",
                    "microsoft/trocr-large-handwritten",
                ],
                value=DEFAULT_TROCR_MODEL,
                label="موديل TrOCR"
            )

    btn = gr.Button("تشغيل المعالجة", variant="primary")
    out_json = gr.JSON(label="النتيجة (JSON)")
    out_file = gr.File(label="تحميل ملف JSON")
    out_log = gr.Textbox(label="Logs", lines=10)

    btn.click(
        fn=process_pdf,
        inputs=[inp_pdf, num_q, gr.State(DEFAULT_LANG), trocr_model, trocr_zoom],
        outputs=[out_json, out_file, out_log]
    )

# NOTE: Hugging Face Spaces auto-detects the module-level "demo" variable.
# For local runs:
if __name__ == "__main__":
    demo.queue().launch()
|