Spaces:

Mazenbs
/

extract_html_full

Running

File size: 1,245 Bytes

225417c
c72529e
b498e97
33ad481
414bb93
e5a8aa5
b498e97
e5a8aa5
33ad481
e5a8aa5
33ad481
b498e97
 
 
 
c72529e
 
b498e97
c72529e
 
 
b498e97
e5a8aa5
 
414bb93
8db4722
c72529e
8db4722
c72529e
b498e97
8db4722
20ee4ec
b498e97
e5a8aa5
c72529e
414bb93
b498e97
e5a8aa5
 
414bb93
33ad481
414bb93

# parser/article_extractor.py
import re
from helpers.utils import is_article, extract_article_number, normalize_digits

def extract_articles(texts: list):
    """
    Group a sequence of text lines into articles.

    Each line is stripped and passed through ``normalize_digits``; the
    normalized form is used only for detection (``is_article``) and for
    reading the article number (``extract_article_number``). The text stored
    for each article is the original stripped line, not the normalized one.

    A line recognized as an article heading closes the article in progress
    (if any) and opens a new one. Every other line is appended to the article
    in progress; lines that appear before the first heading are discarded.

    Args:
        texts: Iterable of strings, one entry per line/paragraph.

    Returns:
        A list of dicts in input order, each with the keys ``"number"``
        (whatever ``extract_article_number`` returned for the heading) and
        ``"text"`` (the heading and its following lines joined with newlines,
        with surrounding whitespace stripped).
    """
    articles = []
    number = None
    # None until the first heading is seen; afterwards the lines of the
    # article in progress. Collecting lines and joining once avoids
    # re-copying the growing text on every appended line.
    lines = None

    for raw in texts:
        line = raw.strip()

        # Normalized copy (per the original comment: Indic digits converted
        # and text cleaned) is used for matching only.
        normalized = normalize_digits(line)

        if is_article(normalized):
            next_number = extract_article_number(normalized)

            # Close the previous article before starting the new one.
            if lines is not None:
                articles.append(
                    {"number": number, "text": "\n".join(lines).strip()}
                )

            # Keep the original text of the heading, not the normalized one.
            number = next_number
            lines = [line]
        elif lines is not None:
            # Continuation of the article in progress.
            lines.append(line)

    # Close the last article, if any was opened.
    if lines is not None:
        articles.append({"number": number, "text": "\n".join(lines).strip()})

    return articles