Spaces:

Mazenbs
/

extract_html_full

Running

File size: 1,159 Bytes

6378a68
b771743
3aa15bc
8734fe8
6378a68
 
8734fe8
828df36
3aa15bc
ef66367
3aa15bc
6a895a1
 
b771743
6a895a1
6378a68
6a895a1
6378a68
 
 
8734fe8
b794d8c
b771743
6378a68
6a895a1
b771743
3aa15bc
6378a68
6a895a1
b794d8c
6a895a1
8734fe8
3aa15bc
19dc866
3aa15bc

from helpers.utils import normalize_digits, detect_line_type

def extract_sections(texts: list):
    """
    Group a flat sequence of text lines into named sections.

    Each string item is stripped, passed through ``normalize_digits`` and
    then classified with ``detect_line_type``. A line classified as
    ``"section"`` closes the section being built and becomes the name of
    the next one; every other line is appended to the body of the section
    currently being built.

    Items that are not ``str`` instances are ignored. Lines that appear
    before the first heading are collected under a section whose name is
    the empty string. A section with neither a name nor any lines is never
    emitted.

    Returns a list of dicts of the form ``{"name": ..., "texts": [...]}``,
    in the order the sections were encountered.
    """
    result = []
    heading = ""
    body = []

    for raw in texts:
        # Only string entries take part in the grouping.
        if not isinstance(raw, str):
            continue

        line = normalize_digits(raw.strip())
        starts_section = detect_line_type(line) == "section"

        if not starts_section:
            # Any non-heading line belongs to the section being built.
            body.append(line)
            continue

        # A heading was found: store what has been gathered so far (if
        # anything) before opening a new section named after this line.
        if body or heading:
            result.append({"name": heading, "texts": body})
        heading = line
        body = []

    # The last section has no following heading to trigger its storage.
    if body or heading:
        result.append({"name": heading, "texts": body})

    return result