Spaces:

Mazenbs
/

extract_html_full

Running

extract_html_full / parser /section_extractor.py

Update parser/section_extractor.py

6378a68 verified about 1 month ago

1.16 kB

	from helpers.utils import normalize_digits, detect_line_type

	def extract_sections(texts: list):
	    """Group a flat sequence of text lines into named sections.

	    Each string in *texts* is stripped, passed through ``normalize_digits``
	    and then classified with ``detect_line_type``. A line classified as
	    ``"section"`` opens a new section and becomes that section's name;
	    any other line is appended to the section that is currently open.
	    Lines seen before the first heading are gathered under an empty name.

	    Args:
	        texts: Items to process. Anything that is not a ``str`` is skipped.

	    Returns:
	        A list of ``{"name": ..., "texts": [...]}`` dicts in input order.
	        A section is emitted only when it has a name or at least one line.
	    """
	    collected = []
	    heading = ""
	    body = []

	    for raw in texts:
	        # Skip non-string items instead of failing on them.
	        if not isinstance(raw, str):
	            continue

	        line = normalize_digits(raw.strip())
	        opens_section = detect_line_type(line) == "section"

	        if not opens_section:
	            # Every non-heading line belongs to the section currently open;
	            # no filtering is applied here, so blank lines are kept too.
	            body.append(line)
	            continue

	        # A heading closes the section in progress (if it holds anything)
	        # before a fresh one is started under the new name.
	        if body or heading:
	            collected.append({"name": heading, "texts": body})
	        heading = line
	        body = []

	    # Flush the section that was still open when the input ran out.
	    if body or heading:
	        collected.append({"name": heading, "texts": body})

	    return collected