Spaces:

Mazenbs
/

extract_html_full

Running

extract_html_full / parser /article_extractor.py

Update parser/article_extractor.py

225417c verified about 1 month ago

1.25 kB

	# parser/article_extractor.py
	import re
	from helpers.utils import is_article, extract_article_number, normalize_digits

	def extract_articles(texts: list) -> list:
	    """Group a flat sequence of text lines into articles.

	    Each element of ``texts`` is stripped, passed through ``normalize_digits``
	    and tested with ``is_article`` (both imported from ``helpers.utils``).
	    A line that matches starts a new article; every following non-matching
	    line is appended to that article until the next match.

	    Lines that appear before the first article header are discarded.

	    Args:
	        texts: Iterable of strings, one entry per extracted text line.

	    Returns:
	        A list of dicts in input order, each with:
	        ``"number"`` - whatever ``extract_article_number`` returned for the
	        normalized header line, and
	        ``"text"`` - the original (non-normalized) stripped lines of the
	        article joined with ``"\\n"``, with outer whitespace removed.
	    """
	    articles = []
	    current_number = None
	    # Lines of the article being collected; None until the first header.
	    # A list + single join avoids rebuilding the string on every line.
	    current_lines = None

	    for raw in texts:
	        line = raw.strip()

	        # Matching is done on the digit-normalized form only
	        # (presumably Arabic-Indic digits are converted - see helpers.utils).
	        normalized = normalize_digits(line)

	        if is_article(normalized):
	            number = extract_article_number(normalized)

	            # Close the previous article before opening a new one.
	            if current_lines is not None:
	                articles.append(_finish_article(current_number, current_lines))

	            # Keep the original text, not the normalized one.
	            current_number = number
	            current_lines = [line]
	        elif current_lines is not None:
	            # Continuation of the article currently being collected.
	            current_lines.append(line)

	    # Flush the last open article, if any.
	    if current_lines is not None:
	        articles.append(_finish_article(current_number, current_lines))

	    return articles


	def _finish_article(number, lines: list) -> dict:
	    """Build the final article dict from its number and collected lines."""
	    return {
	        "number": number,
	        "text": "\n".join(lines).strip(),
	    }