Spaces:

Mazenbs
/

extract_html_full

Running

extract_html_full / parser /preamble_extractor.py

Update parser/preamble_extractor.py

d54b4e8 verified about 1 month ago

973 Bytes

	# parser/preamble_extractor.py
	import re
	from typing import List, Tuple
	from parser.article_extractor import ARTICLE_PATTERN
	from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE

	def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
	    """Split text into preamble lines and structured-body lines.

	    Every block is split into lines; each line is stripped and blank lines
	    are dropped. Lines are collected as preamble until the first line that
	    starts with one of ``SECTION_KEYWORDS`` (followed by a word boundary) or
	    matches ``ARTICLE_PATTERN``. That line and every later line, across all
	    remaining blocks, go to the second list.

	    Args:
	        text_blocks: Text blocks, each possibly holding several lines.

	    Returns:
	        A two-element tuple ``(preamble_lines, remaining_lines)``.
	    """
	    # Build the section-heading pattern once, outside the loops. The
	    # keywords are joined with a plain "|" so they act as regex alternatives
	    # (an escaped pipe would only match a literal "|" character), and the
	    # join is kept out of the f-string braces, where a backslash is a syntax
	    # error before Python 3.12. Keywords are interpolated verbatim, so any
	    # regex syntax they contain stays active.
	    section_re = None
	    if SECTION_KEYWORDS:
	        # An empty alternation would match every line that starts with a
	        # word character, so section matching is skipped without keywords.
	        alternatives = "|".join(SECTION_KEYWORDS)
	        section_re = re.compile(rf"^\s*(?:{alternatives})\b")

	    preamble_lines: List[str] = []
	    remaining_lines: List[str] = []
	    found_structure = False

	    for block in text_blocks:
	        for raw_line in block.splitlines():
	            line = raw_line.strip()
	            if not line:
	                continue

	            # The first section heading or article ends the preamble for good.
	            if not found_structure and (
	                (section_re is not None and section_re.match(line))
	                or re.match(ARTICLE_PATTERN, line)
	            ):
	                found_structure = True

	            if found_structure:
	                remaining_lines.append(line)
	            else:
	                preamble_lines.append(line)

	    # Must stay a tuple of exactly two values.
	    return preamble_lines, remaining_lines