extract_html_full / parser /preamble_extractor.py
Mazenbs's picture
Update parser/preamble_extractor.py
d54b4e8 verified
# parser/preamble_extractor.py
import re
from typing import List, Tuple
from parser.article_extractor import ARTICLE_PATTERN
from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE
def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
    """Split text into a preamble and the structured remainder.

    Every block is broken into lines; each line is stripped of surrounding
    whitespace and blank lines are discarded. Lines are collected as preamble
    until the first line that either starts with one of ``SECTION_KEYWORDS``
    (followed by a word boundary) or matches ``ARTICLE_PATTERN``. That line
    and every non-blank line after it, across all subsequent blocks, are
    collected as the remainder.

    Args:
        text_blocks: Text chunks in document order; each may span several
            lines.

    Returns:
        A ``(preamble_lines, remaining_lines)`` pair of lists of stripped,
        non-empty lines. ``remaining_lines`` is empty when no section or
        article start is found.
    """
    # The section-start pattern does not depend on the line being examined,
    # so build and compile it once instead of on every iteration.
    # NOTE(review): keywords are joined without re.escape, so each one is
    # interpreted as a regex fragment - confirm that is intended.
    keyword_alternation = "|".join(SECTION_KEYWORDS)
    section_start = re.compile(rf"^\s*(?:{keyword_alternation})\b")

    preamble_lines: List[str] = []
    remaining_lines: List[str] = []
    found_structure = False

    for block in text_blocks:
        for raw_line in block.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # The first section heading or article marker ends the preamble;
            # once seen, no further matching is needed.
            if not found_structure and (
                section_start.match(line) or re.match(ARTICLE_PATTERN, line)
            ):
                found_structure = True

            if found_structure:
                remaining_lines.append(line)
            else:
                preamble_lines.append(line)

    # Always exactly two values: (preamble, remainder).
    return preamble_lines, remaining_lines