Spaces:
Sleeping
Sleeping
File size: 973 Bytes
4675394 d54b4e8 4675394 d54b4e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
# parser/preamble_extractor.py
import re
from typing import List, Tuple
from parser.article_extractor import ARTICLE_PATTERN
from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE
def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
    """Split the lines of ``text_blocks`` into a preamble and the remainder.

    Each block is split into lines; every line is stripped of surrounding
    whitespace and empty lines are dropped. Lines that appear before the
    first line starting with one of ``SECTION_KEYWORDS`` or matching
    ``ARTICLE_PATTERN`` are collected as the preamble. That first structural
    line and every line after it (across all later blocks) are collected as
    the remainder.

    Args:
        text_blocks: Text blocks, processed in the order given.

    Returns:
        A ``(preamble_lines, remaining_lines)`` tuple of two lists of
        stripped, non-empty lines. If no structural line is found, every
        line ends up in ``preamble_lines``.
    """
    # Build the section-heading pattern once rather than once per line; the
    # keyword alternation does not change while the function runs.
    keyword_alternation = "|".join(SECTION_KEYWORDS)
    section_start = re.compile(rf"^\s*(?:{keyword_alternation})\b")

    preamble_lines: List[str] = []
    remaining_lines: List[str] = []
    found_structure = False

    for block in text_blocks:
        for line in block.splitlines():
            line = line.strip()
            if not line:
                continue
            # The first section heading or article ends the preamble; once it
            # has been seen, no further pattern matching is needed.
            if not found_structure and (
                section_start.match(line) or re.match(ARTICLE_PATTERN, line)
            ):
                found_structure = True
            if found_structure:
                remaining_lines.append(line)
            else:
                preamble_lines.append(line)

    # Must stay a tuple of exactly two values.
    return preamble_lines, remaining_lines