Spaces:
Running
Running
| # parser/preamble_extractor.py | |
| import re | |
| from typing import List, Tuple | |
| from parser.article_extractor import ARTICLE_PATTERN | |
| from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE | |
def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
    """Split document lines into a preamble and the structured remainder.

    Every block is broken into lines; each line is stripped of surrounding
    whitespace and blank lines are dropped. Lines are collected as preamble
    until the first line that either starts with one of ``SECTION_KEYWORDS``
    or matches ``ARTICLE_PATTERN``. That line, and every non-blank line after
    it (including lines from later blocks), goes to the remainder.

    Args:
        text_blocks: Raw text blocks; each may span several lines.

    Returns:
        A 2-tuple ``(preamble_lines, remaining_lines)`` of stripped,
        non-empty lines in their original order.
    """
    # The keyword alternation does not depend on the line being tested, so
    # build and compile it once per call instead of once per line.
    section_start = re.compile(rf"^\s*(?:{'|'.join(SECTION_KEYWORDS)})\b")

    preamble_lines: List[str] = []
    remaining_lines: List[str] = []
    found_structure = False

    for block in text_blocks:
        for raw_line in block.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # The first section or article heading ends the preamble; once it
            # has been seen, no further pattern matching is needed.
            if not found_structure and (
                section_start.match(line) or re.match(ARTICLE_PATTERN, line)
            ):
                found_structure = True

            target = remaining_lines if found_structure else preamble_lines
            target.append(line)

    # Callers unpack exactly two values: (preamble, remainder).
    return preamble_lines, remaining_lines