File size: 973 Bytes
4675394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d54b4e8
 
4675394
 
 
 
 
 
 
d54b4e8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# parser/preamble_extractor.py
import re
from typing import List, Tuple
from parser.article_extractor import ARTICLE_PATTERN
from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE

def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
    """Split text into preamble lines and the structured remainder.

    Each block is split into lines; every line is stripped of surrounding
    whitespace and blank lines are dropped. Lines are collected as preamble
    until the first line that starts with one of ``SECTION_KEYWORDS`` or
    matches ``ARTICLE_PATTERN``. That line and every non-blank line after it
    (across all remaining blocks) go into the remainder.

    Args:
        text_blocks: Text blocks, each of which may span several lines.

    Returns:
        A 2-tuple ``(preamble_lines, remaining_lines)`` of stripped,
        non-blank lines in their original order.
    """
    # Build both matchers once: they do not depend on the line being tested,
    # so re-creating the keyword alternation per line is wasted work.
    section_match = re.compile(
        rf"^\s*(?:{'|'.join(SECTION_KEYWORDS)})\b"
    ).match
    article_match = re.compile(ARTICLE_PATTERN).match

    preamble_lines: List[str] = []
    remaining_lines: List[str] = []
    found_structure = False

    for block in text_blocks:
        for raw_line in block.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # The first section heading or article marks the end of the
            # preamble; the flag never resets once it is set.
            if not found_structure and (
                section_match(line) or article_match(line)
            ):
                found_structure = True

            if found_structure:
                remaining_lines.append(line)
            else:
                preamble_lines.append(line)

    # Always exactly two values: (preamble, remainder).
    return preamble_lines, remaining_lines