Mazenbs committed on
Commit
4675394
·
verified ·
1 Parent(s): 9079299

Create parser/preamble_extractor.py

Browse files
Files changed (1) hide show
  1. parser/preamble_extractor.py +26 -0
parser/preamble_extractor.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser/preamble_extractor.py
2
+ import re
3
+ from typing import List, Tuple
4
+ from parser.article_extractor import ARTICLE_PATTERN
5
+ from parser.section_extractor import SECTION_KEYWORDS, SECTION_RE
6
+
7
def extract_preamble(text_blocks: List[str]) -> Tuple[List[str], List[str]]:
    """Split text into the lines before and after the first structural heading.

    Each block is split into lines; every line is stripped of surrounding
    whitespace and empty lines are dropped. Lines are collected as preamble
    until the first line that matches ``SECTION_RE`` or ``ARTICLE_PATTERN``
    at its start. That line and every later non-empty line, including lines
    from subsequent blocks, are collected as the remainder.

    Args:
        text_blocks: Text chunks, each of which may contain several lines.

    Returns:
        A ``(preamble_lines, remaining_lines)`` tuple of stripped, non-empty
        lines in their original order. When no line matches either pattern,
        every line is preamble and ``remaining_lines`` is empty.
    """
    preamble_lines: List[str] = []
    remaining_lines: List[str] = []
    found_structure = False

    for block in text_blocks:
        for raw_line in block.splitlines():
            line = raw_line.strip()
            if not line:
                continue

            # The flag never resets, so skip the pattern checks once the
            # first heading has been seen.
            if not found_structure and (
                re.match(SECTION_RE, line) or ARTICLE_PATTERN.match(line)
            ):
                found_structure = True

            if found_structure:
                remaining_lines.append(line)
            else:
                preamble_lines.append(line)

    return preamble_lines, remaining_lines