Mazenbs committed on
Commit
19dc866
·
verified ·
1 Parent(s): 9bbdf9a

Create parser/section_extractor.py

Browse files
Files changed (1) hide show
  1. parser/section_extractor.py +38 -0
parser/section_extractor.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser/section_extractor.py
2
+ import re
3
+ from typing import List, Dict, Any, Tuple
4
+
5
# Arabic heading words that open a new section (roughly "the book",
# "the part", "the chapter" -- glosses are approximate).
SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]


def is_section_line(line: str) -> bool:
    """Return True when *line* begins with one of ``SECTION_KEYWORDS``.

    The keyword must sit at the very start of the string (leading
    whitespace is not skipped) and must end on a word boundary, so a longer
    word that merely starts with a keyword does not count.

    Args:
        line: The text fragment to inspect.

    Returns:
        True if the fragment starts with a section keyword, else False.
    """
    # An empty alternation would match at the start of any line that begins
    # with a word character, so treat "no keywords" as "nothing is a heading".
    if not SECTION_KEYWORDS:
        return False

    # Escape each keyword so regex metacharacters in a future entry are
    # matched literally instead of being interpreted as pattern syntax.
    alternation = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)

    # re.match already anchors at the start of the string.
    return re.match(rf"(?:{alternation})\b", line) is not None
9
+
10
def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Group text fragments into sections introduced by heading keywords.

    Each block is cut into fragments after every ``.`` and immediately
    before every occurrence of a section keyword. A fragment that
    ``is_section_line`` accepts opens a new section and becomes its title;
    every later fragment is appended to the most recently opened section.
    Fragments seen before the first heading are collected as the preamble.
    The current section carries over from one block to the next.

    Args:
        text_blocks: Raw text blocks, processed in order.

    Returns:
        A ``(sections, preamble)`` tuple. ``sections`` is a list of dicts
        with keys ``"title"`` (the heading fragment) and ``"lines"`` (the
        fragments that followed it). ``preamble`` is the list of fragments
        that appeared before any heading.
    """
    # Build the splitter from SECTION_KEYWORDS so it cannot drift from what
    # is_section_line recognises. Both alternatives are zero-width, so the
    # split keeps all text (re.split honours empty matches on Python 3.7+).
    alternation = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)
    if alternation:
        splitter = re.compile(rf"(?<=\.)|(?=\b(?:{alternation})\b)")
    else:
        # With no keywords an empty lookahead would split at every word
        # boundary; fall back to sentence splitting only.
        splitter = re.compile(r"(?<=\.)")

    sections: List[Dict[str, Any]] = []
    preamble: List[str] = []
    current = None  # section being filled; None until the first heading

    for block in text_blocks:
        for fragment in splitter.split(block):
            fragment = fragment.strip()
            if not fragment:
                continue

            if is_section_line(fragment):
                current = {"title": fragment, "lines": []}
                sections.append(current)
            elif current is None:
                preamble.append(fragment)
            else:
                current["lines"].append(fragment)

    return sections, preamble