Mazenbs committed on
Commit
828df36
·
verified ·
1 Parent(s): 20ee4ec

Update parser/section_extractor.py

Browse files
Files changed (1) hide show
  1. parser/section_extractor.py +21 -50
parser/section_extractor.py CHANGED
@@ -1,67 +1,38 @@
 
1
  import re
2
  from typing import List, Dict, Any, Tuple
3
- from parser.article_extractor import ARTICLE_PATTERN
4
 
5
- SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل", "القسم"]
6
- SECTION_RE = re.compile(rf"^\s*(?:{'|'.join(SECTION_KEYWORDS)})\b", re.IGNORECASE)
7
 
8
  def is_section_line(line: str) -> bool:
9
- return bool(SECTION_RE.match(line))
10
-
11
- def split_title_and_following(line: str) -> Tuple[str, str]:
12
- match = re.search(rf"\bمادة\b", line)
13
- if match:
14
- idx = match.start()
15
- title = line[:idx].strip()
16
- follow = line[idx:].strip()
17
- if not title:
18
- title = line.strip()
19
- follow = ""
20
- return title, follow
21
- return line.strip(), ""
22
 
23
  def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
24
- """
25
- ترجع:
26
- - sections: قائمة بالأقسام وكل قسم يحتوي "title" و "lines"
27
- - preamble: المقدمة الحقيقية، تتوقف عند أول قسم أو أول مادة
28
- """
29
- sections: List[Dict[str, Any]] = []
30
- preamble: List[str] = []
31
- current: Dict[str, Any] = None
32
- found_structure = False # يعني: وجدنا أول قسم أو أول مادة
33
 
34
  for block in text_blocks:
35
- for raw_line in block.splitlines():
36
- line = raw_line.strip()
37
- if not line:
38
- continue
39
-
40
- # التحقق من بداية القسم أولاً
41
- if not found_structure and is_section_line(line):
42
- found_structure = True
43
 
44
- # التحقق من بداية المادة
45
- elif not found_structure and ARTICLE_PATTERN.match(line):
46
- found_structure = True
47
-
48
- if not found_structure:
49
- preamble.append(line)
50
  continue
51
 
52
- # بدأنا الأقسام
53
- if is_section_line(line):
54
- title, follow = split_title_and_following(line)
55
- current = {"title": title, "lines": []}
56
  sections.append(current)
57
- if follow:
58
- current["lines"].append(follow)
59
  continue
60
 
61
- if current is None:
62
- current = {"title": "", "lines": []}
63
- sections.append(current)
64
-
65
- current["lines"].append(line)
 
 
66
 
67
  return sections, preamble
 
1
+ # parser/section_extractor.py
2
  import re
3
  from typing import List, Dict, Any, Tuple
 
4
 
5
SECTION_KEYWORDS = ["الكتاب", "الباب", "الفصل"]

# Built once at import time instead of on every call. Keywords are escaped so
# a future entry containing regex metacharacters cannot corrupt the pattern.
# NOTE: the pattern is a snapshot of SECTION_KEYWORDS as it is at import time.
_SECTION_ALTERNATION = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)
_SECTION_RE = re.compile(rf"^\s*(?:{_SECTION_ALTERNATION})\b")


def is_section_line(line: str) -> bool:
    """Return True if *line* begins with one of ``SECTION_KEYWORDS``.

    Leading whitespace is ignored. The keyword must be a whole word: it has
    to be followed by a word boundary, so a longer word that merely starts
    with a keyword does not count.

    Args:
        line: A single line (or fragment) of text to test.

    Returns:
        True when the text starts with a section keyword, False otherwise
        (including for the empty string).
    """
    return _SECTION_RE.match(line) is not None
 
 
 
 
 
 
 
 
 
 
 
 
9
 
def extract_sections_from_text_blocks(text_blocks: List[str]) -> Tuple[List[Dict[str, Any]], List[str]]:
    """Split raw text blocks into a preamble and a list of titled sections.

    Each block is cut immediately after every "." and immediately before
    every whole-word occurrence of a keyword from ``SECTION_KEYWORDS``. The
    resulting pieces are stripped, empty pieces are dropped, and then:

    * a piece for which ``is_section_line`` is true opens a new section and
      becomes that section's ``"title"``;
    * any other piece is appended to the ``"lines"`` of the most recently
      opened section, or to the preamble if no section has been opened yet.

    Section state carries over from one block to the next.

    Args:
        text_blocks: Text chunks to scan, in document order.

    Returns:
        A ``(sections, preamble)`` tuple. ``sections`` is a list of
        ``{"title": str, "lines": List[str]}`` dicts in order of appearance;
        ``preamble`` is the list of pieces seen before the first section.
    """
    # Derive the split points from SECTION_KEYWORDS so the splitter and
    # is_section_line cannot drift apart, and compile once per call rather
    # than once per block.
    keyword_alternation = "|".join(re.escape(keyword) for keyword in SECTION_KEYWORDS)
    # Both alternatives are zero-width, so nothing is consumed by the split:
    # the "." stays at the end of its piece and the keyword at the start of
    # the next one.
    splitter = re.compile(rf"(?<=\.)|(?=\b(?:{keyword_alternation})\b)")

    sections: List[Dict[str, Any]] = []
    preamble: List[str] = []
    current = None  # the section currently being filled; None until the first title

    for block in text_blocks:
        for raw_part in splitter.split(block):
            part = raw_part.strip()
            if not part:
                continue

            if is_section_line(part):
                current = {"title": part, "lines": []}
                sections.append(current)
            elif current is None:
                # Nothing opened yet: this text precedes the first section.
                preamble.append(part)
            else:
                current["lines"].append(part)

    return sections, preamble