Mazenbs committed on
Commit
33ad481
·
verified ·
1 Parent(s): 19dc866

Create parser/article_extractor.py

Browse files
Files changed (1) hide show
  1. parser/article_extractor.py +36 -0
parser/article_extractor.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # parser/article_extractor.py
2
+ import re
3
+ from typing import List, Dict, Any, Optional
4
+ from helpers.utils import normalize_digits
5
+
6
+ ARTICLE_KEYWORD = "مادة"
7
+
8
def is_article_line(line: str) -> bool:
    """Return True when the article keyword occurs anywhere in *line*.

    This is a plain substring test against ``ARTICLE_KEYWORD``: the keyword
    matches at any position, including when it is embedded inside a longer
    word.
    """
    # str.find returns -1 only when the keyword is absent.
    return line.find(ARTICLE_KEYWORD) != -1
10
+
11
def extract_article_number(line: str) -> Optional[str]:
    """Extract the number that follows the article keyword in *line*.

    The keyword must appear as a whole word (``\\b`` on both sides). Any run
    of non-digit characters after it is skipped, and the first run of digits
    (ASCII or Arabic-Indic, U+0660-U+0669) is captured.

    Returns:
        The captured digits passed through ``normalize_digits`` (presumably
        converting them to a canonical digit form -- confirm in
        ``helpers.utils``), or ``None`` when the pattern does not match.
    """
    pattern = rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)"
    found = re.search(pattern, line)
    if found is None:
        return None
    raw_digits = found.group(1)
    return normalize_digits(raw_digits)
14
+
15
def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
    """Split a section's lines into leading content and a list of articles.

    Lines are scanned in order. Every line for which ``is_article_line``
    is true starts a new article; its number comes from
    ``extract_article_number`` (empty string when none is found). Lines seen
    before the first article are collected as the section content. Lines
    after an article line are appended to that article's text, except that a
    line already present in the current article's text is skipped (this
    also collapses repeated blank lines within one article).

    Args:
        lines: The section's lines, in document order.

    Returns:
        A dict with two keys:
        ``"content"``: the pre-article lines joined with newlines and
        stripped of surrounding whitespace;
        ``"articles"``: a list of ``{"number": str, "text": str}`` dicts in
        the order the articles were encountered.
    """
    preamble = []
    articles = []
    # bodies[i] holds the lines of articles[i]; joined once at the end so the
    # text is not rebuilt by repeated string concatenation.
    bodies = []
    # Newline-separated pieces already present in the current article's text.
    # Replaces re-splitting the whole accumulated text for every line.
    seen = set()

    for line in lines:
        if is_article_line(line):
            number = extract_article_number(line)
            articles.append({"number": number or "", "text": line})
            bodies.append([line])
            seen = set(line.split("\n"))
            continue

        if not articles:
            # No article has started yet: this belongs to the section intro.
            preamble.append(line)
        elif line not in seen:
            bodies[-1].append(line)
            # Track the pieces, not the raw line, so membership matches what
            # splitting the joined text on "\n" would produce.
            seen.update(line.split("\n"))

    for article, body in zip(articles, bodies):
        article["text"] = "\n".join(body)

    return {
        "content": "\n".join(preamble).strip(),
        "articles": articles,
    }