Mazenbs committed on
Commit
414bb93
·
verified ·
1 Parent(s): a9c7b87

Update parser/article_extractor.py

Browse files
Files changed (1) hide show
  1. parser/article_extractor.py +13 -30
parser/article_extractor.py CHANGED
@@ -1,36 +1,19 @@
1
- # parser/article_extractor.py
2
  import re
3
- from typing import List, Dict, Any, Optional
4
- from helpers.utils import normalize_digits
5
 
6
- ARTICLE_KEYWORD = "مادة"
7
-
8
- def is_article_line(line: str) -> bool:
9
- return ARTICLE_KEYWORD in line
10
-
11
- def extract_article_number(line: str) -> Optional[str]:
12
- m = re.search(rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)", line)
13
- return normalize_digits(m.group(1)) if m else None
14
-
15
- def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
16
- content = []
17
  articles = []
18
- current = None
19
 
20
- for line in lines:
21
- if is_article_line(line):
22
- num = extract_article_number(line)
23
- current = {"number": num or "", "text": line}
24
- articles.append(current)
25
- continue
26
-
27
- if current is None:
28
- content.append(line)
29
  else:
30
- if line not in current["text"].split("\n"):
31
- current["text"] += "\n" + line
 
 
32
 
33
- return {
34
- "content": "\n".join(content).strip(),
35
- "articles": articles
36
- }
 
 
1
  import re
 
 
2
 
3
def extract_articles(texts: list) -> list:
    """Group a flat sequence of text lines into numbered articles.

    A line is an article header when it starts (optionally after leading
    whitespace) with the Arabic keyword "مادة" ("article") followed by a
    parenthesised number, e.g. ``مادة (12)``. Every following non-header
    line belongs to that article until the next header.

    Args:
        texts: Text lines in document order.

    Returns:
        A list of ``{"number": str, "text": str}`` dicts in document order.
        ``number`` is always written with ASCII digits; ``text`` is the
        article's lines joined by single spaces. Lines that appear before
        the first header are not part of any article and are dropped.
    """
    # Local import keeps this block self-contained; only `re` is imported
    # at module level.
    import unicodedata

    # Compiled once, outside the loop. `\d` also matches Arabic-Indic digits.
    header_re = re.compile(r"\s*مادة\s*\((\d+)\)")

    articles = []
    number = None  # None until the first header has been seen
    parts = []

    for t in texts:
        m = header_re.match(t)
        if m is None:
            parts.append(t)
            continue

        # A new header closes the article collected so far (if any).
        if number is not None:
            articles.append({"number": number, "text": " ".join(parts)})

        # Normalise any Unicode decimal digits (e.g. "٢") to ASCII while
        # preserving leading zeros.
        number = "".join(str(unicodedata.decimal(ch)) for ch in m.group(1))

        # Keep body text that shares the line with the header instead of
        # discarding it.
        remainder = t[m.end():].strip()
        parts = [remainder] if remainder else []

    if number is not None:
        articles.append({"number": number, "text": " ".join(parts)})

    return articles