Mazenbs committed on
Commit
20ee4ec
·
verified ·
1 Parent(s): c21cb88

Update parser/article_extractor.py

Browse files
Files changed (1) hide show
  1. parser/article_extractor.py +15 -18
parser/article_extractor.py CHANGED
@@ -1,39 +1,36 @@
 
1
  import re
2
  from typing import List, Dict, Any, Optional
3
  from helpers.utils import normalize_digits
4
 
5
  ARTICLE_KEYWORD = "مادة"
6
- ARTICLE_PATTERN = re.compile(
7
- r"^\s*(?:المادة|مادة)\s*[\(\[:\-–]?\s*([\d\u0660-\u0669]+)\s*[\)\]:\-–]?",
8
- re.UNICODE
9
- )
10
 
11
  def is_article_line(line: str) -> bool:
12
- return bool(ARTICLE_PATTERN.match(line.strip()))
13
 
14
  def extract_article_number(line: str) -> Optional[str]:
15
- m = ARTICLE_PATTERN.match(line.strip())
16
  return normalize_digits(m.group(1)) if m else None
17
 
18
  def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
 
19
  articles = []
20
- current_article = None
21
-
22
- for raw_line in lines:
23
- line = raw_line.strip()
24
- if not line:
25
- continue
26
 
 
27
  if is_article_line(line):
28
- number = extract_article_number(line)
29
- current_article = {"number": number or "", "text": line}
30
- articles.append(current_article)
31
  continue
32
 
33
- if current_article is not None:
34
- current_article["text"] += "\n" + line
 
 
 
35
 
36
  return {
37
- "content": "",
38
  "articles": articles
39
  }
 
1
+ # parser/article_extractor.py
2
  import re
3
  from typing import List, Dict, Any, Optional
4
  from helpers.utils import normalize_digits
5
 
6
  # Arabic word for "article"; a line containing it is treated as an article line.
  ARTICLE_KEYWORD = "مادة"
 
 
 
 
7
 
8
  def is_article_line(line: str) -> bool:
9
+ return ARTICLE_KEYWORD in line
10
 
11
  def extract_article_number(line: str) -> Optional[str]:
12
+ m = re.search(rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)", line)
13
  return normalize_digits(m.group(1)) if m else None
14
 
15
  def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
16
+ content = []
17
  articles = []
18
+ current = None
 
 
 
 
 
19
 
20
+ for line in lines:
21
  if is_article_line(line):
22
+ num = extract_article_number(line)
23
+ current = {"number": num or "", "text": line}
24
+ articles.append(current)
25
  continue
26
 
27
+ if current is None:
28
+ content.append(line)
29
+ else:
30
+ if line not in current["text"].split("\n"):
31
+ current["text"] += "\n" + line
32
 
33
  return {
34
+ "content": "\n".join(content).strip(),
35
  "articles": articles
36
  }