Mazenbs committed on
Commit
3f58b85
·
verified ·
1 Parent(s): 651c48c

Update parser/article_extractor.py

Browse files
Files changed (1) hide show
  1. parser/article_extractor.py +27 -15
parser/article_extractor.py CHANGED
@@ -1,36 +1,48 @@
1
- # parser/article_extractor.py
2
  import re
3
  from typing import List, Dict, Any, Optional
4
  from helpers.utils import normalize_digits
5
 
6
  ARTICLE_KEYWORD = "مادة"
7
 
 
 
 
 
 
 
8
  def is_article_line(line: str) -> bool:
9
- return ARTICLE_KEYWORD in line
10
 
11
  def extract_article_number(line: str) -> Optional[str]:
12
- m = re.search(rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)", line)
13
  return normalize_digits(m.group(1)) if m else None
14
 
15
  def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
16
- content = []
17
  articles = []
18
- current = None
 
 
 
 
 
19
 
20
- for line in lines:
21
  if is_article_line(line):
22
- num = extract_article_number(line)
23
- current = {"number": num or "", "text": line}
24
- articles.append(current)
 
 
 
 
 
25
  continue
26
 
27
- if current is None:
28
- content.append(line)
29
- else:
30
- if line not in current["text"].split("\n"):
31
- current["text"] += "\n" + line
32
 
33
  return {
34
- "content": "\n".join(content).strip(),
35
  "articles": articles
36
  }
 
 
1
  import re
2
  from typing import List, Dict, Any, Optional
3
  from helpers.utils import normalize_digits
4
 
5
  ARTICLE_KEYWORD = "مادة"
6
 
7
+ # نمط أكثر قوة للتعرف على بداية المادة
8
+ ARTICLE_PATTERN = re.compile(
9
+ r"^\s*(?:المادة|مادة)\s*[\(\[:\-–]?\s*([\d\u0660-\u0669]+)\s*[\)\]:\-–]?",
10
+ re.UNICODE
11
+ )
12
+
13
  def is_article_line(line: str) -> bool:
14
+ return bool(ARTICLE_PATTERN.match(line.strip()))
15
 
16
  def extract_article_number(line: str) -> Optional[str]:
17
+ m = ARTICLE_PATTERN.match(line.strip())
18
  return normalize_digits(m.group(1)) if m else None
19
 
20
  def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
21
+ content_before_first_article = []
22
  articles = []
23
+ current_article = None
24
+
25
+ for raw_line in lines:
26
+ line = raw_line.strip()
27
+ if not line:
28
+ continue # تجاهل الأسطر الفارغة
29
 
30
+ # هل هذا بداية مادة جديدة؟
31
  if is_article_line(line):
32
+ number = extract_article_number(line)
33
+ current_article = {"number": number or "", "text": line}
34
+ articles.append(current_article)
35
+ continue
36
+
37
+ # إذا لم نصل بعد إلى أول مادة → نضع النص في المحتوى العام
38
+ if current_article is None:
39
+ content_before_first_article.append(line)
40
  continue
41
 
42
+ # إضافة باقي نص المادة
43
+ current_article["text"] += "\n" + line
 
 
 
44
 
45
  return {
46
+ "content": "\n".join(content_before_first_article).strip(),
47
  "articles": articles
48
  }