Spaces:
Running
Running
Update parser/article_extractor.py
Browse files- parser/article_extractor.py +13 -30
parser/article_extractor.py
CHANGED
|
@@ -1,36 +1,19 @@
|
|
| 1 |
-
# parser/article_extractor.py
|
| 2 |
import re
|
| 3 |
-
from typing import List, Dict, Any, Optional
|
| 4 |
-
from helpers.utils import normalize_digits
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
def is_article_line(line: str) -> bool:
    """Report whether ``line`` contains ``ARTICLE_KEYWORD`` as a substring."""
    contains_keyword = ARTICLE_KEYWORD in line
    return contains_keyword
|
| 10 |
-
|
| 11 |
-
def extract_article_number(line: str) -> Optional[str]:
    """Return the first digit run that follows ``ARTICLE_KEYWORD`` in ``line``.

    The keyword must appear as a whole word; any non-digit characters between
    it and the number are skipped. Both ASCII-style ``\\d`` digits and the
    Arabic-Indic range U+0660..U+0669 are accepted. The captured digits are
    passed through ``normalize_digits`` before being returned.

    Returns ``None`` when the line has no keyword followed by digits.
    """
    # NOTE(review): ARTICLE_KEYWORD is interpolated into the pattern without
    # re.escape -- fine for a plain word, confirm it never holds metacharacters.
    pattern = rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)"
    found = re.search(pattern, line)
    if found is None:
        return None
    # normalize_digits lives in helpers.utils; presumably it maps Arabic-Indic
    # digits to ASCII -- verify there.
    return normalize_digits(found.group(1))
|
| 14 |
-
|
| 15 |
-
def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
|
| 16 |
-
content = []
|
| 17 |
articles = []
|
| 18 |
-
current = None
|
| 19 |
|
| 20 |
-
for
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
current
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
if current is None:
|
| 28 |
-
content.append(line)
|
| 29 |
else:
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
|
| 33 |
-
return
|
| 34 |
-
"content": "\n".join(content).strip(),
|
| 35 |
-
"articles": articles
|
| 36 |
-
}
|
|
|
|
|
|
|
| 1 |
import re
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
def extract_articles(texts: list) -> list:
    """Group a flat sequence of text lines into numbered articles.

    A line opens a new article when it starts (optionally after leading
    whitespace) with the header ``مادة (N)``. Every following line up to the
    next header belongs to that article. Lines that appear before the first
    header are ignored.

    Args:
        texts: Iterable of strings, one entry per line/paragraph, in
            document order.

    Returns:
        A list of ``{"number": str, "text": str}`` dicts in document order.
        ``number`` is the digit string exactly as captured from the header
        (``\\d`` also matches non-ASCII decimal digits; they are returned
        unnormalized). ``text`` is the article body joined with single
        spaces, including any text that followed the header on the same
        line; it is ``""`` for an article with no body.
    """
    # Compiled once, outside the loop. The leading \s* lets indented headers
    # match, since Pattern.match anchors at the start of the string.
    header_re = re.compile(r"\s*مادة\s*\((\d+)\)")

    articles = []
    current_number = None  # None until the first header has been seen
    current_parts = []

    for t in texts:
        m = header_re.match(t)
        if m is None:
            # Body line: keep it only once an article is open, so preamble
            # text is dropped instead of being accumulated and discarded.
            if current_number is not None:
                current_parts.append(t)
            continue

        # A new header closes the article that was being collected.
        if current_number is not None:
            articles.append(
                {"number": current_number, "text": " ".join(current_parts)}
            )

        current_number = m.group(1)
        # Keep whatever follows the header on the same line; previously this
        # text was silently lost.
        remainder = t[m.end():].strip()
        current_parts = [remainder] if remainder else []

    if current_number is not None:
        articles.append(
            {"number": current_number, "text": " ".join(current_parts)}
        )

    return articles
|
|
|
|
|
|
|
|
|