Spaces:

Mazenbs
/

extract_html_full

Running

Mazenbs committed on Dec 3, 2025

Commit

414bb93

verified ·

1 Parent(s): a9c7b87

Update parser/article_extractor.py

Files changed (1) hide show

parser/article_extractor.py CHANGED Viewed

@@ -1,36 +1,19 @@
-# parser/article_extractor.py
 import re
-from typing import List, Dict, Any, Optional
-from helpers.utils import normalize_digits
-ARTICLE_KEYWORD = "مادة"
-def is_article_line(line: str) -> bool:
-    return ARTICLE_KEYWORD in line
-def extract_article_number(line: str) -> Optional[str]:
-    m = re.search(rf"\b{ARTICLE_KEYWORD}\b[^\d\u0660-\u0669]*([\d\u0660-\u0669]+)", line)
-    return normalize_digits(m.group(1)) if m else None
-def extract_articles_from_section_lines(lines: List[str]) -> Dict[str, Any]:
-    content = []
     articles = []
-    current = None
-    for line in lines:
-        if is_article_line(line):
-            num = extract_article_number(line)
-            current = {"number": num or "", "text": line}
-            articles.append(current)
-            continue
-        if current is None:
-            content.append(line)
         else:
-            if line not in current["text"].split("\n"):
-                current["text"] += "\n" + line
-    return {
-        "content": "\n".join(content).strip(),
-        "articles": articles
-    }

 import re
+def extract_articles(texts: list):
     articles = []
+    current = {"number": None, "text": ""}
+    for t in texts:
+        m = re.match(r"مادة\s*\((\d+)\)", t)
+        if m:
+            if current["number"]:
+                articles.append(current)
+            current = {"number": m.group(1), "text": ""}
         else:
+            current["text"] += " " + t
+    if current["number"]:
+        articles.append(current)
+    return articles