Mazenbs committed on
Commit
ac8768a
·
verified ·
1 Parent(s): eca2f80

Update parser/assembler.py

Browse files
Files changed (1) hide show
  1. parser/assembler.py +35 -4
parser/assembler.py CHANGED
@@ -1,8 +1,40 @@
 
1
  from typing import List, Dict
2
  from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
3
 
4
 
5
- def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[Dict[str, str]]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  title = ""
7
  preamble_blocks = []
8
 
@@ -10,7 +42,7 @@ def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[
10
  while blocks:
11
  block = blocks.pop(0)
12
  t = block.get("text", "").strip()
13
- if t.lower() != "html" and t != "":
14
  title = t
15
  break
16
 
@@ -21,9 +53,8 @@ def extract_title_and_preamble(blocks: List[Dict[str, str]]) -> (str, str, List[
21
  break
22
  preamble_blocks.append(blocks.pop(0))
23
 
24
- remaining_blocks = blocks
25
  preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
26
- return title, preamble, remaining_blocks
27
 
28
 
29
  def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]:
 
1
+ # parser/assembler.py
2
  from typing import List, Dict
3
  from helpers.utils import normalize_digits, extract_article_number, is_article, is_section
4
 
5
 
6
+
7
+
8
+
9
+ def extract_title_and_preamble(
10
+ blocks: List[Dict[str, str]],
11
+ *,
12
+ default_title: Optional[str] = None,
13
+ default_preamble: Optional[str] = None
14
+ ) -> Tuple[str, str, List[Dict[str, str]]]:
15
+ """
16
+ تعمل على قائمة من الكتل ويمكن أن تحتوي كل كتلة على مفتاح type مسبقاً:
17
+ type = "title" | "preamble" | "body"
18
+ إذا وُجد type يُستخدم مباشرة، وإلا تُستخدم الطريقة التقليدية.
19
+ يمكن أيضاً تمرير عنوان أو مقدمة افتراضية لتُستخدم بدلاً من الاستخراج التلقائي.
20
+ """
21
+ # إذا أُرسلت قيم افتراضية نستخدمها فوراً
22
+ if default_title is not None and default_preamble is not None:
23
+ # نفصل كتل body فقط (أي شيء لا يُعتبر title أو preamble)
24
+ body_blocks = [b for b in blocks if b.get("type") != "title" and b.get("type") != "preamble"]
25
+ return default_title, default_preamble, body_blocks
26
+
27
+ # هل القائمة تحتوي على حقل type مُحدد مسبقاً؟
28
+ if any(b.get("type") in {"title", "preamble", "body"} for b in blocks):
29
+ title_blocks = [b for b in blocks if b.get("type") == "title"]
30
+ preamble_blocks = [b for b in blocks if b.get("type") == "preamble"]
31
+ body_blocks = [b for b in blocks if b.get("type") == "body"]
32
+
33
+ title = "\n".join([b["text"].strip() for b in title_blocks]).strip()
34
+ preamble = "\n".join([b["text"].strip() for b in preamble_blocks]).strip()
35
+ return title, preamble, body_blocks
36
+
37
+ # الطريقة التقليدية (المنطق القديم دون تغيير)
38
  title = ""
39
  preamble_blocks = []
40
 
 
42
  while blocks:
43
  block = blocks.pop(0)
44
  t = block.get("text", "").strip()
45
+ if t.lower() != "html" and t:
46
  title = t
47
  break
48
 
 
53
  break
54
  preamble_blocks.append(blocks.pop(0))
55
 
 
56
  preamble = "\n".join([b["text"] for b in preamble_blocks]).strip()
57
+ return title, preamble, blocks
58
 
59
 
60
  def extract_articles_from_blocks(blocks: List[Dict[str, str]]) -> List[Dict]: