Aphasia_Classificifier

Sleeping

App Files Community

Ellie5757575757 committed on Aug 8, 2025

Commit

f869b0b

verified ·

1 Parent(s): 4393971

Update Cha_Json.py

Browse files

Files changed (1) hide show

Cha_Json.py +97 -61

Cha_Json.py CHANGED Viewed

@@ -1,36 +1,34 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
-cha2json.py ── 將單一 CLAN .cha 轉成 JSON（強化 %mor/%wor/%gra 對齊）
 用法：
-    # 直接改上方預設路徑
-    python3 cha2json.py
-    # 或用參數
-    python3 cha2json.py --input /path/to/input.cha --output /path/to/output.json
 """
-# ────────── 預設路徑：可改成你的固定路徑 ──────────
-INPUT_CHA   = "/workspace/SH001/vid_output/output.cha"
-OUTPUT_JSON = "/workspace/SH001/website/aphasia_website/aphasia_env/Output.json"
-# ───────────────────────────────────────────────
 import re
 import json
 import sys
 import argparse
 from pathlib import Path
 from collections import defaultdict
-# 接受的斷行標籤（用於多行 %mor/%wor/%gra 合併的停止條件）
 TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
 WORD_RE      = re.compile(r"[A-Za-z0-9]+")
-# 接受「病人」角色：PAR / PAR0 / PAR1 / ...
 ID_PAR_RE = re.compile(r"\|PAR\d*\|")
-# 接受對話行：*INV: 或 *PAR0: / *PAR1: / ...
 UTTER_RE = re.compile(r"^\*(INV|PAR\d+):")
 # ────────── 同義集合（對齊時容忍形態變化） ──────────
@@ -46,7 +44,6 @@ SYN_SETS = [
     {"swim", "swims", "swimming", "swam", "swum"},
 ]
 def same_syn(a: str, b: str) -> bool:
-    """同詞彙不同形態視為相同"""
     if not a or not b:
         return False
     for s in SYN_SETS:
@@ -60,14 +57,14 @@ def canonical(txt: str) -> str:
     m = WORD_RE.search(head)
     return m.group(0).lower() if m else ""
-def merge_multiline(block_lines):
     """
-    合併跨行 %mor/%wor/%gra。
     規則：以 '%' 開頭者作為起始，往下串，遇到新標籤或 @ 開頭就停。
     """
     merged, buf = [], None
     for raw in block_lines:
-        ln = raw.rstrip("\n").replace("\x15", "")  # 去掉 CLAN 的分隔控制字
         if ln.lstrip().startswith("%") and ":" in ln:
             if buf:
                 merged.append(buf)
@@ -81,32 +78,42 @@ def merge_multiline(block_lines):
         merged.append(buf)
     return "\n".join(merged)
-# ────────── 主轉換 ──────────
-def cha_to_json(lines):
-    # 映射以 1 起算（pos / gra），aphasia 類型讓 defaultdict 從 0 起也行
-    pos_map     = defaultdict(lambda: len(pos_map)     + 1)
-    gra_map     = defaultdict(lambda: len(gra_map)     + 1)
-    aphasia_map = defaultdict(lambda: len(aphasia_map))  # 0,1,2,...
-    data = []
-    sent = None
-    i = 0
     while i < len(lines):
         line = lines[i].rstrip("\n")
-        # --- 啟段：用 @Begin（比 @UTF8 更語義化）---
         if line.startswith("@Begin"):
             sent = {
                 "sentence_id": f"S{len(data)+1}",
                 "sentence_pid": None,
-                "aphasia_type": None,     # 若最後仍沒有，就標 UNKNOWN
-                "dialogues": []           # [ { "INV": [...], "PAR": [...] }, ... ]
             }
             i += 1
             continue
-        # --- 結束：@End（只要有對話就收，不再卡 aphasia_type）---
         if line.startswith("@End"):
             if sent and sent["dialogues"]:
                 if not sent.get("aphasia_type"):
@@ -117,7 +124,7 @@ def cha_to_json(lines):
             i += 1
             continue
-        # --- 句子屬性 ---
         if sent and line.startswith("@PID:"):
             parts = line.split("\t")
             if len(parts) > 1:
@@ -128,9 +135,8 @@ def cha_to_json(lines):
         if sent and line.startswith("@ID:"):
             # 是否為病人那位 PAR*
             if ID_PAR_RE.search(line):
-                # 你的範例沒有寫失語類型 → 先標 UNKNOWN，避免被丟棄
                 aph = "UNKNOWN"
-                # 若未來 @ID 有藏類型，可在此寫 regex 抓出來替換 aph
                 # m = re.search(r"WAB:([A-Za-z]+)", line)
                 # if m: aph = m.group(1)
                 aph = aph.upper()
@@ -139,29 +145,27 @@ def cha_to_json(lines):
             i += 1
             continue
-        # --- 對話行：*INV: 或 *PAR0:/PAR1: ---
         if sent and UTTER_RE.match(line):
             role_tag = UTTER_RE.match(line).group(1)
             role = "INV" if role_tag == "INV" else "PAR"
             if not sent["dialogues"]:
                 sent["dialogues"].append({"INV": [], "PAR": []})
-            # 若新來的是 INV 而上一輪已有 PAR，視為下一輪互動
             if role == "INV" and sent["dialogues"][-1]["PAR"]:
                 sent["dialogues"].append({"INV": [], "PAR": []})
-            # 建一個空的 turn 容器（之後 %mor/%wor/%gra 會補進來）
             sent["dialogues"][-1][role].append(
-                {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []}
             )
             i += 1
             continue
-        # --- %mor ---
         if sent and line.startswith("%mor:"):
-            blk = [line]
-            i += 1
-            # 收集跨行，遇到新標籤停
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
@@ -170,36 +174,33 @@ def cha_to_json(lines):
             for u in units:
                 if "|" in u:
                     pos, rest = u.split("|", 1)
-                    # rest 可能像 noun|dog-Acc → 取第一段 'dog-Acc' 再切一次保守取第一個詞
                     word = rest.split("|", 1)[0]
-                    # 有些詞會像 propn|thefablecottagecom，照收
                     toks.append(word)
                     pos_ids.append(pos_map[pos])
-            # 放到當前輪的最後一個 turn
             dlg = sent["dialogues"][-1]
             tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
             tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
             continue
-        # --- %wor ---
         if sent and line.startswith("%wor:"):
-            blk = [line]
-            i += 1
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
             merged = merge_multiline(blk).replace("%wor:", "").strip()
-            # 你的檔案在去掉 \x15 後會變成：word 0_583 word 583_1166 ...
-            # 用這個 regex 抓：<word> <start>_<end>
             raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
             wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]
             dlg = sent["dialogues"][-1]
             tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
-            # 嘗試將 %mor 的 tokens 與 %wor 的 word align，取 duration = end - start
-            aligned = []
             j = 0
             for tok in tgt.get("tokens", []):
                 c_tok = canonical(tok)
@@ -220,10 +221,9 @@ def cha_to_json(lines):
             tgt["word_durations"] = aligned
             continue
-        # --- %gra ---
         if sent and line.startswith("%gra:"):
-            blk = [line]
-            i += 1
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
@@ -242,28 +242,60 @@ def cha_to_json(lines):
             tgt["word_grammar_ids"] = triples
             continue
-        # 其他行 → 下一行
         i += 1
-    # 收尾（保險：如果檔案意外沒 @End）
     if sent and sent["dialogues"]:
         if not sent.get("aphasia_type"):
             sent["aphasia_type"] = "UNKNOWN"
             aphasia_map["UNKNOWN"]
         data.append(sent)
     return {
         "sentences": data,
         "pos_mapping": dict(pos_map),
         "grammar_mapping": dict(gra_map),
         "aphasia_types": dict(aphasia_map),
     }
-# ────────── 執行 ──────────
 def parse_args():
     p = argparse.ArgumentParser()
-    p.add_argument("--input", "-i", type=str, default=INPUT_CHA, help="輸入 .cha 檔")
-    p.add_argument("--output", "-o", type=str, default=OUTPUT_JSON, help="輸出 .json 檔")
     return p.parse_args()
 def main():
@@ -283,7 +315,11 @@ def main():
     with out_path.open("w", encoding="utf-8") as fh:
         json.dump(dataset, fh, ensure_ascii=False, indent=4)
-    print(f"✅ 轉換完成 → {out_path}（句數 {len(dataset['sentences'])}，pos={len(dataset['pos_mapping'])}，gra={len(dataset['grammar_mapping'])}，類型鍵={list(dataset['aphasia_types'].keys())}）")
 if __name__ == "__main__":
     main()

 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 """
+cha_json.py — 將單一 CLAN .cha 轉成 JSON（強化 %mor/%wor/%gra 對齊）
 用法：
+    # CLI
+    python3 cha_json.py --input /path/to/input.cha --output /path/to/output.json
+程式化呼叫（供 pipeline 使用）：
+    from cha_json import cha_to_json_file, cha_to_dict
+    out_path, data = cha_to_json_file("/path/in.cha", "/path/out.json")
+    data2 = cha_to_dict("/path/in.cha")
 """
+from __future__ import annotations
 import re
 import json
 import sys
 import argparse
 from pathlib import Path
 from collections import defaultdict
+from typing import List, Dict, Any, Tuple, Optional
+# 可接受的跨行停止條件（用於 %mor/%wor/%gra 合併）
 TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
 WORD_RE      = re.compile(r"[A-Za-z0-9]+")
+# 病人角色：PAR / PAR0 / PAR1 / ...
 ID_PAR_RE = re.compile(r"\|PAR\d*\|")
+# 對話行：*INV: 或 *PAR0: / *PAR1: / ...
 UTTER_RE = re.compile(r"^\*(INV|PAR\d+):")
 # ────────── 同義集合（對齊時容忍形態變化） ──────────
     {"swim", "swims", "swimming", "swam", "swum"},
 ]
 def same_syn(a: str, b: str) -> bool:
     if not a or not b:
         return False
     for s in SYN_SETS:
     m = WORD_RE.search(head)
     return m.group(0).lower() if m else ""
+def merge_multiline(block_lines: List[str]) -> str:
     """
+    合併跨行的 %mor/%wor/%gra。
     規則：以 '%' 開頭者作為起始，往下串，遇到新標籤或 @ 開頭就停。
     """
     merged, buf = [], None
     for raw in block_lines:
+        ln = raw.rstrip("\n").replace("\x15", "")  # 去掉 CLAN 控制字
         if ln.lstrip().startswith("%") and ":" in ln:
             if buf:
                 merged.append(buf)
         merged.append(buf)
     return "\n".join(merged)
+def cha_to_json(lines: List[str]) -> Dict[str, Any]:
+    """
+    將 .cha 檔行列表轉 JSON 結構。
+    回傳格式：
+    {
+      "sentences": [...],
+      "pos_mapping": {...},
+      "grammar_mapping": {...},
+      "aphasia_types": {...},
+      "text_all": "..."        # 方便下游模型使用的 PAR 合併文字
+    }
+    """
+    # 對應表（pos / gra 從 1 起算；aphasia 類型 0 起）
+    pos_map: Dict[str, int]     = defaultdict(lambda: len(pos_map) + 1)
+    gra_map: Dict[str, int]     = defaultdict(lambda: len(gra_map) + 1)
+    aphasia_map: Dict[str, int] = defaultdict(lambda: len(aphasia_map))
+    data: List[Dict[str, Any]] = []
+    sent: Optional[Dict[str, Any]] = None
+    i = 0
     while i < len(lines):
         line = lines[i].rstrip("\n")
+        # 啟段
         if line.startswith("@Begin"):
             sent = {
                 "sentence_id": f"S{len(data)+1}",
                 "sentence_pid": None,
+                "aphasia_type": None,   # 若最後仍沒有，就標 UNKNOWN
+                "dialogues": []         # [ { "INV": [...], "PAR": [...] }, ... ]
             }
             i += 1
             continue
+        # 結束
         if line.startswith("@End"):
             if sent and sent["dialogues"]:
                 if not sent.get("aphasia_type"):
             i += 1
             continue
+        # 句子屬性
         if sent and line.startswith("@PID:"):
             parts = line.split("\t")
             if len(parts) > 1:
         if sent and line.startswith("@ID:"):
             # 是否為病人那位 PAR*
             if ID_PAR_RE.search(line):
                 aph = "UNKNOWN"
+                # 如果 @ID 有標註失語類型，可在此使用 regex 抓出來並替換 aph
                 # m = re.search(r"WAB:([A-Za-z]+)", line)
                 # if m: aph = m.group(1)
                 aph = aph.upper()
             i += 1
             continue
+        # 對話行：*INV: 或 *PARx:
         if sent and UTTER_RE.match(line):
             role_tag = UTTER_RE.match(line).group(1)
             role = "INV" if role_tag == "INV" else "PAR"
             if not sent["dialogues"]:
                 sent["dialogues"].append({"INV": [], "PAR": []})
+            # 新輪對話：若來的是 INV 且上一輪已有 PAR，視為下一輪
             if role == "INV" and sent["dialogues"][-1]["PAR"]:
                 sent["dialogues"].append({"INV": [], "PAR": []})
+            # 新增一個空 turn（之後 %mor/%wor/%gra 會補）
             sent["dialogues"][-1][role].append(
+                {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": [], "utterance_text": ""}
             )
             i += 1
             continue
+        # %mor
         if sent and line.startswith("%mor:"):
+            blk = [line]; i += 1
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
             for u in units:
                 if "|" in u:
                     pos, rest = u.split("|", 1)
                     word = rest.split("|", 1)[0]
                     toks.append(word)
                     pos_ids.append(pos_map[pos])
             dlg = sent["dialogues"][-1]
             tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
             tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
+            # 也保存 plain text 供下游模型使用
+            tgt["utterance_text"] = " ".join(toks).strip()
             continue
+        # %wor
         if sent and line.startswith("%wor:"):
+            blk = [line]; i += 1
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
             merged = merge_multiline(blk).replace("%wor:", "").strip()
+            # 抓 <word> <start>_<end>
             raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
             wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]
             dlg = sent["dialogues"][-1]
             tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
+            # 與 %mor tokens 對齊，duration = end - start
+            aligned: List[Tuple[str, int]] = []
             j = 0
             for tok in tgt.get("tokens", []):
                 c_tok = canonical(tok)
             tgt["word_durations"] = aligned
             continue
+        # %gra
         if sent and line.startswith("%gra:"):
+            blk = [line]; i += 1
             while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                 blk.append(lines[i]); i += 1
             tgt["word_grammar_ids"] = triples
             continue
+        # 其他行
         i += 1
+    # 收尾（檔案若意外沒 @End）
     if sent and sent["dialogues"]:
         if not sent.get("aphasia_type"):
             sent["aphasia_type"] = "UNKNOWN"
             aphasia_map["UNKNOWN"]
         data.append(sent)
+    # 建立 text_all：把所有 PAR utterance_text 串起來
+    par_texts: List[str] = []
+    for s in data:
+        for turn in s.get("dialogues", []):
+            for par_ut in turn.get("PAR", []):
+                if par_ut.get("utterance_text"):
+                    par_texts.append(par_ut["utterance_text"])
+    text_all = "\n".join(par_texts).strip()
     return {
         "sentences": data,
         "pos_mapping": dict(pos_map),
         "grammar_mapping": dict(gra_map),
         "aphasia_types": dict(aphasia_map),
+        "text_all": text_all
     }
+# ────────── 封裝：檔案 → dict / 檔案 → 檔案 ──────────
+def cha_to_dict(cha_path: str) -> Dict[str, Any]:
+    """讀取 .cha 檔並回傳 dict（不寫檔）。"""
+    p = Path(cha_path)
+    if not p.exists():
+        raise FileNotFoundError(f"找不到檔案: {cha_path}")
+    with p.open("r", encoding="utf-8") as fh:
+        lines = fh.readlines()
+    return cha_to_json(lines)
+def cha_to_json_file(cha_path: str, output_json: Optional[str] = None) -> Tuple[str, Dict[str, Any]]:
+    """
+    將 .cha 轉成 JSON 並寫檔。
+    回傳：(output_json_path, data_dict)
+    """
+    data = cha_to_dict(cha_path)
+    out_path = Path(output_json) if output_json else Path(cha_path).with_suffix(".json")
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    with out_path.open("w", encoding="utf-8") as fh:
+        json.dump(data, fh, ensure_ascii=False, indent=4)
+    return str(out_path), data
+# ────────── CLI ──────────
 def parse_args():
     p = argparse.ArgumentParser()
+    p.add_argument("--input", "-i", type=str, required=True, help="輸入 .cha 檔")
+    p.add_argument("--output", "-o", type=str, required=True, help="輸出 .json 檔")
     return p.parse_args()
 def main():
     with out_path.open("w", encoding="utf-8") as fh:
         json.dump(dataset, fh, ensure_ascii=False, indent=4)
+    print(
+        f"✅ 轉換完成 → {out_path}（句數 {len(dataset['sentences'])}，"
+        f"pos={len(dataset['pos_mapping'])}，gra={len(dataset['grammar_mapping'])}，"
+        f"類型鍵={list(dataset['aphasia_types'].keys())}）"
+    )
 if __name__ == "__main__":
     main()