Ellie5757575757 committed on
Commit
85d56c8
·
verified ·
1 Parent(s): 223013e

Upload Cha_Json.py

Browse files
Files changed (1) hide show
  1. Cha_Json.py +289 -0
Cha_Json.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ cha2json.py ── 將單一 CLAN .cha 轉成 JSON(強化 %mor/%wor/%gra 對齊)
5
+
6
+ 用法:
7
+ # 直接改上方預設路徑
8
+ python3 cha2json.py
9
+
10
+ # 或用參數
11
+ python3 cha2json.py --input /path/to/input.cha --output /path/to/output.json
12
+ """
13
+
14
+ # ────────── 預設路徑:可改成你的固定路徑 ──────────
15
+ INPUT_CHA = "/workspace/SH001/vid_output/output.cha"
16
+ OUTPUT_JSON = "/workspace/SH001/website/aphasia_website/aphasia_env/Output.json"
17
+ # ───────────────────────────────────────────────
18
+
19
+ import re
20
+ import json
21
+ import sys
22
+ import argparse
23
+ from pathlib import Path
24
+ from collections import defaultdict
25
+
26
# Tier/utterance prefixes that terminate a multi-line %mor/%wor/%gra block
# (used as the stop condition when collecting continuation lines).
TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
WORD_RE = re.compile(r"[A-Za-z0-9]+")

# Patient role in @ID headers: PAR / PAR0 / PAR1 / ...
ID_PAR_RE = re.compile(r"\|PAR\d*\|")

# Utterance lines: *INV: plus *PAR: / *PAR0: / *PAR1: / ...
# Fix: use \d* (not \d+) so the plain "*PAR:" speaker line — the common CHAT
# form, and the one ID_PAR_RE already accepts — is not silently dropped.
UTTER_RE = re.compile(r"^\*(INV|PAR\d*):")
35
+
36
# ────────── Synonym sets (tolerate inflectional variants during alignment) ──────────
SYN_SETS = [
    {"be", "am", "is", "are", "was", "were", "been", "being"},
    {"have", "has", "had"},
    {"do", "does", "did", "done", "doing"},
    {"go", "goes", "going", "went", "gone"},
    {"run", "runs", "running", "ran"},
    {"see", "sees", "seeing", "saw", "seen"},
    {"get", "gets", "getting", "got", "gotten"},
    {"drop", "drops", "dropping", "dropped"},
    {"swim", "swims", "swimming", "swam", "swum"},
]

# Reverse index word -> set position, built once so membership is O(1).
# same_syn is called inside the nested %mor/%wor alignment loop, where the
# original linear scan over SYN_SETS was repeated per token pair.
_SYN_INDEX = {word: idx for idx, syn_set in enumerate(SYN_SETS) for word in syn_set}

def same_syn(a: str, b: str) -> bool:
    """Return True if *a* and *b* are inflected forms of the same lemma.

    Both arguments must be non-empty and belong to the same set in
    ``SYN_SETS``; anything else (including two identical words that are
    not listed) returns False, matching the original scan semantics.
    """
    if not a or not b:
        return False
    idx_a = _SYN_INDEX.get(a)
    return idx_a is not None and idx_a == _SYN_INDEX.get(b)
56
+
57
def canonical(txt: str) -> str:
    """Reduce a token/word to a lowercase comparison key.

    Only the part of *txt* before the first ``~``, ``-``, ``&`` or ``|``
    marker is considered; its first alphanumeric run is returned in
    lowercase, or "" when no such run exists.
    """
    head = txt
    for pos, ch in enumerate(txt):
        if ch in "~-&|":
            head = txt[:pos]
            break
    found = re.search(r"[A-Za-z0-9]+", head)
    return "" if found is None else found.group(0).lower()
62
+
63
def merge_multiline(block_lines):
    """Merge continuation lines of a multi-line %mor/%wor/%gra block.

    A line whose stripped form starts with '%' and contains ':' opens a new
    tier; subsequent non-blank lines are folded onto it (space-joined).
    Blank lines, and lines seen before any tier line, pass through as-is.
    CLAN's \\x15 bullet control characters are removed everywhere.
    Returns the merged lines joined with newlines.
    """
    out = []
    current = None
    for raw in block_lines:
        text = raw.rstrip("\n").replace("\x15", "")
        opens_tier = text.lstrip().startswith("%") and ":" in text
        if opens_tier:
            if current:
                out.append(current)
            current = text
            continue
        if current and text.strip():
            current = current + " " + text.strip()
        else:
            out.append(text)
    if current:
        out.append(current)
    return "\n".join(out)
83
+
84
# ────────── Main conversion ──────────
def cha_to_json(lines):
    """Convert the lines of a CLAN .cha transcript into a JSON-ready dict.

    Parameters:
        lines: raw text lines (newline-terminated) of the .cha file.

    Returns:
        dict with keys:
            "sentences":       list of per-@Begin/@End session records, each
                               {"sentence_id", "sentence_pid", "aphasia_type",
                                "dialogues": [{"INV": [...], "PAR": [...]}, ...]}
            "pos_mapping":     part-of-speech tag -> 1-based id
            "grammar_mapping": %gra relation label -> 1-based id
            "aphasia_types":   aphasia type label -> 0-based id
    """
    # pos/gra ids start at 1; aphasia-type ids start at 0 — all three are
    # auto-assigned on first lookup via the defaultdict factories.
    pos_map = defaultdict(lambda: len(pos_map) + 1)
    gra_map = defaultdict(lambda: len(gra_map) + 1)
    aphasia_map = defaultdict(lambda: len(aphasia_map))  # 0,1,2,...

    data = []
    sent = None
    i = 0

    while i < len(lines):
        line = lines[i].rstrip("\n")

        # --- Session start: keyed on @Begin (more semantic than @UTF8) ---
        if line.startswith("@Begin"):
            sent = {
                "sentence_id": f"S{len(data)+1}",
                "sentence_pid": None,
                "aphasia_type": None,   # if still unset at the end, mark UNKNOWN
                "dialogues": []         # [ { "INV": [...], "PAR": [...] }, ... ]
            }
            i += 1
            continue

        # --- Session end: @End. Keep any session that has dialogue;
        #     a missing aphasia_type no longer discards it. ---
        if line.startswith("@End"):
            if sent and sent["dialogues"]:
                if not sent.get("aphasia_type"):
                    sent["aphasia_type"] = "UNKNOWN"
                    aphasia_map["UNKNOWN"]  # register UNKNOWN in the id map
                data.append(sent)
            sent = None
            i += 1
            continue

        # --- Session attributes ---
        if sent and line.startswith("@PID:"):
            parts = line.split("\t")
            if len(parts) > 1:
                sent["sentence_pid"] = parts[1].strip()
            i += 1
            continue

        if sent and line.startswith("@ID:"):
            # Is this @ID header the patient (PAR*)?
            if ID_PAR_RE.search(line):
                # The sample files carry no aphasia type → default to UNKNOWN
                # so the session is not dropped.
                aph = "UNKNOWN"
                # If a future @ID embeds the type, extract it here, e.g.:
                # m = re.search(r"WAB:([A-Za-z]+)", line)
                # if m: aph = m.group(1)
                aph = aph.upper()
                aphasia_map[aph]  # register in the map (auto-numbered)
                sent["aphasia_type"] = aph
            i += 1
            continue

        # --- Utterance lines: *INV: or *PAR0:/*PAR1: ---
        if sent and UTTER_RE.match(line):
            role_tag = UTTER_RE.match(line).group(1)
            role = "INV" if role_tag == "INV" else "PAR"

            if not sent["dialogues"]:
                sent["dialogues"].append({"INV": [], "PAR": []})
            # An INV turn arriving after the current round already has PAR
            # speech starts a new interaction round.
            if role == "INV" and sent["dialogues"][-1]["PAR"]:
                sent["dialogues"].append({"INV": [], "PAR": []})

            # Create an empty turn container (%mor/%wor/%gra fill it in later).
            sent["dialogues"][-1][role].append(
                {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []}
            )
            i += 1
            continue

        # --- %mor tier: tokens + POS ids ---
        if sent and line.startswith("%mor:"):
            blk = [line]
            i += 1
            # Collect continuation lines; stop at the next known tag.
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            units = merge_multiline(blk).replace("%mor:", "").strip().split()
            toks, pos_ids = [], []
            for u in units:
                if "|" in u:
                    pos, rest = u.split("|", 1)
                    # rest may look like noun|dog-Acc → conservatively keep the
                    # first segment ('dog-Acc' here).
                    word = rest.split("|", 1)[0]
                    # Some entries look like propn|thefablecottagecom — keep as is.
                    toks.append(word)
                    pos_ids.append(pos_map[pos])

            # Attach to the most recent turn of the current round.
            # NOTE(review): assumes a tier always directly follows its
            # utterance and that within a round PAR speaks after INV; a %mor
            # arriving with no prior utterance would raise IndexError — confirm
            # inputs always satisfy this.
            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
            continue

        # --- %wor tier: word timings → durations aligned to %mor tokens ---
        if sent and line.startswith("%wor:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            merged = merge_multiline(blk).replace("%wor:", "").strip()
            # After stripping \x15 the tier reads: word 0_583 word 583_1166 ...
            # Capture <word> <start>_<end> pairs with this regex.
            raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
            wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]

            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]

            # Greedy left-to-right alignment of %mor tokens to %wor words,
            # tolerating prefix matches and synonym-set variants;
            # duration = end - start, or 0 when no match is found.
            # NOTE(review): when canonical(tok) == "", c_w.startswith("") is
            # True for every word, so an empty token consumes the next %wor
            # entry — confirm this is intended.
            aligned = []
            j = 0
            for tok in tgt.get("tokens", []):
                c_tok = canonical(tok)
                match = None
                for k in range(j, len(wor)):
                    c_w = canonical(wor[k][0])
                    if (
                        c_tok == c_w
                        or c_w.startswith(c_tok)
                        or c_tok.startswith(c_w)
                        or same_syn(c_tok, c_w)
                    ):
                        match = wor[k]
                        j = k + 1
                        break
                dur = (match[2] - match[1]) if match else 0
                aligned.append([tok, dur])
            tgt["word_durations"] = aligned
            continue

        # --- %gra tier: dependency triples ---
        if sent and line.startswith("%gra:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            units = merge_multiline(blk).replace("%gra:", "").strip().split()
            triples = []
            for u in units:
                # e.g. 1|2|DET → [dependent index, head index, relation id]
                parts = u.split("|")
                if len(parts) == 3:
                    a, b, r = parts
                    if a.isdigit() and b.isdigit():
                        triples.append([int(a), int(b), gra_map[r]])

            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["word_grammar_ids"] = triples
            continue

        # Anything else → advance to the next line.
        i += 1

    # Tail guard: keep the last session even if the file lacks a final @End.
    if sent and sent["dialogues"]:
        if not sent.get("aphasia_type"):
            sent["aphasia_type"] = "UNKNOWN"
            aphasia_map["UNKNOWN"]
        data.append(sent)

    return {
        "sentences": data,
        "pos_mapping": dict(pos_map),
        "grammar_mapping": dict(gra_map),
        "aphasia_types": dict(aphasia_map),
    }
261
+
262
+ # ────────── 執行 ──────────
263
def parse_args(argv=None):
    """Parse command-line options for the converter.

    Parameters:
        argv: optional list of argument strings to parse; ``None`` (the
              default) falls back to ``sys.argv[1:]``, preserving the
              original CLI behaviour while allowing programmatic use/tests.

    Returns:
        argparse.Namespace with ``input`` and ``output`` attributes,
        defaulting to the module-level INPUT_CHA / OUTPUT_JSON paths.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--input", "-i", type=str, default=INPUT_CHA, help="輸入 .cha 檔")
    p.add_argument("--output", "-o", type=str, default=OUTPUT_JSON, help="輸出 .json 檔")
    return p.parse_args(argv)
268
+
269
def main():
    """Entry point: read the input .cha file, convert it, write the JSON."""
    opts = parse_args()
    src = Path(opts.input)
    dst = Path(opts.output)

    # Fail fast with a readable message when the input file is missing.
    if not src.exists():
        sys.exit(f"❌ 找不到檔案: {src}")

    with src.open("r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    result = cha_to_json(raw_lines)

    # Ensure the destination directory exists before writing.
    dst.parent.mkdir(parents=True, exist_ok=True)
    with dst.open("w", encoding="utf-8") as handle:
        json.dump(result, handle, ensure_ascii=False, indent=4)

    print(
        f"✅ 轉換完成 → {dst}"
        f"(句數 {len(result['sentences'])},"
        f"pos={len(result['pos_mapping'])},"
        f"gra={len(result['grammar_mapping'])},"
        f"類型鍵={list(result['aphasia_types'].keys())})"
    )

if __name__ == "__main__":
    main()