Spaces:
Build error
Build error
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
cha2json.py ── Convert a single CLAN .cha transcript to JSON
(with hardened %mor/%wor/%gra tier alignment).

Usage:
    # either edit the default paths below and run
    python3 cha2json.py
    # or pass the paths explicitly
    python3 cha2json.py --input /path/to/input.cha --output /path/to/output.json
"""
# ────────── Default paths: change these to your fixed locations ──────────
INPUT_CHA = "/workspace/SH001/vid_output/output.cha"
OUTPUT_JSON = "/workspace/SH001/website/aphasia_website/aphasia_env/Output.json"
# ─────────────────────────────────────────────────────────────────────────
| import re | |
| import json | |
| import sys | |
| import argparse | |
| from pathlib import Path | |
| from collections import defaultdict | |
# Tier prefixes that terminate the merging of multi-line %mor/%wor/%gra blocks.
TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
# First run of ASCII alphanumerics in a token (used by canonical()).
WORD_RE = re.compile(r"[A-Za-z0-9]+")
# Accept "patient" roles in @ID headers: PAR / PAR0 / PAR1 / ...
ID_PAR_RE = re.compile(r"\|PAR\d*\|")
# Accept utterance lines: *INV: or *PAR: / *PAR0: / *PAR1: / ...
# BUGFIX: was `PAR\d+`, which silently skipped plain "*PAR:" lines even though
# ID_PAR_RE accepts a digit-less PAR and TAG_PREFIXES contains bare "*PAR".
UTTER_RE = re.compile(r"^\*(INV|PAR\d*):")
# ────────── Synonym sets: inflected forms of one lemma count as a match ──────────
SYN_SETS = [
    {"be", "am", "is", "are", "was", "were", "been", "being"},
    {"have", "has", "had"},
    {"do", "does", "did", "done", "doing"},
    {"go", "goes", "going", "went", "gone"},
    {"run", "runs", "running", "ran"},
    {"see", "sees", "seeing", "saw", "seen"},
    {"get", "gets", "getting", "got", "gotten"},
    {"drop", "drops", "dropping", "dropped"},
    {"swim", "swims", "swimming", "swam", "swum"},
]


def same_syn(a: str, b: str) -> bool:
    """Return True when *a* and *b* are inflected forms of the same lemma."""
    if not a or not b:
        return False
    return any(a in forms and b in forms for forms in SYN_SETS)
def canonical(txt: str) -> str:
    """Reduce a token/word to a comparison key.

    Keeps only the text before the first ~ - & or |, extracts its first
    alphanumeric run, and lowercases it; returns "" when nothing remains.
    """
    head = re.split(r"[~\-\&|]", txt, maxsplit=1)[0]
    hit = re.search(r"[A-Za-z0-9]+", head)
    if hit is None:
        return ""
    return hit.group(0).lower()
def merge_multiline(block_lines):
    """Join continuation lines of multi-line %mor/%wor/%gra tiers.

    A line whose first non-blank character is '%' (and that contains ':')
    opens a tier; subsequent non-empty lines are appended to it with a
    single space. Anything else is passed through unchanged.
    """
    out = []
    pending = None  # tier line currently being accumulated
    for raw in block_lines:
        ln = raw.rstrip("\n").replace("\x15", "")  # drop CLAN bullet control chars
        opens_tier = ln.lstrip().startswith("%") and ":" in ln
        if opens_tier:
            if pending:
                out.append(pending)
            pending = ln
        elif pending and ln.strip():
            pending += " " + ln.strip()
        else:
            out.append(ln)
    if pending:
        out.append(pending)
    return "\n".join(out)
# ────────── Main conversion ──────────
def cha_to_json(lines):
    """Convert raw .cha lines into the output dictionary.

    Walks the file as a state machine between @Begin/@End markers, building
    one record per recording with dialogue turns, then attaching %mor
    (tokens + POS ids), %wor (word durations) and %gra (grammar triples)
    to the most recent turn.

    Returns a dict with keys:
      - "sentences": list of records
      - "pos_mapping" / "grammar_mapping": label → int id (1-based)
      - "aphasia_types": aphasia-type label → int id (0-based)
    """
    # pos/gra ids start at 1; aphasia ids start at 0. The lambdas read the
    # current len() of the map being filled, so every new key gets the next
    # sequential id on first access (a bare lookup registers a key).
    pos_map = defaultdict(lambda: len(pos_map) + 1)
    gra_map = defaultdict(lambda: len(gra_map) + 1)
    aphasia_map = defaultdict(lambda: len(aphasia_map))  # 0,1,2,...
    data = []
    sent = None  # record currently being built (None outside @Begin/@End)
    i = 0
    while i < len(lines):
        line = lines[i].rstrip("\n")
        # --- record start: keyed on @Begin (more semantic than @UTF8) ---
        if line.startswith("@Begin"):
            sent = {
                "sentence_id": f"S{len(data)+1}",
                "sentence_pid": None,
                "aphasia_type": None,  # becomes "UNKNOWN" if never set
                "dialogues": []        # [ { "INV": [...], "PAR": [...] }, ... ]
            }
            i += 1
            continue
        # --- record end: @End (kept as long as it has dialogue; a missing
        #     aphasia_type no longer causes the record to be dropped) ---
        if line.startswith("@End"):
            if sent and sent["dialogues"]:
                if not sent.get("aphasia_type"):
                    sent["aphasia_type"] = "UNKNOWN"
                    aphasia_map["UNKNOWN"]  # register an id for UNKNOWN
                data.append(sent)
            sent = None
            i += 1
            continue
        # --- record attributes ---
        if sent and line.startswith("@PID:"):
            parts = line.split("\t")
            if len(parts) > 1:
                sent["sentence_pid"] = parts[1].strip()
            i += 1
            continue
        if sent and line.startswith("@ID:"):
            # Is this @ID line the patient (PAR*)?
            if ID_PAR_RE.search(line):
                # Sample data carries no aphasia type → default to UNKNOWN
                # so the record is not discarded.
                aph = "UNKNOWN"
                # If a future @ID embeds the type, extract it here, e.g.:
                # m = re.search(r"WAB:([A-Za-z]+)", line)
                # if m: aph = m.group(1)
                aph = aph.upper()
                aphasia_map[aph]  # register the type (auto-numbered)
                sent["aphasia_type"] = aph
            i += 1
            continue
        # --- utterance lines: *INV: or *PAR0:/*PAR1: ---
        if sent and UTTER_RE.match(line):
            role_tag = UTTER_RE.match(line).group(1)
            role = "INV" if role_tag == "INV" else "PAR"
            if not sent["dialogues"]:
                sent["dialogues"].append({"INV": [], "PAR": []})
            # An INV line after the previous exchange already has PAR
            # starts a new interaction round.
            if role == "INV" and sent["dialogues"][-1]["PAR"]:
                sent["dialogues"].append({"INV": [], "PAR": []})
            # Create an empty turn container (filled in by %mor/%wor/%gra).
            sent["dialogues"][-1][role].append(
                {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []}
            )
            i += 1
            continue
        # --- %mor tier: tokens and POS ids ---
        if sent and line.startswith("%mor:"):
            blk = [line]
            i += 1
            # Collect continuation lines; stop at the next tag.
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1
            units = merge_multiline(blk).replace("%mor:", "").strip().split()
            toks, pos_ids = [], []
            for u in units:
                if "|" in u:
                    pos, rest = u.split("|", 1)
                    # rest may look like noun|dog-Acc → conservatively take
                    # the first |-segment as the word.
                    word = rest.split("|", 1)[0]
                    # Some words come out like propn|thefablecottagecom — keep as-is.
                    toks.append(word)
                    pos_ids.append(pos_map[pos])
            # Attach to the last turn of the current round (PAR preferred).
            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
            continue
        # --- %wor tier: word timings → per-token durations ---
        if sent and line.startswith("%wor:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1
            merged = merge_multiline(blk).replace("%wor:", "").strip()
            # After stripping \x15 the tier reads: word 0_583 word 583_1166 ...
            # Capture <word> <start>_<end> triples.
            raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
            wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]
            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            # Align %mor tokens against %wor words; duration = end - start.
            aligned = []
            j = 0  # forward-only cursor into wor (alignment never backtracks)
            for tok in tgt.get("tokens", []):
                c_tok = canonical(tok)
                match = None
                for k in range(j, len(wor)):
                    c_w = canonical(wor[k][0])
                    # Accept exact match, prefix either way, or same-lemma form.
                    if (
                        c_tok == c_w
                        or c_w.startswith(c_tok)
                        or c_tok.startswith(c_w)
                        or same_syn(c_tok, c_w)
                    ):
                        match = wor[k]
                        j = k + 1
                        break
                dur = (match[2] - match[1]) if match else 0  # 0 when unaligned
                aligned.append([tok, dur])
            tgt["word_durations"] = aligned
            continue
        # --- %gra tier: dependency triples ---
        if sent and line.startswith("%gra:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1
            units = merge_multiline(blk).replace("%gra:", "").strip().split()
            triples = []
            for u in units:
                # e.g. 1|2|DET → [dependent, head, relation-id]
                parts = u.split("|")
                if len(parts) == 3:
                    a, b, r = parts
                    if a.isdigit() and b.isdigit():
                        triples.append([int(a), int(b), gra_map[r]])
            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["word_grammar_ids"] = triples
            continue
        # Anything else → next line.
        i += 1
    # Flush trailing record (safety net for a file missing @End).
    if sent and sent["dialogues"]:
        if not sent.get("aphasia_type"):
            sent["aphasia_type"] = "UNKNOWN"
            aphasia_map["UNKNOWN"]
        data.append(sent)
    return {
        "sentences": data,
        "pos_mapping": dict(pos_map),
        "grammar_mapping": dict(gra_map),
        "aphasia_types": dict(aphasia_map),
    }
# ────────── Entry point ──────────
def parse_args():
    """Parse CLI options; paths default to the constants at the top of the file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", "-i", type=str, default=INPUT_CHA, help="輸入 .cha 檔")
    parser.add_argument("--output", "-o", type=str, default=OUTPUT_JSON, help="輸出 .json 檔")
    return parser.parse_args()
def main():
    """Entry point: read the input .cha, convert it, and write pretty JSON.

    Exits with an error message when the input file does not exist.
    Output is UTF-8 JSON with indent=4 and non-ASCII characters preserved.
    """
    args = parse_args()
    in_path = Path(args.input)
    out_path = Path(args.output)
    if not in_path.exists():
        sys.exit(f"❌ 找不到檔案: {in_path}")
    with in_path.open("r", encoding="utf-8") as fh:
        lines = fh.readlines()
    dataset = cha_to_json(lines)
    # Create the output directory tree if it is missing.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with out_path.open("w", encoding="utf-8") as fh:
        json.dump(dataset, fh, ensure_ascii=False, indent=4)
    print(f"✅ 轉換完成 → {out_path}(句數 {len(dataset['sentences'])},pos={len(dataset['pos_mapping'])},gra={len(dataset['grammar_mapping'])},類型鍵={list(dataset['aphasia_types'].keys())})")


if __name__ == "__main__":
    main()