Ellie5757575757 committed on
Commit
85d56c8
·
verified ·
1 Parent(s): 223013e

Upload Cha_Json.py

Browse files
Files changed (1) hide show
  1. Cha_Json.py +289 -0
Cha_Json.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+ cha2json.py ── 將單一 CLAN .cha 轉成 JSON(強化 %mor/%wor/%gra 對齊)
5
+
6
+ 用法:
7
+ # 直接改上方預設路徑
8
+ python3 cha2json.py
9
+
10
+ # 或用參數
11
+ python3 cha2json.py --input /path/to/input.cha --output /path/to/output.json
12
+ """
13
+
14
+ # ────────── 預設路徑:可改成你的固定路徑 ──────────
15
+ INPUT_CHA = "/workspace/SH001/vid_output/output.cha"
16
+ OUTPUT_JSON = "/workspace/SH001/website/aphasia_website/aphasia_env/Output.json"
17
+ # ───────────────────────────────────────────────
18
+
19
+ import re
20
+ import json
21
+ import sys
22
+ import argparse
23
+ from pathlib import Path
24
+ from collections import defaultdict
25
+
26
# Tier/utterance prefixes that terminate a multi-line %mor/%wor/%gra block
# (used as the stop condition when collecting continuation lines).
TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
WORD_RE = re.compile(r"[A-Za-z0-9]+")

# Patient role in @ID headers: PAR / PAR0 / PAR1 / ...
ID_PAR_RE = re.compile(r"\|PAR\d*\|")

# Utterance lines: *INV: plus *PAR: / *PAR0: / *PAR1: / ...
# Fix: use \d* (not \d+) so the plain "*PAR:" speaker line — the common CHAT
# form, and the one ID_PAR_RE already accepts — is not silently dropped.
UTTER_RE = re.compile(r"^\*(INV|PAR\d*):")
35
+
36
# ────────── Synonym sets (tolerate inflectional variants during alignment) ──────────
SYN_SETS = [
    {"be", "am", "is", "are", "was", "were", "been", "being"},
    {"have", "has", "had"},
    {"do", "does", "did", "done", "doing"},
    {"go", "goes", "going", "went", "gone"},
    {"run", "runs", "running", "ran"},
    {"see", "sees", "seeing", "saw", "seen"},
    {"get", "gets", "getting", "got", "gotten"},
    {"drop", "drops", "dropping", "dropped"},
    {"swim", "swims", "swimming", "swam", "swum"},
]

# Reverse index word -> set position, built once so membership is O(1).
# same_syn is called inside the nested %mor/%wor alignment loop, where the
# original linear scan over SYN_SETS was repeated per token pair.
_SYN_INDEX = {word: idx for idx, syn_set in enumerate(SYN_SETS) for word in syn_set}

def same_syn(a: str, b: str) -> bool:
    """Return True if *a* and *b* are inflected forms of the same lemma.

    Both arguments must be non-empty and belong to the same set in
    ``SYN_SETS``; anything else (including two identical words that are
    not listed) returns False, matching the original scan semantics.
    """
    if not a or not b:
        return False
    idx_a = _SYN_INDEX.get(a)
    return idx_a is not None and idx_a == _SYN_INDEX.get(b)
56
+
57
def canonical(txt: str) -> str:
    """Reduce a token/word to a lowercase comparison key.

    Only the part of *txt* before the first ``~``, ``-``, ``&`` or ``|``
    marker is considered; its first alphanumeric run is returned in
    lowercase, or "" when no such run exists.
    """
    head = txt
    for pos, ch in enumerate(txt):
        if ch in "~-&|":
            head = txt[:pos]
            break
    found = re.search(r"[A-Za-z0-9]+", head)
    return "" if found is None else found.group(0).lower()
62
+
63
def merge_multiline(block_lines):
    """Merge continuation lines of a multi-line %mor/%wor/%gra block.

    A line whose stripped form starts with '%' and contains ':' opens a new
    tier; subsequent non-blank lines are folded onto it (space-joined).
    Blank lines, and lines seen before any tier line, pass through as-is.
    CLAN's \\x15 bullet control characters are removed everywhere.
    Returns the merged lines joined with newlines.
    """
    out = []
    current = None
    for raw in block_lines:
        text = raw.rstrip("\n").replace("\x15", "")
        opens_tier = text.lstrip().startswith("%") and ":" in text
        if opens_tier:
            if current:
                out.append(current)
            current = text
            continue
        if current and text.strip():
            current = current + " " + text.strip()
        else:
            out.append(text)
    if current:
        out.append(current)
    return "\n".join(out)
83
+
84
# ────────── Main conversion ──────────
def cha_to_json(lines):
    """Convert the lines of a CLAN .cha transcript into a JSON-ready dict.

    Parameters:
        lines: raw text lines (newline-terminated) of the .cha file.

    Returns:
        dict with keys:
            "sentences":       list of per-@Begin/@End session records, each
                               {"sentence_id", "sentence_pid", "aphasia_type",
                                "dialogues": [{"INV": [...], "PAR": [...]}, ...]}
            "pos_mapping":     part-of-speech tag -> 1-based id
            "grammar_mapping": %gra relation label -> 1-based id
            "aphasia_types":   aphasia type label -> 0-based id
    """
    # pos/gra ids start at 1; aphasia-type ids start at 0 — all three are
    # auto-assigned on first lookup via the defaultdict factories.
    pos_map = defaultdict(lambda: len(pos_map) + 1)
    gra_map = defaultdict(lambda: len(gra_map) + 1)
    aphasia_map = defaultdict(lambda: len(aphasia_map))  # 0,1,2,...

    data = []
    sent = None
    i = 0

    while i < len(lines):
        line = lines[i].rstrip("\n")

        # --- Session start: keyed on @Begin (more semantic than @UTF8) ---
        if line.startswith("@Begin"):
            sent = {
                "sentence_id": f"S{len(data)+1}",
                "sentence_pid": None,
                "aphasia_type": None,   # if still unset at the end, mark UNKNOWN
                "dialogues": []         # [ { "INV": [...], "PAR": [...] }, ... ]
            }
            i += 1
            continue

        # --- Session end: @End. Keep any session that has dialogue;
        #     a missing aphasia_type no longer discards it. ---
        if line.startswith("@End"):
            if sent and sent["dialogues"]:
                if not sent.get("aphasia_type"):
                    sent["aphasia_type"] = "UNKNOWN"
                    aphasia_map["UNKNOWN"]  # register UNKNOWN in the id map
                data.append(sent)
            sent = None
            i += 1
            continue

        # --- Session attributes ---
        if sent and line.startswith("@PID:"):
            parts = line.split("\t")
            if len(parts) > 1:
                sent["sentence_pid"] = parts[1].strip()
            i += 1
            continue

        if sent and line.startswith("@ID:"):
            # Is this @ID header the patient (PAR*)?
            if ID_PAR_RE.search(line):
                # The sample files carry no aphasia type → default to UNKNOWN
                # so the session is not dropped.
                aph = "UNKNOWN"
                # If a future @ID embeds the type, extract it here, e.g.:
                # m = re.search(r"WAB:([A-Za-z]+)", line)
                # if m: aph = m.group(1)
                aph = aph.upper()
                aphasia_map[aph]  # register in the map (auto-numbered)
                sent["aphasia_type"] = aph
            i += 1
            continue

        # --- Utterance lines: *INV: or *PAR0:/*PAR1: ---
        if sent and UTTER_RE.match(line):
            role_tag = UTTER_RE.match(line).group(1)
            role = "INV" if role_tag == "INV" else "PAR"

            if not sent["dialogues"]:
                sent["dialogues"].append({"INV": [], "PAR": []})
            # An INV turn arriving after the current round already has PAR
            # speech starts a new interaction round.
            if role == "INV" and sent["dialogues"][-1]["PAR"]:
                sent["dialogues"].append({"INV": [], "PAR": []})

            # Create an empty turn container (%mor/%wor/%gra fill it in later).
            sent["dialogues"][-1][role].append(
                {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []}
            )
            i += 1
            continue

        # --- %mor tier: tokens + POS ids ---
        if sent and line.startswith("%mor:"):
            blk = [line]
            i += 1
            # Collect continuation lines; stop at the next known tag.
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            units = merge_multiline(blk).replace("%mor:", "").strip().split()
            toks, pos_ids = [], []
            for u in units:
                if "|" in u:
                    pos, rest = u.split("|", 1)
                    # rest may look like noun|dog-Acc → conservatively keep the
                    # first segment ('dog-Acc' here).
                    word = rest.split("|", 1)[0]
                    # Some entries look like propn|thefablecottagecom — keep as is.
                    toks.append(word)
                    pos_ids.append(pos_map[pos])

            # Attach to the most recent turn of the current round.
            # NOTE(review): assumes a tier always directly follows its
            # utterance and that within a round PAR speaks after INV; a %mor
            # arriving with no prior utterance would raise IndexError — confirm
            # inputs always satisfy this.
            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
            continue

        # --- %wor tier: word timings → durations aligned to %mor tokens ---
        if sent and line.startswith("%wor:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            merged = merge_multiline(blk).replace("%wor:", "").strip()
            # After stripping \x15 the tier reads: word 0_583 word 583_1166 ...
            # Capture <word> <start>_<end> pairs with this regex.
            raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
            wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]

            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]

            # Greedy left-to-right alignment of %mor tokens to %wor words,
            # tolerating prefix matches and synonym-set variants;
            # duration = end - start, or 0 when no match is found.
            # NOTE(review): when canonical(tok) == "", c_w.startswith("") is
            # True for every word, so an empty token consumes the next %wor
            # entry — confirm this is intended.
            aligned = []
            j = 0
            for tok in tgt.get("tokens", []):
                c_tok = canonical(tok)
                match = None
                for k in range(j, len(wor)):
                    c_w = canonical(wor[k][0])
                    if (
                        c_tok == c_w
                        or c_w.startswith(c_tok)
                        or c_tok.startswith(c_w)
                        or same_syn(c_tok, c_w)
                    ):
                        match = wor[k]
                        j = k + 1
                        break
                dur = (match[2] - match[1]) if match else 0
                aligned.append([tok, dur])
            tgt["word_durations"] = aligned
            continue

        # --- %gra tier: dependency triples ---
        if sent and line.startswith("%gra:"):
            blk = [line]
            i += 1
            while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
                blk.append(lines[i]); i += 1

            units = merge_multiline(blk).replace("%gra:", "").strip().split()
            triples = []
            for u in units:
                # e.g. 1|2|DET → [dependent index, head index, relation id]
                parts = u.split("|")
                if len(parts) == 3:
                    a, b, r = parts
                    if a.isdigit() and b.isdigit():
                        triples.append([int(a), int(b), gra_map[r]])

            dlg = sent["dialogues"][-1]
            tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
            tgt["word_grammar_ids"] = triples
            continue

        # Anything else → advance to the next line.
        i += 1

    # Tail guard: keep the last session even if the file lacks a final @End.
    if sent and sent["dialogues"]:
        if not sent.get("aphasia_type"):
            sent["aphasia_type"] = "UNKNOWN"
            aphasia_map["UNKNOWN"]
        data.append(sent)

    return {
        "sentences": data,
        "pos_mapping": dict(pos_map),
        "grammar_mapping": dict(gra_map),
        "aphasia_types": dict(aphasia_map),
    }
261
+
262
+ # ────────── 執行 ──────────
263
def parse_args(argv=None):
    """Parse command-line options for the converter.

    Parameters:
        argv: optional list of argument strings to parse; ``None`` (the
              default) falls back to ``sys.argv[1:]``, preserving the
              original CLI behaviour while allowing programmatic use/tests.

    Returns:
        argparse.Namespace with ``input`` and ``output`` attributes,
        defaulting to the module-level INPUT_CHA / OUTPUT_JSON paths.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--input", "-i", type=str, default=INPUT_CHA, help="輸入 .cha 檔")
    p.add_argument("--output", "-o", type=str, default=OUTPUT_JSON, help="輸出 .json 檔")
    return p.parse_args(argv)
268
+
269
def main():
    """Entry point: read the input .cha file, convert it, write the JSON."""
    opts = parse_args()
    src = Path(opts.input)
    dst = Path(opts.output)

    # Fail fast with a readable message when the input file is missing.
    if not src.exists():
        sys.exit(f"❌ 找不到檔案: {src}")

    with src.open("r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    result = cha_to_json(raw_lines)

    # Ensure the destination directory exists before writing.
    dst.parent.mkdir(parents=True, exist_ok=True)
    with dst.open("w", encoding="utf-8") as handle:
        json.dump(result, handle, ensure_ascii=False, indent=4)

    print(
        f"✅ 轉換完成 → {dst}"
        f"(句數 {len(result['sentences'])},"
        f"pos={len(result['pos_mapping'])},"
        f"gra={len(result['grammar_mapping'])},"
        f"類型鍵={list(result['aphasia_types'].keys())})"
    )

if __name__ == "__main__":
    main()