Ellie5757575757 committed on
Commit
f869b0b
·
verified ·
1 Parent(s): 4393971

Update Cha_Json.py

Browse files
Files changed (1) hide show
  1. Cha_Json.py +97 -61
Cha_Json.py CHANGED
@@ -1,36 +1,34 @@
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
- cha2json.py ── 將單一 CLAN .cha 轉成 JSON(強化 %mor/%wor/%gra 對齊)
5
-
6
  用法:
7
- # 直接改上方預設路徑
8
- python3 cha2json.py
9
 
10
- # 參數
11
- python3 cha2json.py --input /path/to/input.cha --output /path/to/output.json
 
 
12
  """
13
 
14
- # ────────── 預設路徑:可改成你的固定路徑 ──────────
15
- INPUT_CHA = "/workspace/SH001/vid_output/output.cha"
16
- OUTPUT_JSON = "/workspace/SH001/website/aphasia_website/aphasia_env/Output.json"
17
- # ───────────────────────────────────────────────
18
-
19
  import re
20
  import json
21
  import sys
22
  import argparse
23
  from pathlib import Path
24
  from collections import defaultdict
 
25
 
26
- # 接受的標籤(用於多行 %mor/%wor/%gra 合併的停止條件
27
  TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
28
  WORD_RE = re.compile(r"[A-Za-z0-9]+")
29
 
30
- # 接受「病人角色:PAR / PAR0 / PAR1 / ...
31
  ID_PAR_RE = re.compile(r"\|PAR\d*\|")
32
 
33
- # 接受對話行:*INV: 或 *PAR0: / *PAR1: / ...
34
  UTTER_RE = re.compile(r"^\*(INV|PAR\d+):")
35
 
36
  # ────────── 同義集合(對齊時容忍形態變化) ──────────
@@ -46,7 +44,6 @@ SYN_SETS = [
46
  {"swim", "swims", "swimming", "swam", "swum"},
47
  ]
48
  def same_syn(a: str, b: str) -> bool:
49
- """同詞彙不同形態視為相同"""
50
  if not a or not b:
51
  return False
52
  for s in SYN_SETS:
@@ -60,14 +57,14 @@ def canonical(txt: str) -> str:
60
  m = WORD_RE.search(head)
61
  return m.group(0).lower() if m else ""
62
 
63
- def merge_multiline(block_lines):
64
  """
65
- 合併跨行 %mor/%wor/%gra。
66
  規則:以 '%' 開頭者作為起始,往下串,遇到新標籤或 @ 開頭就停。
67
  """
68
  merged, buf = [], None
69
  for raw in block_lines:
70
- ln = raw.rstrip("\n").replace("\x15", "") # 去掉 CLAN 的分隔控制字
71
  if ln.lstrip().startswith("%") and ":" in ln:
72
  if buf:
73
  merged.append(buf)
@@ -81,32 +78,42 @@ def merge_multiline(block_lines):
81
  merged.append(buf)
82
  return "\n".join(merged)
83
 
84
- # ────────── 主轉換 ──────────
85
- def cha_to_json(lines):
86
- # 映射以 1 起算(pos / gra),aphasia 類型讓 defaultdict 從 0 起也行
87
- pos_map = defaultdict(lambda: len(pos_map) + 1)
88
- gra_map = defaultdict(lambda: len(gra_map) + 1)
89
- aphasia_map = defaultdict(lambda: len(aphasia_map)) # 0,1,2,...
 
 
 
 
 
 
 
 
 
 
90
 
91
- data = []
92
- sent = None
93
- i = 0
94
 
 
95
  while i < len(lines):
96
  line = lines[i].rstrip("\n")
97
 
98
- # --- 啟段:用 @Begin(比 @UTF8 更語義化)---
99
  if line.startswith("@Begin"):
100
  sent = {
101
  "sentence_id": f"S{len(data)+1}",
102
  "sentence_pid": None,
103
- "aphasia_type": None, # 若最後仍沒有,就標 UNKNOWN
104
- "dialogues": [] # [ { "INV": [...], "PAR": [...] }, ... ]
105
  }
106
  i += 1
107
  continue
108
 
109
- # --- 結束:@End(只要有對話就收,不再卡 aphasia_type)---
110
  if line.startswith("@End"):
111
  if sent and sent["dialogues"]:
112
  if not sent.get("aphasia_type"):
@@ -117,7 +124,7 @@ def cha_to_json(lines):
117
  i += 1
118
  continue
119
 
120
- # --- 句子屬性 ---
121
  if sent and line.startswith("@PID:"):
122
  parts = line.split("\t")
123
  if len(parts) > 1:
@@ -128,9 +135,8 @@ def cha_to_json(lines):
128
  if sent and line.startswith("@ID:"):
129
  # 是否為病人那位 PAR*
130
  if ID_PAR_RE.search(line):
131
- # 你的範例沒有寫失語類型 → 先標 UNKNOWN,避免被丟棄
132
  aph = "UNKNOWN"
133
- # 若未來 @ID 有類型,可在此 regex 抓出來替換 aph
134
  # m = re.search(r"WAB:([A-Za-z]+)", line)
135
  # if m: aph = m.group(1)
136
  aph = aph.upper()
@@ -139,29 +145,27 @@ def cha_to_json(lines):
139
  i += 1
140
  continue
141
 
142
- # --- 對話行:*INV: 或 *PAR0:/PAR1: ---
143
  if sent and UTTER_RE.match(line):
144
  role_tag = UTTER_RE.match(line).group(1)
145
  role = "INV" if role_tag == "INV" else "PAR"
146
 
147
  if not sent["dialogues"]:
148
  sent["dialogues"].append({"INV": [], "PAR": []})
149
- # 新來的是 INV 上一輪已有 PAR,視為下一輪互動
150
  if role == "INV" and sent["dialogues"][-1]["PAR"]:
151
  sent["dialogues"].append({"INV": [], "PAR": []})
152
 
153
- # 一個空 turn 容器(之後 %mor/%wor/%gra 會補進來
154
  sent["dialogues"][-1][role].append(
155
- {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []}
156
  )
157
  i += 1
158
  continue
159
 
160
- # --- %mor ---
161
  if sent and line.startswith("%mor:"):
162
- blk = [line]
163
- i += 1
164
- # 收集跨行,遇到新標籤停
165
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
166
  blk.append(lines[i]); i += 1
167
 
@@ -170,36 +174,33 @@ def cha_to_json(lines):
170
  for u in units:
171
  if "|" in u:
172
  pos, rest = u.split("|", 1)
173
- # rest 可能像 noun|dog-Acc → 取第一段 'dog-Acc' 再切一次保守取第一個詞
174
  word = rest.split("|", 1)[0]
175
- # 有些詞會像 propn|thefablecottagecom,照收
176
  toks.append(word)
177
  pos_ids.append(pos_map[pos])
178
 
179
- # 放到當前輪的最後一個 turn
180
  dlg = sent["dialogues"][-1]
181
  tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
182
  tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
 
 
183
  continue
184
 
185
- # --- %wor ---
186
  if sent and line.startswith("%wor:"):
187
- blk = [line]
188
- i += 1
189
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
190
  blk.append(lines[i]); i += 1
191
 
192
  merged = merge_multiline(blk).replace("%wor:", "").strip()
193
- # 你的檔案在去掉 \x15 後會變成:word 0_583 word 583_1166 ...
194
- # 用這個 regex 抓:<word> <start>_<end>
195
  raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
196
  wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]
197
 
198
  dlg = sent["dialogues"][-1]
199
  tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
200
 
201
- # 嘗試將 %mor tokens 與 %wor 的 word alignduration = end - start
202
- aligned = []
203
  j = 0
204
  for tok in tgt.get("tokens", []):
205
  c_tok = canonical(tok)
@@ -220,10 +221,9 @@ def cha_to_json(lines):
220
  tgt["word_durations"] = aligned
221
  continue
222
 
223
- # --- %gra ---
224
  if sent and line.startswith("%gra:"):
225
- blk = [line]
226
- i += 1
227
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
228
  blk.append(lines[i]); i += 1
229
 
@@ -242,28 +242,60 @@ def cha_to_json(lines):
242
  tgt["word_grammar_ids"] = triples
243
  continue
244
 
245
- # 其他行 → 下一行
246
  i += 1
247
 
248
- # 收尾(保險:如果檔案意外沒 @End)
249
  if sent and sent["dialogues"]:
250
  if not sent.get("aphasia_type"):
251
  sent["aphasia_type"] = "UNKNOWN"
252
  aphasia_map["UNKNOWN"]
253
  data.append(sent)
254
 
 
 
 
 
 
 
 
 
 
255
  return {
256
  "sentences": data,
257
  "pos_mapping": dict(pos_map),
258
  "grammar_mapping": dict(gra_map),
259
  "aphasia_types": dict(aphasia_map),
 
260
  }
261
 
262
- # ────────── 執行 ──────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
263
  def parse_args():
264
  p = argparse.ArgumentParser()
265
- p.add_argument("--input", "-i", type=str, default=INPUT_CHA, help="輸入 .cha 檔")
266
- p.add_argument("--output", "-o", type=str, default=OUTPUT_JSON, help="輸出 .json 檔")
267
  return p.parse_args()
268
 
269
  def main():
@@ -283,7 +315,11 @@ def main():
283
  with out_path.open("w", encoding="utf-8") as fh:
284
  json.dump(dataset, fh, ensure_ascii=False, indent=4)
285
 
286
- print(f"✅ 轉換完成 → {out_path}(句數 {len(dataset['sentences'])},pos={len(dataset['pos_mapping'])},gra={len(dataset['grammar_mapping'])},類型鍵={list(dataset['aphasia_types'].keys())})")
 
 
 
 
287
 
288
  if __name__ == "__main__":
289
  main()
 
1
  #!/usr/bin/env python3
2
  # -*- coding: utf-8 -*-
3
  """
4
+ cha_json.py 將單一 CLAN .cha 轉成 JSON(強化 %mor/%wor/%gra 對齊)
 
5
  用法:
6
+ # CLI
7
+ python3 cha_json.py --input /path/to/input.cha --output /path/to/output.json
8
 
9
+ 程式化呼叫(供 pipeline 使用):
10
+ from cha_json import cha_to_json_file, cha_to_dict
11
+ out_path, data = cha_to_json_file("/path/in.cha", "/path/out.json")
12
+ data2 = cha_to_dict("/path/in.cha")
13
  """
14
 
15
+ from __future__ import annotations
 
 
 
 
16
  import re
17
  import json
18
  import sys
19
  import argparse
20
  from pathlib import Path
21
  from collections import defaultdict
22
+ from typing import List, Dict, Any, Tuple, Optional
23
 
24
+ # 接受的停止條件(用於 %mor/%wor/%gra 合併)
25
  TAG_PREFIXES = ("*PAR", "*INV", "%mor:", "%gra:", "%wor:", "@")
26
  WORD_RE = re.compile(r"[A-Za-z0-9]+")
27
 
28
+ # 病人角色:PAR / PAR0 / PAR1 / ...
29
  ID_PAR_RE = re.compile(r"\|PAR\d*\|")
30
 
31
+ # 對話行:*INV: 或 *PAR0: / *PAR1: / ...
32
  UTTER_RE = re.compile(r"^\*(INV|PAR\d+):")
33
 
34
  # ────────── 同義集合(對齊時容忍形態變化) ──────────
 
44
  {"swim", "swims", "swimming", "swam", "swum"},
45
  ]
46
  def same_syn(a: str, b: str) -> bool:
 
47
  if not a or not b:
48
  return False
49
  for s in SYN_SETS:
 
57
  m = WORD_RE.search(head)
58
  return m.group(0).lower() if m else ""
59
 
60
+ def merge_multiline(block_lines: List[str]) -> str:
61
  """
62
+ 合併跨行 %mor/%wor/%gra。
63
  規則:以 '%' 開頭者作為起始,往下串,遇到新標籤或 @ 開頭就停。
64
  """
65
  merged, buf = [], None
66
  for raw in block_lines:
67
+ ln = raw.rstrip("\n").replace("\x15", "") # 去掉 CLAN 控制字
68
  if ln.lstrip().startswith("%") and ":" in ln:
69
  if buf:
70
  merged.append(buf)
 
78
  merged.append(buf)
79
  return "\n".join(merged)
80
 
81
+ def cha_to_json(lines: List[str]) -> Dict[str, Any]:
82
+ """
83
+ .cha 檔行列表轉 JSON 結構。
84
+ 回傳格式:
85
+ {
86
+ "sentences": [...],
87
+ "pos_mapping": {...},
88
+ "grammar_mapping": {...},
89
+ "aphasia_types": {...},
90
+ "text_all": "..." # 方便下游模型使用的 PAR 合併文字
91
+ }
92
+ """
93
+ # 對應表(pos / gra 從 1 起算;aphasia 類型 0 起)
94
+ pos_map: Dict[str, int] = defaultdict(lambda: len(pos_map) + 1)
95
+ gra_map: Dict[str, int] = defaultdict(lambda: len(gra_map) + 1)
96
+ aphasia_map: Dict[str, int] = defaultdict(lambda: len(aphasia_map))
97
 
98
+ data: List[Dict[str, Any]] = []
99
+ sent: Optional[Dict[str, Any]] = None
 
100
 
101
+ i = 0
102
  while i < len(lines):
103
  line = lines[i].rstrip("\n")
104
 
105
+ # 啟段
106
  if line.startswith("@Begin"):
107
  sent = {
108
  "sentence_id": f"S{len(data)+1}",
109
  "sentence_pid": None,
110
+ "aphasia_type": None, # 若最後仍沒有,就標 UNKNOWN
111
+ "dialogues": [] # [ { "INV": [...], "PAR": [...] }, ... ]
112
  }
113
  i += 1
114
  continue
115
 
116
+ # 結束
117
  if line.startswith("@End"):
118
  if sent and sent["dialogues"]:
119
  if not sent.get("aphasia_type"):
 
124
  i += 1
125
  continue
126
 
127
+ # 句子屬性
128
  if sent and line.startswith("@PID:"):
129
  parts = line.split("\t")
130
  if len(parts) > 1:
 
135
  if sent and line.startswith("@ID:"):
136
  # 是否為病人那位 PAR*
137
  if ID_PAR_RE.search(line):
 
138
  aph = "UNKNOWN"
139
+ # 如果 @ID 有標註失語類型,可在此使用 regex 抓出來替換 aph
140
  # m = re.search(r"WAB:([A-Za-z]+)", line)
141
  # if m: aph = m.group(1)
142
  aph = aph.upper()
 
145
  i += 1
146
  continue
147
 
148
+ # 對話行:*INV: 或 *PARx:
149
  if sent and UTTER_RE.match(line):
150
  role_tag = UTTER_RE.match(line).group(1)
151
  role = "INV" if role_tag == "INV" else "PAR"
152
 
153
  if not sent["dialogues"]:
154
  sent["dialogues"].append({"INV": [], "PAR": []})
155
+ # 新輪對話:若來的是 INV 上一輪已有 PAR,視為下一輪
156
  if role == "INV" and sent["dialogues"][-1]["PAR"]:
157
  sent["dialogues"].append({"INV": [], "PAR": []})
158
 
159
+ # 新增一個空 turn(之後 %mor/%wor/%gra 會補)
160
  sent["dialogues"][-1][role].append(
161
+ {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": [], "utterance_text": ""}
162
  )
163
  i += 1
164
  continue
165
 
166
+ # %mor
167
  if sent and line.startswith("%mor:"):
168
+ blk = [line]; i += 1
 
 
169
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
170
  blk.append(lines[i]); i += 1
171
 
 
174
  for u in units:
175
  if "|" in u:
176
  pos, rest = u.split("|", 1)
 
177
  word = rest.split("|", 1)[0]
 
178
  toks.append(word)
179
  pos_ids.append(pos_map[pos])
180
 
 
181
  dlg = sent["dialogues"][-1]
182
  tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
183
  tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
184
+ # 也保存 plain text 供下游模型使用
185
+ tgt["utterance_text"] = " ".join(toks).strip()
186
  continue
187
 
188
+ # %wor
189
  if sent and line.startswith("%wor:"):
190
+ blk = [line]; i += 1
 
191
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
192
  blk.append(lines[i]); i += 1
193
 
194
  merged = merge_multiline(blk).replace("%wor:", "").strip()
195
+ # <word> <start>_<end>
 
196
  raw_pairs = re.findall(r"(\S+)\s+(\d+)_(\d+)", merged)
197
  wor = [(w, int(s), int(e)) for (w, s, e) in raw_pairs]
198
 
199
  dlg = sent["dialogues"][-1]
200
  tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
201
 
202
+ # %mor tokens 對齊,duration = end - start
203
+ aligned: List[Tuple[str, int]] = []
204
  j = 0
205
  for tok in tgt.get("tokens", []):
206
  c_tok = canonical(tok)
 
221
  tgt["word_durations"] = aligned
222
  continue
223
 
224
+ # %gra
225
  if sent and line.startswith("%gra:"):
226
+ blk = [line]; i += 1
 
227
  while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
228
  blk.append(lines[i]); i += 1
229
 
 
242
  tgt["word_grammar_ids"] = triples
243
  continue
244
 
245
+ # 其他行
246
  i += 1
247
 
248
+ # 收尾(檔案若意外沒 @End)
249
  if sent and sent["dialogues"]:
250
  if not sent.get("aphasia_type"):
251
  sent["aphasia_type"] = "UNKNOWN"
252
  aphasia_map["UNKNOWN"]
253
  data.append(sent)
254
 
255
+ # 建立 text_all:把所有 PAR utterance_text 串起來
256
+ par_texts: List[str] = []
257
+ for s in data:
258
+ for turn in s.get("dialogues", []):
259
+ for par_ut in turn.get("PAR", []):
260
+ if par_ut.get("utterance_text"):
261
+ par_texts.append(par_ut["utterance_text"])
262
+ text_all = "\n".join(par_texts).strip()
263
+
264
  return {
265
  "sentences": data,
266
  "pos_mapping": dict(pos_map),
267
  "grammar_mapping": dict(gra_map),
268
  "aphasia_types": dict(aphasia_map),
269
+ "text_all": text_all
270
  }
271
 
272
+ # ────────── 封裝:檔案 → dict / 檔案 → 檔案 ──────────
273
def cha_to_dict(cha_path: str) -> Dict[str, Any]:
    """Read a .cha file and return the parsed dict (nothing is written to disk).

    Args:
        cha_path: Path to the input CLAN .cha transcript.

    Returns:
        The dict produced by cha_to_json() for the file's lines.

    Raises:
        FileNotFoundError: If cha_path does not exist.
    """
    source = Path(cha_path)
    if not source.exists():
        raise FileNotFoundError(f"找不到檔案: {cha_path}")
    # Read the whole file, then split while keeping line endings —
    # equivalent to readlines() on a text-mode handle.
    text = source.read_text(encoding="utf-8")
    return cha_to_json(text.splitlines(keepends=True))
281
+
282
def cha_to_json_file(cha_path: str, output_json: Optional[str] = None) -> Tuple[str, Dict[str, Any]]:
    """Convert a .cha file to JSON and write the result to disk.

    Args:
        cha_path: Path to the input CLAN .cha transcript.
        output_json: Destination path; when omitted, the input path with a
            ``.json`` suffix is used instead.

    Returns:
        (output_json_path, data_dict)
    """
    parsed = cha_to_dict(cha_path)
    if output_json:
        destination = Path(output_json)
    else:
        destination = Path(cha_path).with_suffix(".json")
    # Make sure the parent directory exists before writing.
    destination.parent.mkdir(parents=True, exist_ok=True)
    with destination.open("w", encoding="utf-8") as fh:
        json.dump(parsed, fh, ensure_ascii=False, indent=4)
    return str(destination), parsed
293
+
294
+ # ────────── CLI ──────────
295
def parse_args():
    """Parse command-line arguments for the .cha → JSON converter.

    Returns:
        argparse.Namespace with ``input`` and ``output`` string attributes.
    """
    parser = argparse.ArgumentParser()
    option_specs = [
        (("--input", "-i"), "輸入 .cha 檔"),
        (("--output", "-o"), "輸出 .json 檔"),
    ]
    for flags, help_text in option_specs:
        parser.add_argument(*flags, type=str, required=True, help=help_text)
    return parser.parse_args()
300
 
301
  def main():
 
315
  with out_path.open("w", encoding="utf-8") as fh:
316
  json.dump(dataset, fh, ensure_ascii=False, indent=4)
317
 
318
+ print(
319
+ f"✅ 轉換完成 → {out_path}(句數 {len(dataset['sentences'])},"
320
+ f"pos={len(dataset['pos_mapping'])},gra={len(dataset['grammar_mapping'])},"
321
+ f"類型鍵={list(dataset['aphasia_types'].keys())})"
322
+ )
323
 
324
  if __name__ == "__main__":
325
  main()