Ellie5757575757 committed on
Commit
223013e
·
verified ·
1 Parent(s): 762e584

Delete Cha_Json.py

Browse files
Files changed (1) hide show
  1. Cha_Json.py +0 -181
Cha_Json.py DELETED
@@ -1,181 +0,0 @@
1
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
cha2json.py -- Convert a single CLAN .cha transcript to JSON
(with reinforced %mor/%wor alignment).

Usage:
    $ python3 cha2json.py
"""

# ---------- Change these two lines to your own fixed paths ----------
INPUT_CHA = "/workspace/SH001/website/ACWT01a(4).cha"
OUTPUT_JSON = "/workspace/SH001/website/Output.json"
# --------------------------------------------------------------------

import re, json, sys
from pathlib import Path
from collections import defaultdict

# Tier prefixes that start a new record line; a line beginning with any of
# these terminates a wrapped %mor/%wor/%gra continuation block.
TAG_PREFIXES = ("*PAR:", "*INV:", "%mor:", "%gra:", "%wor:", "@")
# First alphanumeric run of a token, used as its comparison key.
WORD_RE = re.compile(r"[A-Za-z0-9]+")
21
-
22
# ---------- Synonym sets (speed up %mor/%wor alignment) ----------
SYN_SETS = [
    {"be", "am", "is", "are", "was", "were"},
    {"have", "has", "had"},
    {"do", "does", "did"},
    {"go", "going", "went", "gone"},
]

def same_syn(a, b):
    """Return True when a and b are inflected forms of the same lexeme."""
    # Two strings count as "the same" if any one synonym set holds both.
    for forms in SYN_SETS:
        if a in forms and b in forms:
            return True
    return False
31
-
32
def canonical(txt):
    """Reduce a token/word to a lowercase string used for comparison."""
    # Keep only the part before the first clitic/compound separator
    # (~, -, & or |), then extract its first alphanumeric run.
    head = re.split(r"[~\-\&|]", txt, 1)[0]
    found = re.search(r"[A-Za-z0-9]+", head)
    if found:
        return found.group(0).lower()
    return ""
36
-
37
# ---------- Merge wrapped %mor/%wor/%gra lines ----------
def merge_multiline(block):
    """Join wrapped %-tier continuation lines back onto their tier line."""
    result = []
    pending = None  # the %-tier line currently absorbing continuations
    for raw in block:
        # Drop the trailing newline and CLAN's \x15 time-alignment markers.
        line = raw.rstrip("\n").replace("\x15", "")
        if line.lstrip().startswith("%") and ":" in line:
            # New tier line: flush whatever was being assembled first.
            if pending:
                result.append(pending)
            pending = line
        elif pending and line.strip():
            # Non-empty continuation of the current %-tier line.
            pending += " " + line.strip()
        else:
            result.append(line)
    if pending:
        result.append(pending)
    return "\n".join(result)
49
-
50
- # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ ไธป่ฝ‰ๆ› โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
51
- def cha_to_json(lines):
52
- pos_map = defaultdict(lambda: len(pos_map) + 1)
53
- gra_map = defaultdict(lambda: len(gra_map) + 1)
54
- aphasia_map = defaultdict(lambda: len(aphasia_map))
55
-
56
- data, sent, i = [], None, 0
57
- while i < len(lines):
58
- line = lines[i]
59
-
60
- # --- ๆจ™้ ญ / ็ตๅฐพ ---
61
- if line.startswith("@UTF8"):
62
- sent = {"sentence_id": f"S{len(data)+1}",
63
- "sentence_pid": None,
64
- "aphasia_type": None,
65
- "dialogues": []}
66
- i += 1; continue
67
- if line.startswith("@End"):
68
- if sent and sent["aphasia_type"] and sent["dialogues"]:
69
- data.append(sent)
70
- sent = None; i += 1; continue
71
-
72
- # --- ๅฅๅญๅฑฌๆ€ง ---
73
- if sent and line.startswith("@PID:"):
74
- parts = line.split("\t")
75
- if len(parts) > 1:
76
- sent["sentence_pid"] = parts[1].strip()
77
- i += 1; continue
78
- if sent and line.startswith("@ID:") and "|PAR|" in line:
79
- aph = line.split("|")[5].strip().upper()
80
- aphasia_map[aph]
81
- sent["aphasia_type"] = aph
82
- i += 1; continue
83
-
84
- # --- ๅฐ่ฉฑ่กŒ ---
85
- if sent and (line.startswith("*INV:") or line.startswith("*PAR:")):
86
- role = "INV" if line.startswith("*INV:") else "PAR"
87
- if not sent["dialogues"]:
88
- sent["dialogues"].append({"INV": [], "PAR": []})
89
- if role == "INV" and sent["dialogues"][-1]["PAR"]:
90
- sent["dialogues"].append({"INV": [], "PAR": []})
91
- sent["dialogues"][-1][role].append(
92
- {"tokens": [], "word_pos_ids": [], "word_grammar_ids": [], "word_durations": []})
93
- i += 1; continue
94
-
95
- # --- %mor ---
96
- if sent and line.startswith("%mor:"):
97
- blk = [line]; i += 1
98
- while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
99
- blk.append(lines[i]); i += 1
100
- units = merge_multiline(blk).replace("%mor:", "").strip().split()
101
-
102
- toks, pos_ids = [], []
103
- for u in units:
104
- if "|" in u:
105
- pos, rest = u.split("|", 1)
106
- toks.append(rest.split("|", 1)[0])
107
- pos_ids.append(pos_map[pos])
108
-
109
- dlg = sent["dialogues"][-1]
110
- tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
111
- tgt["tokens"], tgt["word_pos_ids"] = toks, pos_ids
112
- continue
113
-
114
- # --- %wor ---
115
- if sent and line.startswith("%wor:"):
116
- blk = [line]; i += 1
117
- while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
118
- blk.append(lines[i]); i += 1
119
- merged = merge_multiline(blk).replace("%wor:", "").strip()
120
- raw = re.findall(r"(\S+)\s+(\d+)\D+(\d+)", merged)
121
- wor = [(w, int(e)-int(s)) for w,s,e in raw]
122
-
123
- dlg = sent["dialogues"][-1]
124
- tgt = dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1]
125
-
126
- aligned, j = [], 0
127
- for tok in tgt["tokens"]:
128
- c_tok = canonical(tok); match = None
129
- for k in range(j, len(wor)):
130
- c_w = canonical(wor[k][0])
131
- if (c_tok == c_w or c_w.startswith(c_tok) or c_tok.startswith(c_w)
132
- or same_syn(c_tok, c_w)):
133
- match = wor[k]; j = k+1; break
134
- aligned.append([tok, match[1] if match else 0])
135
- tgt["word_durations"] = aligned
136
- continue
137
-
138
- # --- %gra ---
139
- if sent and line.startswith("%gra:"):
140
- blk = [line]; i += 1
141
- while i < len(lines) and not lines[i].lstrip().startswith(TAG_PREFIXES):
142
- blk.append(lines[i]); i += 1
143
- units = merge_multiline(blk).replace("%gra:", "").strip().split()
144
-
145
- triples = []
146
- for u in units:
147
- a,b,r = u.split("|")
148
- if a.isdigit() and b.isdigit():
149
- triples.append([int(a), int(b), gra_map[r]])
150
-
151
- dlg = sent["dialogues"][-1]
152
- (dlg["PAR"][-1] if dlg["PAR"] else dlg["INV"][-1])["word_grammar_ids"] = triples
153
- continue
154
-
155
- i += 1 # ๅ…ถไป–่กŒ
156
-
157
- return {"sentences": data,
158
- "pos_mapping": dict(pos_map),
159
- "grammar_mapping": dict(gra_map),
160
- "aphasia_types": dict(aphasia_map)}
161
-
162
# ---------- Entry point ----------
def main():
    """Read INPUT_CHA, convert it with cha_to_json, write OUTPUT_JSON."""
    # NOTE(review): the user-facing message strings below appear
    # mojibake-encoded in the original file; they are preserved verbatim.
    src = Path(INPUT_CHA)
    dst = Path(OUTPUT_JSON)

    if not src.exists():
        sys.exit(f"โŒ ๆ‰พไธๅˆฐๆช”ๆกˆ: {src}")

    with src.open("r", encoding="utf-8") as handle:
        raw_lines = handle.readlines()

    dataset = cha_to_json(raw_lines)

    dst.parent.mkdir(parents=True, exist_ok=True)
    with dst.open("w", encoding="utf-8") as handle:
        json.dump(dataset, handle, ensure_ascii=False, indent=4)

    print(f"โœ… ่ฝ‰ๆ›ๅฎŒๆˆ โ†’ {dst}")

if __name__ == "__main__":
    main()