| | import os |
| | import json |
| |
|
def _annotate_segment(segment):
    """Render the words of one segment as a single annotated string.

    Pause markers ``(duration)`` are placed after the word whose ``end``
    time lies within 0.01 of the pause's ``start``.  Each repetition
    replaces the words it spans with ``<content> [/]``; only pauses
    attached to the last word of the span are emitted after it.

    Args:
        segment: Dict read from the transcription JSON.  The keys consulted
            are ``words``, ``pauses`` and ``repetitions``; all are optional.

    Returns:
        The annotated tokens joined by single spaces.
    """
    words = segment.get("words", [])
    word_count = len(words)

    # word index -> pause markers to emit right after that word
    pause_map = {}
    for pause in segment.get("pauses", []):
        pause_start = pause.get("start")
        if pause_start is None:
            # Without a start time the pause cannot be aligned to a word;
            # subtracting None below would raise TypeError.
            continue
        duration = pause.get("duration")
        for idx, token in enumerate(words):
            word_end = token.get("end", 0)
            if word_end is None:
                continue
            if abs(word_end - pause_start) < 0.01:
                pause_map.setdefault(idx, []).append(f"({duration})")
                break

    # first word index of a repetition -> (last word index, repeated text)
    rep_map = {}
    for rep in segment.get("repetitions", []):
        indices = rep.get("words", [])
        if indices:
            rep_map[indices[0]] = (indices[-1], rep.get("content", ""))

    annotated_tokens = []
    i = 0
    while i < word_count:
        if i in rep_map:
            rep_end, rep_content = rep_map[i]
            # A malformed span whose end precedes its start would move the
            # cursor backwards and loop forever; clamp it so the cursor
            # always advances by at least one word.
            rep_end = max(rep_end, i)
            annotated_tokens.append(f"<{rep_content}> [/]")
            annotated_tokens.extend(pause_map.get(rep_end, []))
            i = rep_end + 1
        else:
            annotated_tokens.append(words[i].get("word", ""))
            annotated_tokens.extend(pause_map.get(i, []))
            i += 1

    return " ".join(annotated_tokens)


def annotate_transcript(session_id, base_dir="session_data"):
    """Write an annotated transcript for one session.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json`` and
    writes one line per segment, formatted as ``*SPEAKER<TAB>transcript``,
    to ``<base_dir>/<session_id>/annotation_result.txt``.  The output
    resembles CHAT-style notation (NOTE(review): presumed from the
    ``*SPEAKER`` / ``[/]`` markers; confirm against the intended spec).

    Args:
        session_id: Name of the session directory and prefix of the JSON file.
        base_dir: Directory that contains the per-session directories.

    Returns:
        The path of the written annotation file, or ``None`` (after printing
        an error message) when the input JSON file does not exist.
    """
    session_dir = os.path.join(base_dir, session_id)
    json_file = os.path.join(session_dir, f"{session_id}_transcriptionCW.json")
    output_file = os.path.join(session_dir, "annotation_result.txt")

    if not os.path.exists(json_file):
        print(f"Error: could not find {json_file}")
        return None

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    annotated_lines = []
    for segment in data.get("segments", []):
        speaker = segment.get("speaker", "UNKNOWN")
        transcript = _annotate_segment(segment)
        annotated_lines.append(f"*{speaker}\t{transcript}")

    with open(output_file, "w", encoding="utf-8") as f:
        f.writelines(f"{line}\n" for line in annotated_lines)

    print(f"Annotation done in {output_file}")
    return output_file
| |
|
if __name__ == "__main__":
    # Script entry point: annotate the hard-coded session "000030" using the
    # default base directory.
    annotate_transcript(session_id="000030")
| |
|