sunnyzjx committed on
Commit
099b013
·
verified ·
1 Parent(s): a24bfd7

Upload 7 files

Browse files
Files changed (7) hide show
  1. annotation.py +255 -0
  2. app.py +198 -0
  3. config.py +12 -0
  4. data_processing.py +176 -0
  5. requirements.txt +4 -0
  6. task_manager.py +161 -0
  7. ui_components.py +119 -0
annotation.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import json
4
+ from huggingface_hub import HfApi, hf_hub_download
5
+ from collections import defaultdict
6
+ import config
7
+
8
# Hub access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Dataset repository that the annotation files are uploaded to and read from.
REPO_ID = config.SAVE_REPO_ID

# Single Hub client shared by the functions in this module.
api = HfApi()
12
+
13
+
14
def get_user_annotation_filename(username: str) -> str:
    """Return the name of the annotation file that belongs to ``username``.

    Backslash, slash, asterisk, question mark, colon, double quote, angle
    brackets and the pipe character are each replaced by an underscore
    before the name is embedded in ``annotation_results_<name>.json``.
    """
    forbidden_chars = r'[\\/*?:"<>|]'
    cleaned = re.sub(forbidden_chars, "_", username)
    return "annotation_results_" + cleaned + ".json"
18
+
19
+
20
def get_aggregated_filename() -> str:
    """Return the fixed file name used for the aggregated results."""
    aggregated_name = "aggregated_annotations.json"
    return aggregated_name
23
+
24
+
25
def save_annotations(username_state, annotation_results_state, tasks):
    """Save one user's annotations, then rebuild the aggregated file.

    Returns the two status messages joined by a newline. Any exception
    raised by either step is caught and reported as a failure message
    instead of propagating to the caller.
    """
    try:
        personal_msg = save_individual_annotations(username_state, annotation_results_state, tasks)
        aggregate_msg = update_aggregated_annotations(tasks)
    except Exception as exc:
        return f"❌ 保存失败: {exc}"
    return f"{personal_msg}\n{aggregate_msg}"
36
+
37
+
38
def save_individual_annotations(username_state, annotation_results_state, tasks):
    """Upload the annotations of one user as a JSON file to the dataset repo.

    Args:
        username_state: Name of the annotator; also used to derive the file name.
        annotation_results_state: Mapping of task index -> choice.
        tasks: Task list; each annotated index is looked up here for its metadata.

    Returns:
        A status message containing the file name and the completed/total counts.
    """
    records = []
    payload = {
        "total_tasks": len(tasks),
        "completed_tasks": len(annotation_results_state),
        "username": username_state,
        "annotations": records,
    }

    for task_id in annotation_results_state:
        task = tasks[task_id]
        record = {
            "task_id": task_id,
            "text": task["text"],
            "instruction": task["instruction"],
            "comparison": f"{task['audioA_source']} vs {task['audioB_source']}",
            "audioA_source": task["audioA_source"],
            "audioB_source": task["audioB_source"],
            "original_index": task["original_index"],
            "choice": annotation_results_state[task_id],
            "username": username_state,
        }
        records.append(record)

    encoded = json.dumps(payload, ensure_ascii=False, indent=2).encode("utf-8")
    filename = get_user_annotation_filename(username_state)

    api.upload_file(
        path_or_fileobj=encoded,
        path_in_repo=filename,
        repo_id=REPO_ID,
        repo_type="dataset",
        token=HF_TOKEN,
    )

    return f"✅ 个人标注已保存: {filename} ({len(annotation_results_state)}/{len(tasks)})"
73
+
74
+
75
def update_aggregated_annotations(tasks):
    """Rebuild the aggregated results from every user file and upload them.

    The aggregate is always recomputed from scratch from the per-user
    annotation files, so the previously uploaded aggregate is not downloaded
    first (its content would be discarded anyway).

    Args:
        tasks: Task list, forwarded to ``build_aggregated_results``.

    Returns:
        A success message with the file name, or a failure message holding
        the text of the exception that was raised.
    """
    try:
        all_annotations = collect_all_annotations()
        aggregated_data = build_aggregated_results(all_annotations, tasks)

        save_str = json.dumps(aggregated_data, ensure_ascii=False, indent=2)
        filename = get_aggregated_filename()

        api.upload_file(
            path_or_fileobj=save_str.encode("utf-8"),
            path_in_repo=filename,
            repo_id=REPO_ID,
            repo_type="dataset",
            token=HF_TOKEN,
        )

        return f"✅ 聚合结果已更新: {filename}"

    except Exception as e:
        return f"❌ 聚合结果更新失败: {str(e)}"
99
+
100
+
101
def collect_all_annotations():
    """Gather the annotation records of every user file in the dataset repo.

    Files whose name starts with ``annotation_results_`` and ends with
    ``.json`` are downloaded and their ``"annotations"`` lists concatenated.
    A file that cannot be downloaded or parsed is reported with ``print`` and
    skipped. If listing the repository fails, an empty list is returned.
    """
    try:
        repo_files = api.list_repo_files(repo_id=REPO_ID, repo_type="dataset", token=HF_TOKEN)

        collected = []
        for name in repo_files:
            is_user_file = name.startswith("annotation_results_") and name.endswith(".json")
            if not is_user_file:
                continue

            try:
                # Download and parse this user's annotation file.
                path = hf_hub_download(
                    repo_id=REPO_ID,
                    filename=name,
                    repo_type="dataset",
                    token=HF_TOKEN,
                    force_download=True,
                )
                with open(path, "r", encoding="utf-8") as handle:
                    payload = json.load(handle)
                    collected.extend(payload.get("annotations", []))
            except Exception as e:
                print(f"加载文件 {name} 失败: {e}")

        return collected

    except Exception as e:
        print(f"收集标注失败: {e}")
        return []
133
+
134
+
135
def build_aggregated_results(all_annotations, tasks):
    """Group annotation records by ``original_index`` and tally the votes.

    A record is used only when it has a non-None ``original_index`` and
    truthy ``comparison`` and ``choice`` values. Votes are counted only for
    the choices "win", "tie" and "lose"; each annotator is listed once per
    comparison. ``tasks`` is accepted but not used by this function.

    Returns:
        A dict with ``total_groups``, ``total_annotations`` (the length of
        the input list) and ``results`` sorted by ``original_index``.
    """
    countable = ("win", "tie", "lose")
    grouped = {}

    for record in all_annotations:
        index = record.get("original_index")
        pair = record.get("comparison")
        verdict = record.get("choice")
        annotator = record.get("username")
        text = record.get("text", "")
        instruction = record.get("instruction", "")

        if index is None or not pair or not verdict:
            continue

        if index not in grouped:
            grouped[index] = {"text": "", "instruction": "", "comparisons": {}}
        entry = grouped[index]
        # The most recent record decides the text/instruction of the group.
        entry["text"] = text
        entry["instruction"] = instruction

        if verdict not in countable:
            continue

        tallies = entry["comparisons"]
        if pair not in tallies:
            tallies[pair] = {"win": 0, "tie": 0, "lose": 0, "annotators": []}
        tally = tallies[pair]
        tally[verdict] += 1
        if annotator not in tally["annotators"]:
            tally["annotators"].append(annotator)

    results = []
    for index, entry in grouped.items():
        summary = {}
        for pair, tally in entry["comparisons"].items():
            summary[pair] = {
                "votes(win tie lose)": [tally["win"], tally["tie"], tally["lose"]],
                "total_annotators": len(tally["annotators"]),
                "annotators": tally["annotators"],
            }
        results.append({
            "original_index": index,
            "text": entry["text"],
            "instruction": entry["instruction"],
            "comparisons": summary,
        })

    results.sort(key=lambda item: item["original_index"])

    return {
        "total_groups": len(results),
        "total_annotations": len(all_annotations),
        "results": results,
    }
187
+
188
+
189
def load_aggregated_annotations():
    """Download and parse the aggregated results file from the dataset repo.

    Returns the parsed JSON content. If the download or the parsing raises,
    an empty aggregate (zero counts, empty ``results``) is returned instead.
    """
    try:
        aggregated_name = get_aggregated_filename()
        path = hf_hub_download(
            repo_id=REPO_ID,
            filename=aggregated_name,
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,
        )
        with open(path, "r", encoding="utf-8") as handle:
            parsed = json.load(handle)
        return parsed
    except Exception:
        return {"total_groups": 0, "total_annotations": 0, "results": []}
206
+
207
+
208
def load_annotations(username):
    """Load the stored choices of ``username`` from the Hugging Face Hub.

    Returns a dict mapping each stored ``task_id`` to its ``choice``. Any
    failure (missing file, download error, malformed content) yields an
    empty dict.
    """
    try:
        user_file = get_user_annotation_filename(username)
        path = hf_hub_download(
            repo_id=REPO_ID,
            filename=user_file,
            repo_type="dataset",
            token=HF_TOKEN,
            force_download=True,
        )
        with open(path, "r", encoding="utf-8") as handle:
            stored = json.load(handle)

        choices = {}
        for entry in stored.get("annotations", []):
            choices[entry["task_id"]] = entry["choice"]
        return choices
    except Exception:
        return {}
225
+
226
+
227
def get_aggregated_stats():
    """Summarize the aggregated results per comparison.

    Returns:
        A dict with ``total_groups``, ``total_annotations`` and
        ``comparison_summary`` (comparison -> win/tie/lose/total_votes
        counters), or ``{"error": <message>}`` if anything raises.
    """
    try:
        aggregated_data = load_aggregated_annotations()

        stats = {
            "total_groups": aggregated_data.get("total_groups", 0),
            "total_annotations": aggregated_data.get("total_annotations", 0),
            "comparison_summary": {},
        }
        summary = stats["comparison_summary"]

        for result in aggregated_data.get("results", []):
            for comparison, data in result.get("comparisons", {}).items():
                bucket = summary.setdefault(
                    comparison,
                    {"total_votes": 0, "win": 0, "tie": 0, "lose": 0},
                )

                # build_aggregated_results stores the counts under
                # "votes(win tie lose)"; the plain "votes" key is still
                # accepted as a fallback. Reading only "votes" made every
                # summary come out as zero.
                votes = data.get("votes(win tie lose)")
                if votes is None:
                    votes = data.get("votes", [0, 0, 0])

                bucket["win"] += votes[0]
                bucket["tie"] += votes[1]
                bucket["lose"] += votes[2]
                bucket["total_votes"] += sum(votes)

        return stats

    except Exception as e:
        return {"error": str(e)}
app.py ADDED
@@ -0,0 +1,198 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import config
3
+ from ui_components import create_ui
4
+ from annotation import load_annotations
5
+ from task_manager import get_current_task, get_current_task_with_annotations, get_total_tasks
6
+
7
# Stylesheet handed to gr.Blocks(css=css) in the __main__ section of this file.
# It defines the highlight classes (.selected, .tie-selected), the task-info
# bar (.user-task-info) and the progress bar (.progress-bar / .progress-fill).
+ css = """
8
+ .center { text-align: center; }
9
+ .audio-container { margin: 10px; padding: 15px; }
10
+
11
+ /* 胜负选择样式 - 绿色 */
12
+ .selected {
13
+ border: 3px solid #4CAF50 !important;
14
+ background-color: #e8f5e9 !important;
15
+ }
16
+
17
+ /* 平局选择样式 - 橙色 */
18
+ .tie-selected {
19
+ border: 3px solid #FF9800 !important;
20
+ background-color: #fff3e0 !important;
21
+ }
22
+
23
+ .user-task-info {
24
+ font-size: 16px;
25
+ color: #333;
26
+ padding: 10px;
27
+ background-color: #f0f0f0;
28
+ border-radius: 5px;
29
+ display: flex;
30
+ justify-content: space-between;
31
+ align-items: center;
32
+ }
33
+
34
+ .progress-bar {
35
+ background-color: #e0e0e0;
36
+ border-radius: 10px;
37
+ height: 8px;
38
+ width: 200px;
39
+ margin: 0 10px;
40
+ position: relative;
41
+ }
42
+
43
+ .progress-fill {
44
+ background-color: #4CAF50;
45
+ height: 100%;
46
+ border-radius: 10px;
47
+ transition: width 0.3s ease;
48
+ }
49
+ """
50
+
51
+
52
def calculate_completion_stats(annotation_results):
    """Return ``(completed_tasks, total_tasks, completion_rate)``.

    ``total_tasks`` is always the real task count, also when nothing has been
    annotated yet; previously an empty result produced a misleading "0/0".

    Args:
        annotation_results: Mapping of task index -> choice; may be empty or None.

    Returns:
        Number of annotated tasks, total number of tasks, and the completion
        percentage as a float (0.0 when there are no tasks).
    """
    total_tasks = get_total_tasks()
    completed_tasks = len(annotation_results) if annotation_results else 0
    completion_rate = (completed_tasks / total_tasks * 100) if total_tasks > 0 else 0.0

    return completed_tasks, total_tasks, completion_rate
62
+
63
+
64
def get_initial_task_position(annotation_results):
    """Pick the task index a returning user should start on.

    With no stored annotations the first task (index 0) is used. Otherwise
    the position is the highest annotated index plus one, capped at the
    index of the last task.
    """
    if not annotation_results:
        return 0

    resume_at = max(annotation_results.keys()) + 1
    last_index = get_total_tasks() - 1
    return min(resume_at, last_index)
74
+
75
+
76
def create_task_info_html(username, annotation_results, current_task_num):
    """Build the HTML bar showing the user, the completion and the current task.

    Args:
        username: Name shown in the bar; HTML-escaped before interpolation so
            characters such as ``<`` or ``&`` cannot break the markup.
        annotation_results: Mapping of task index -> choice, used for the
            completion figures.
        current_task_num: Task number to display.

    Returns:
        The HTML fragment as a string.
    """
    from html import escape  # local import keeps this block self-contained

    completed_tasks, total_tasks, completion_rate = calculate_completion_stats(annotation_results)
    safe_username = escape(str(username))

    # Progress bar whose fill width is the completion percentage.
    progress_bar_html = f"""
    <div class="progress-bar">
        <div class="progress-fill" style="width: {completion_rate}%;"></div>
    </div>
    """

    task_info_html = f"""
    <div class="user-task-info">
        <span>👤 当前用户: {safe_username}</span>
        <div style="display: flex; align-items: center;">
            <span>完成度: {completed_tasks}/{total_tasks} ({completion_rate:.1f}%)</span>
            {progress_bar_html}
        </div>
        <span><strong>当前任务: {current_task_num}</strong></span>
    </div>
    """

    return task_info_html
99
+
100
+
101
def set_user_info(request: gr.Request):
    """Initialise the per-session state for the user who opened the page.

    The user name is taken from ``request.username``; when the attribute is
    missing or empty/None, ``"unknown"`` is used. (The previous ``hasattr``
    check let a None user name through, so the fallback never applied.)
    The user's stored annotations are loaded and the start position is
    derived from them.

    Returns:
        A 10-tuple: username, annotation results, current task index,
        instruction, text, audio A update, audio B update, "previous" button
        update, "next" button update and the task-info HTML update.
    """
    username = getattr(request, "username", None) or "unknown"
    annotation_results = load_annotations(username)
    print(f"加载用户 {username} 的标注结果:{annotation_results}")

    user_current_task = get_initial_task_position(annotation_results)

    inst, text, audioA_update, audioB_update, prev_disabled, next_disabled, task_num = get_current_task_with_annotations(
        annotation_results, user_current_task)

    # HTML bar including the completion figures for this user.
    task_info_html = create_task_info_html(username, annotation_results, task_num)

    return (
        username,
        annotation_results,
        user_current_task,
        inst,
        text,
        audioA_update,
        audioB_update,
        gr.update(interactive=not prev_disabled),
        gr.update(interactive=not next_disabled),
        gr.update(value=task_info_html),
    )
127
+
128
+
129
+ def update_task_info_after_action(username, annotation_results, current_task_num):
130
+ """在用户操作后更新任务信息显示"""
131
+ return create_task_info_html(username, annotation_results, current_task_num)
132
+
133
+
134
if __name__ == "__main__":
    print("启动应用...")
    with gr.Blocks(css=css) as demo:
        # Per-session state: user name, stored choices, current task index.
        username = gr.State(value="unknown")
        annotation_results = gr.State(value={})
        user_current_task = gr.State(value=0)

        ui_components = create_ui(get_current_task(), username, annotation_results)

        # On page load, fill the state and every widget for the logged-in user.
        demo.load(
            set_user_info,
            inputs=None,
            outputs=[
                username,
                annotation_results,
                user_current_task,
                ui_components["instruction"],
                ui_components["text_box"],
                ui_components["audioA"],
                ui_components["audioB"],
                ui_components["btn_prev"],
                ui_components["btn_next"],
                ui_components["task_number"],
            ],
        )

        # The three verdict buttons share one handler and differ only in the
        # choice value passed as first input.
        for choice, button_key in (("win", "btn_win"), ("tie", "btn_tie"), ("lose", "btn_lose")):
            ui_components[button_key].click(
                ui_components["select_result"],
                inputs=[
                    gr.State(choice),
                    ui_components["audioA"],
                    ui_components["audioB"],
                    annotation_results,
                    username,
                    user_current_task,
                ],
                outputs=[
                    ui_components["audioA"],
                    ui_components["audioB"],
                    annotation_results,
                    ui_components["task_number"],
                ],
            )

        # The navigation buttons share one handler and differ only in direction.
        for direction, button_key in (("prev", "btn_prev"), ("next", "btn_next")):
            ui_components[button_key].click(
                ui_components["change_task"],
                inputs=[gr.State(direction), annotation_results, username, user_current_task],
                outputs=[
                    ui_components["instruction"],
                    ui_components["text_box"],
                    ui_components["audioA"],
                    ui_components["audioB"],
                    ui_components["btn_prev"],
                    ui_components["btn_next"],
                    ui_components["task_number"],
                    annotation_results,
                    user_current_task,
                ],
            )

    demo.launch(auth=config.ANNOTATOR)
config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
import os

# Hugging Face dataset configuration
PROCESS_REPO_ID = "sunnyzjx/Test_dataset"
SAVE_REPO_ID = "sunnyzjx/annotation_results"

# Dataset column names
AUDIO_FIELDS = ['model1', 'model2']
FIELD_TEXT = "text"
FIELD_INSTRUCTION = "instruction"


def _parse_annotators(raw):
    """Parse ``"user:password,user:password"`` into ``[(user, password), ...]``.

    Entries without a colon or with an empty user name are ignored.
    """
    pairs = []
    for item in raw.split(","):
        user, sep, password = item.strip().partition(":")
        if sep and user:
            pairs.append((user, password))
    return pairs


# SECURITY: these login pairs are committed in plain text. They are kept only
# so existing deployments keep working; set ANNOTATOR_CREDENTIALS
# ("user:password,user:password") in the environment to override them.
_DEFAULT_ANNOTATORS = [('zjx', '123'), ('wy', '123')]

# List of (username, password) tuples; app.py passes it to demo.launch(auth=...).
ANNOTATOR = _parse_annotators(os.environ.get("ANNOTATOR_CREDENTIALS", "")) or _DEFAULT_ANNOTATORS
data_processing.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from datasets import load_dataset
3
+ import os
4
+ import config
5
+ from itertools import combinations
6
+ import random
7
+
8
# Set before the dataset is loaded. NOTE(review): judging by its name this
# switches off the Hub's symlink warning — confirm against huggingface_hub docs.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "true"})

# The "train" split of the source dataset, loaded once when this module is imported.
dataset = load_dataset(config.PROCESS_REPO_ID, split="train")
11
+
12
+
13
def process_audio(audio_obj):
    """Turn a decoded-audio object into ``(samples, sample_rate)``.

    ``audio_obj`` must provide ``get_all_samples()``, whose result exposes
    ``data`` and ``sample_rate``. Non-ndarray data is converted to a float32
    array, a non-int sample rate is cast to ``int``, and data with more than
    one dimension is averaged over axis 0.

    Returns:
        ``(numpy array, int)`` on success, ``(None, None)`` when the method is
        missing or anything raises (the reason is printed).
    """
    try:
        if not hasattr(audio_obj, 'get_all_samples'):
            print("音频对象缺少 get_all_samples 方法")
            return None, None

        decoded = audio_obj.get_all_samples()

        waveform = decoded.data
        if not isinstance(waveform, np.ndarray):
            waveform = np.array(waveform, dtype=np.float32)

        rate = decoded.sample_rate
        if not isinstance(rate, int):
            rate = int(rate)

        if waveform.ndim > 1:
            # Collapse axis 0 by averaging. NOTE(review): presumably axis 0 is
            # the channel axis (mono down-mix) — confirm against the decoder.
            waveform = waveform.mean(axis=0)

        return waveform, rate
    except Exception as e:
        print(f"处理音频失败: {e}")
        return None, None
33
+
34
+
35
def generate_random_pairs(audio_fields, include_reverse=True, shuffle_order=True):
    """Build one comparison pair for every unordered combination of fields.

    Args:
        audio_fields: List of audio field names.
        include_reverse: When true, each pair is flipped to (B, A) with a
            random coin toss; otherwise the combination order is kept.
        shuffle_order: When true, the resulting list is shuffled in place.

    Returns:
        List of 2-tuples of field names.
    """
    combos = list(combinations(audio_fields, 2))

    if not include_reverse:
        pairs = combos
    else:
        # One random draw per combination, in combination order.
        pairs = [
            (combo[1], combo[0]) if random.choice([True, False]) else combo
            for combo in combos
        ]

    if shuffle_order:
        random.shuffle(pairs)

    return pairs
63
+
64
+
65
def generate_all_permutations(audio_fields, shuffle_order=True):
    """Build every ordered pair of two different positions in ``audio_fields``.

    Args:
        audio_fields: List of audio field names.
        shuffle_order: When true, the resulting list is shuffled in place.

    Returns:
        List of 2-tuples; both (A, B) and (B, A) are included.
    """
    ordered_pairs = [
        (left, right)
        for left_pos, left in enumerate(audio_fields)
        for right_pos, right in enumerate(audio_fields)
        if left_pos != right_pos  # never pair an entry with itself
    ]

    if shuffle_order:
        random.shuffle(ordered_pairs)

    return ordered_pairs
86
+
87
+
88
def load_tasks(comparison_mode="random_reverse", seed=None):
    """Build the pairwise comparison tasks from the module-level ``dataset``.

    For every dataset row the audio fields listed in ``config.AUDIO_FIELDS``
    are decoded with ``process_audio``; rows with fewer than two usable audios
    are skipped. The remaining audios are paired according to
    ``comparison_mode`` and one task dict is emitted per pair.

    Args:
        comparison_mode: How pairs are formed:
            - "fixed": combinations in field order.
            - "random_reverse": each combination is randomly flipped and the
              pairs are shuffled.
            - "all_permutations": every ordered pair, shuffled.
        seed: When not None, the global ``random`` module is seeded with it
            so the pairing can be reproduced.

    Returns:
        List of task dicts with the keys ``instruction``, ``text``,
        ``audioA``, ``audioB``, ``audioA_source``, ``audioB_source``,
        ``comparison`` and ``original_index``.

    Raises:
        ValueError: If ``comparison_mode`` is not one of the modes above.
        SystemExit: With status 1 if no task could be generated.
    """
    if seed is not None:
        random.seed(seed)
        print(f"使用随机种子: {seed}")
    else:
        print("使用真随机模式")

    print("处理数据集...")

    audio_fields = config.AUDIO_FIELDS
    text_field = config.FIELD_TEXT
    instruction_field = config.FIELD_INSTRUCTION

    print(f"使用音频字段: {audio_fields}")
    print(f"文本字段: {text_field}")
    print(f"指令字段: {instruction_field}")
    print(f"比较模式: {comparison_mode}")

    tasks = []

    for i, row in enumerate(dataset):
        # Decode every configured audio field that is present in this row.
        processed_audios = {}
        for field in audio_fields:
            if field not in row or row[field] is None:
                print(f"任务 {i} 缺少音频字段: {field}")
                continue

            audio_data, audio_rate = process_audio(row[field])
            if (audio_data is not None and audio_rate is not None and
                    isinstance(audio_data, np.ndarray) and isinstance(audio_rate, int)):
                processed_audios[field] = (audio_data, audio_rate)
            else:
                print(f"任务 {i} 的音频字段 {field} 处理失败")

        # A comparison needs at least two usable audios.
        if len(processed_audios) < 2:
            print(f"跳过任务 {i}:有效音频数量不足")
            continue

        text = row.get(text_field, '')
        instruction = row.get(instruction_field, '请比较这两个音频的质量')

        available_fields = list(processed_audios.keys())

        if comparison_mode == "fixed":
            pairs = list(combinations(available_fields, 2))
        elif comparison_mode == "random_reverse":
            pairs = generate_random_pairs(available_fields, include_reverse=True, shuffle_order=True)
        elif comparison_mode == "all_permutations":
            pairs = generate_all_permutations(available_fields, shuffle_order=True)
        else:
            raise ValueError(f"未知的比较模式: {comparison_mode}")

        for field_a, field_b in pairs:
            tasks.append({
                "instruction": instruction,
                "text": text,
                "audioA": processed_audios[field_a],
                "audioB": processed_audios[field_b],
                "audioA_source": field_a,
                "audioB_source": field_b,
                "comparison": f"{field_a} vs {field_b}",
                "original_index": i,
            })

    print(f"成功生成 {len(tasks)} 个比较任务")
    if not tasks:
        print("没有可用任务!")
        # The bare exit() helper is provided by the site module for
        # interactive use and reports success; raise SystemExit directly with
        # a failure status instead.
        raise SystemExit(1)

    # Print how many tasks were generated per "A vs B" comparison label.
    comparison_counts = {}
    for task in tasks:
        comp = task["comparison"]
        comparison_counts[comp] = comparison_counts.get(comp, 0) + 1

    print("比较任务统计:")
    for comp, count in sorted(comparison_counts.items()):
        print(f" {comp}: {count} 个任务")

    return tasks
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==4.44.0
2
+ numpy
3
+ datasets
4
+ huggingface_hub
task_manager.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from data_processing import load_tasks
3
+ from annotation import save_annotations
4
+
5
# Task list built once when this module is imported and shared by every
# function below; a fixed seed (42) is passed to load_tasks.
+ tasks = load_tasks(comparison_mode="random_reverse", seed=42)
6
+
7
+
8
def get_current_task_with_annotations(annotation_results, user_current_task=0):
    """Return the display data of one task with its stored choice highlighted.

    Used for the initial page load. The highlight classes come from
    ``apply_selection_style`` so the win/lose/tie styling is defined in a
    single place instead of being repeated here.

    Args:
        annotation_results: Mapping of task index -> choice; may be empty or None.
        user_current_task: Index into the module-level ``tasks`` list.

    Returns:
        A 7-tuple: instruction, text, audio A update, audio B update,
        whether this is the first task, whether this is the last task, and
        the 1-based task number.
    """
    task = tasks[user_current_task]
    current_choice = annotation_results.get(user_current_task) if annotation_results else None

    # Tasks store audio as (data, rate); the audio widgets receive (rate, data).
    audioA_data = (task["audioA"][1], task["audioA"][0])
    audioB_data = (task["audioB"][1], task["audioB"][0])

    # A choice of None (not annotated yet) clears both highlight classes.
    audioA_styled, audioB_styled = apply_selection_style(audioA_data, audioB_data, current_choice)

    return (
        task["instruction"],
        task["text"],
        audioA_styled,
        audioB_styled,
        user_current_task == 0,
        user_current_task == len(tasks) - 1,
        user_current_task + 1,
    )
44
+
45
+
46
def get_current_task(user_current_task=0, annotation_results=None, styled=False):
    """Return the display data of one task, optionally with highlight styling.

    Args:
        user_current_task: Index into the module-level ``tasks`` list.
        annotation_results: Mapping of task index -> choice, or None.
        styled: When true and ``annotation_results`` is not None, the audio
            entries are returned as styled component updates.

    Returns:
        Styled: a 7-tuple (instruction, text, audio A update, audio B update,
        is-first flag, is-last flag, 1-based task number).
        Unstyled: a 9-tuple (instruction, text, audio A data, audio A rate,
        audio B data, audio B rate, is-first flag, is-last flag, 1-based task
        number).
    """
    task = tasks[user_current_task]
    is_first = user_current_task == 0
    is_last = user_current_task == len(tasks) - 1
    task_number = user_current_task + 1

    if styled and annotation_results is not None:
        current_choice = annotation_results.get(user_current_task)
        # Tasks store audio as (data, rate); the audio widgets receive (rate, data).
        audioA_data = (task["audioA"][1], task["audioA"][0])
        audioB_data = (task["audioB"][1], task["audioB"][0])

        # Shared helper keeps the win/lose/tie class mapping in one place.
        audioA_styled, audioB_styled = apply_selection_style(audioA_data, audioB_data, current_choice)

        return (
            task["instruction"],
            task["text"],
            audioA_styled,
            audioB_styled,
            is_first,
            is_last,
            task_number,
        )

    return (
        task["instruction"],
        task["text"],
        task["audioA"][0],
        task["audioA"][1],
        task["audioB"][0],
        task["audioB"][1],
        is_first,
        is_last,
        task_number,
    )
89
+
90
+
91
def apply_selection_style(audioA, audioB, choice):
    """Return the two audio updates carrying the highlight for ``choice``.

    "win" highlights A, "lose" highlights B, "tie" marks both with the tie
    class; any other value clears the classes on both.
    """
    if choice == "win":
        class_a, class_b = "selected", ""
    elif choice == "lose":
        class_a, class_b = "", "selected"
    elif choice == "tie":
        class_a, class_b = "tie-selected", "tie-selected"
    else:
        class_a, class_b = "", ""

    styled_a = gr.update(value=audioA, elem_classes=class_a)
    styled_b = gr.update(value=audioB, elem_classes=class_b)
    return (styled_a, styled_b)
117
+
118
+
119
def select_result(choice, audioA, audioB, annotation_results, username, user_current_task):
    """Store the choice for the current task, save, and return styled audios.

    ``annotation_results`` is updated in place, the full result set is saved
    through ``save_annotations`` (its status message is printed), and the two
    audio updates plus the result mapping are returned.
    """
    annotation_results[user_current_task] = choice

    # Persist immediately after every click.
    status = save_annotations(username, annotation_results, tasks)
    print(f"自动保存结果: {status}")

    styled_a, styled_b = apply_selection_style(audioA, audioB, choice)
    return styled_a, styled_b, annotation_results
129
+
130
+
131
def change_task(direction, annotation_results, username, user_current_task):
    """Move to the previous or next task and return the refreshed widgets.

    The index only changes when ``direction`` is "prev"/"next" and the move
    stays inside the task list; otherwise the current task is shown again.

    Returns:
        A 9-tuple: instruction, text, audio A update, audio B update,
        "previous" button update, "next" button update, task-info HTML
        update, the annotation results and the new task index.
    """
    step = 0
    if direction == "prev" and user_current_task > 0:
        step = -1
    elif direction == "next" and user_current_task < len(tasks) - 1:
        step = 1
    target = user_current_task + step

    inst, text, audio_a, audio_b, at_first, at_last, task_num = get_current_task(
        target, annotation_results, styled=True)

    total_tasks = get_total_tasks()
    combined_task_info = f'<div class="user-task-info"><span>👤 当前用户: {username}</span><span><strong>任务编号: {task_num} / {total_tasks}</strong></span></div>'

    prev_update = gr.update(interactive=not at_first)
    next_update = gr.update(interactive=not at_last)
    info_update = gr.update(value=combined_task_info)

    return (
        inst, text,
        audio_a,
        audio_b,
        prev_update,
        next_update,
        info_update,
        annotation_results,
        target,
    )
157
+
158
+
159
def get_total_tasks():
    """Return how many entries the module-level ``tasks`` list holds."""
    task_count = len(tasks)
    return task_count
ui_components.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from task_manager import get_current_task, select_result, change_task, tasks, get_total_tasks
3
+ from annotation import save_annotations
4
+
5
+
6
def calculate_completion_stats(annotation_results):
    """Return ``(completed_tasks, total_tasks, completion_rate)``.

    ``total_tasks`` is always the real task count, also when nothing has been
    annotated yet; previously an empty result produced a misleading "0/0".

    Args:
        annotation_results: Mapping of task index -> choice; may be empty or None.

    Returns:
        Number of annotated tasks, total number of tasks, and the completion
        percentage as a float (0.0 when there are no tasks).
    """
    total_tasks = get_total_tasks()
    completed_tasks = len(annotation_results) if annotation_results else 0
    completion_rate = (completed_tasks / total_tasks * 100) if total_tasks > 0 else 0.0

    return completed_tasks, total_tasks, completion_rate
16
+
17
+
18
def create_task_info_html(username, annotation_results, current_task_num):
    """Build the HTML bar showing the user, the completion and the current task.

    Args:
        username: Name shown in the bar; HTML-escaped before interpolation so
            characters such as ``<`` or ``&`` cannot break the markup.
        annotation_results: Mapping of task index -> choice, used for the
            completion figures.
        current_task_num: Task number to display.

    Returns:
        The HTML fragment as a string.
    """
    from html import escape  # local import keeps this block self-contained

    completed_tasks, total_tasks, completion_rate = calculate_completion_stats(annotation_results)
    safe_username = escape(str(username))

    # Inline styles are used here instead of CSS classes.
    task_info_html = f"""
    <div style="font-size: 16px; color: #333; padding: 10px; background-color: #f0f0f0; border-radius: 5px; display: flex; justify-content: space-between; align-items: center;">
        <span>👤 用户: {safe_username}</span>
        <span>完成度: {completed_tasks}/{total_tasks} ({completion_rate:.1f}%)</span>
        <span><strong>当前任务: {current_task_num}</strong></span>
    </div>
    """

    return task_info_html
32
+
33
+
34
def create_ui(init_task, username, annotation_results):
    """Create the Gradio widgets and return them together with the handlers.

    Args:
        init_task: 9-tuple for the first task: instruction, text, audio A
            data, audio A rate, audio B data, audio B rate, "previous"
            disabled flag, "next" disabled flag, task number.
        username: Accepted but not read inside this function.
        annotation_results: Accepted but not read inside this function.

    Returns:
        Dict mapping component names to the created widgets, plus the
        ``select_result``, ``change_task`` and ``save_annotations`` callables.
    """
    (
        init_inst,
        init_text,
        init_audioA_data,
        init_audioA_rate,
        init_audioB_data,
        init_audioB_rate,
        init_prev_dis,
        init_next_dis,
        init_task_num,
    ) = init_task

    gr.Markdown('<div class="center"><h2>🎵 音频对比标注平台</h2></div>')
    user_display = gr.Markdown()

    instruction = gr.Textbox(label="🎯 指令", value=init_inst, interactive=False)
    text_box = gr.Textbox(label="📋 转录文本", value=init_text, interactive=False)

    # Completion bar shown as an HTML component; starts with placeholder
    # user "unknown" and no annotations.
    task_number = gr.HTML(
        value=create_task_info_html("unknown", {}, init_task_num),
        elem_classes="center",
    )

    # The two audio players sit side by side.
    with gr.Row():
        with gr.Column(elem_classes="audio-container"):
            audioA = gr.Audio(
                label="🔊 音频 A",
                value=(init_audioA_rate, init_audioA_data),
                interactive=False,
                type="numpy",
            )
        with gr.Column(elem_classes="audio-container"):
            audioB = gr.Audio(
                label="🔊 音频 B",
                value=(init_audioB_rate, init_audioB_data),
                interactive=False,
                type="numpy",
            )

    # Verdict buttons.
    with gr.Row():
        btn_win = gr.Button("🏆 Win", variant="primary")
        btn_tie = gr.Button("🤝 Tie", variant="secondary")
        btn_lose = gr.Button("❌ Lose", variant="stop")

    # Navigation buttons.
    with gr.Row():
        btn_prev = gr.Button("⬅️ 上一题", interactive=not init_prev_dis)
        btn_next = gr.Button("➡️ 下一题", interactive=not init_next_dis)

    def wrapped_select_result(choice, audioA, audioB, annotation_results, username, user_current_task):
        """Run ``select_result`` and append a refreshed task-info update."""
        styled_a, styled_b, new_results = select_result(
            choice, audioA, audioB, annotation_results, username, user_current_task)

        # The user stays on the same task, so its 1-based number is unchanged.
        info_html = create_task_info_html(username, new_results, user_current_task + 1)

        return styled_a, styled_b, new_results, gr.update(value=info_html)

    def wrapped_change_task(direction, annotation_results, username, user_current_task):
        """Run ``change_task`` and swap in a task-info bar with completion."""
        result = change_task(direction, annotation_results, username, user_current_task)

        # Pass short results through untouched.
        if len(result) < 9:
            return result

        (inst, text, audio_a, audio_b, prev_btn, next_btn,
         _original_info, new_results, new_index) = result[:9]

        # Displayed task numbers are 1-based.
        info_html = create_task_info_html(username, new_results, new_index + 1)

        return (inst, text, audio_a, audio_b, prev_btn, next_btn,
                gr.update(value=info_html), new_results, new_index)

    return {
        "user_display": user_display,
        "instruction": instruction,
        "text_box": text_box,
        "task_number": task_number,
        "audioA": audioA,
        "audioB": audioB,
        "btn_win": btn_win,
        "btn_tie": btn_tie,
        "btn_lose": btn_lose,
        "btn_prev": btn_prev,
        "btn_next": btn_next,
        "select_result": wrapped_select_result,
        "change_task": wrapped_change_task,
        "save_annotations": lambda u, a: save_annotations(u, a, tasks),
    }