GoodResearch committed on
Commit
45de075
·
verified ·
1 Parent(s): 58817aa

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +226 -10
app.py CHANGED
@@ -1,10 +1,226 @@
1
- import gradio as gr
2
- import time
3
-
4
- def stream():
5
- for i in range(5):
6
- yield f"⏳ 第 {i+1} 秒..."
7
- time.sleep(1)
8
- yield "✅ 完成!"
9
-
10
- gr.Interface(fn=stream, inputs=None, outputs=gr.Textbox()).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import gradio as gr
3
+ import whisper
4
+ import subprocess
5
+ import os
6
+ import time
7
+ import re
8
+ import datetime
9
+ from pyannote.audio import Pipeline
10
+
11
# Log the active Gradio release (useful when debugging Spaces builds).
print("Gradio version:", gr.__version__)

# Hugging Face token used to access the gated pyannote model
# (set HF_TOKEN in the environment / Space secrets).
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    # The pyannote model is gated: without a token the download below fails
    # with an opaque error, so warn explicitly up front.
    print("WARNING: HF_TOKEN is not set; loading pyannote/speaker-diarization will likely fail.")

# Load the diarization pipeline once at startup so each request doesn't pay
# the model-loading cost. (Whisper models are loaded per request instead,
# because the model size is user-selectable.)
diarization_pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token)
19
+
20
+
21
def get_audio_duration(filename):
    """Return the duration of *filename* in seconds, as reported by ffprobe.

    Args:
        filename: Path of the media file to probe.

    Returns:
        Duration in seconds as a float.

    Raises:
        ValueError: if ffprobe's output cannot be parsed (missing or
            non-media file).
        FileNotFoundError: if the ffprobe binary is not installed.
    """
    # Keep stderr separate: the original merged it into stdout
    # (stderr=STDOUT), so any ffprobe warning corrupted the float parse.
    result = subprocess.run(
        ['ffprobe', '-v', 'error', '-show_entries', 'format=duration',
         '-of', 'default=noprint_wrappers=1:nokey=1', filename],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True)
    return float(result.stdout.strip())
28
+
29
+
30
def transcribe_single(audio_path, language="auto", model_size="base"):
    """Transcribe a single-speaker audio file with Whisper.

    The audio is split at detected silences into chunks of roughly
    8-10 minutes so Whisper never receives an overly long input; each chunk
    is transcribed and the combined text is written to
    ``transcription_output.txt``.

    Args:
        audio_path: Path of the uploaded audio file (renamed to audio.mp3).
        language: Whisper language code, or "auto" for auto-detection.
        model_size: Whisper model name ("tiny", "base", "small", ...).

    Returns:
        Path of the text file containing the transcription.
    """
    input_path = "audio.mp3"
    os.rename(audio_path, input_path)
    asr_model = whisper.load_model(model_size)  # .to("cuda")

    # Whisper expects language=None for auto-detection; passing the literal
    # string "auto" raises inside whisper's tokenizer.
    whisper_language = None if language == "auto" else language

    # Step 1: silence detection. ffmpeg writes silencedetect results to
    # stderr; capture it directly instead of a shell `2>` redirect.
    silence = subprocess.run(
        ["ffmpeg", "-i", input_path, "-af", "silencedetect=noise=-30dB:d=1",
         "-f", "null", "-"],
        stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True)
    # Keep the log file the original produced, for debugging parity.
    with open("silence_log.txt", "w") as f:
        f.write(silence.stderr)

    audio_duration = get_audio_duration(input_path)

    # Step 2: parse silence start/end timestamps out of the ffmpeg log.
    silence_starts, silence_ends = [], []
    for line in silence.stderr.splitlines():
        if "silence_start" in line:
            match = re.search(r"silence_start: (\d+\.?\d*)", line)
            if match:
                silence_starts.append(float(match.group(1)))
        elif "silence_end" in line:
            match = re.search(r"silence_end: (\d+\.?\d*)", line)
            if match:
                silence_ends.append(float(match.group(1)))

    # Sentinel entries so the final stretch of audio can close a segment.
    silence_starts.append(audio_duration)
    silence_ends.append(audio_duration)

    # Step 3: cut at silence points into segments of MIN_TARGET..MAX_TARGET
    # seconds. zip() also guards against silence_ends being one entry
    # shorter than silence_starts (file ending mid-silence), which could
    # IndexError in the original index-based loop.
    MIN_TARGET, MAX_TARGET = 480, 600
    segments = []
    current_start = 0.0
    for silence_point, silence_end in zip(silence_starts, silence_ends):
        segment_length = silence_point - current_start
        if segment_length >= MIN_TARGET:
            segment_end = silence_point if segment_length <= MAX_TARGET else current_start + MAX_TARGET
            segments.append((current_start, segment_end))
            current_start = silence_end

    if current_start < audio_duration:
        segments.append((current_start, None))  # None = "until end of file"

    # Step 4: extract each segment with ffmpeg and transcribe it.
    output_lines = []
    for idx, (start, end) in enumerate(segments):
        chunk_file = f"chunk_{idx:03d}.mp3"
        # Argument list (no shell) handles odd filenames safely; -y avoids
        # ffmpeg blocking on an overwrite prompt if a stale chunk exists.
        cmd = ["ffmpeg", "-y", "-i", input_path, "-ss", f"{start:.2f}"]
        if end is not None:  # `if end:` would also drop a 0.0 end time
            cmd += ["-to", f"{end:.2f}"]
        cmd += ["-c", "copy", chunk_file]
        subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

        result = asr_model.transcribe(chunk_file, language=whisper_language)
        output_lines.append(result["text"].strip())
        os.remove(chunk_file)

    with open("transcription_output.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    return "transcription_output.txt"
91
+
92
+
93
def transcribe_multi(audio_path, language="auto", model_size="base"):
    """Transcribe multi-speaker audio: pyannote diarization + Whisper ASR.

    Each diarized speaker turn is cut out with ffmpeg, transcribed with
    Whisper, and written as "[start - end] 说话人N:text" lines to
    ``transcription_with_speakers.txt``.

    Args:
        audio_path: Path of the uploaded audio (renamed to audio_multi.mp3).
        language: Whisper language code, or "auto" for auto-detection.
        model_size: Whisper model name ("tiny", "base", "small", ...).

    Returns:
        Path of the text file containing the speaker-labelled transcription.
    """
    import torch  # local import: only needed to pick the Whisper device

    input_path = "audio_multi.mp3"
    os.rename(audio_path, input_path)

    # Fall back to CPU when no GPU is present: the original hard-coded
    # .to("cuda"), which crashes on CPU-only hosts (e.g. free Spaces).
    device = "cuda" if torch.cuda.is_available() else "cpu"
    asr_model = whisper.load_model(model_size).to(device)

    # Whisper expects None (not the string "auto") for auto-detection.
    whisper_language = None if language == "auto" else language

    diarization = diarization_pipeline(input_path)
    segments = []

    for turn, _, speaker in diarization.itertracks(yield_label=True):
        start_time, end_time = turn.start, turn.end

        # Cut the turn out as 16 kHz mono WAV (what Whisper resamples to).
        tmp_chunk = f"tmp_{start_time:.2f}_{end_time:.2f}.wav"
        subprocess.run(
            ["ffmpeg", "-y", "-i", input_path, "-ss", f"{start_time:.3f}",
             "-to", f"{end_time:.3f}", "-ar", "16000", "-ac", "1",
             "-loglevel", "error", tmp_chunk])
        result = asr_model.transcribe(tmp_chunk, language=whisper_language)
        text = result['text'].strip()
        os.remove(tmp_chunk)

        if text:  # skip turns Whisper transcribed as empty
            segments.append({
                "start": start_time,
                "end": end_time,
                "speaker": speaker,
                "text": text,
            })

    def format_ts(seconds):
        # h:mm:ss.mmm — hoisted out of the loop (the original redefined it
        # once per segment).
        return str(datetime.timedelta(seconds=int(seconds))) + f".{int((seconds % 1) * 1000):03d}"

    # Map raw pyannote labels to stable human-friendly names, numbered in
    # order of first appearance (说话人1, 说话人2, ...).
    speaker_map = {}
    output_lines = []
    for seg in segments:
        if seg["speaker"] not in speaker_map:
            speaker_map[seg["speaker"]] = f"说话人{len(speaker_map) + 1}"
        start_str = format_ts(seg["start"])
        end_str = format_ts(seg["end"])
        output_lines.append(
            f"[{start_str} - {end_str}] {speaker_map[seg['speaker']]}:{seg['text']}")

    with open("transcription_with_speakers.txt", "w", encoding="utf-8") as f:
        f.write("\n".join(output_lines))

    return "transcription_with_speakers.txt"
145
+
146
+
147
+ # def main(audio_file, is_multispeaker, language, model_size):
148
+ # start_time = time.time()
149
+ # result_file = transcribe_multi(audio_file, language, model_size) if is_multispeaker else transcribe_single(audio_file, language, model_size)
150
+ # end_time = time.time()
151
+ # elapsed = end_time - start_time
152
+ # time_info = f"⏱️ 转录耗时:{elapsed:.2f} 秒"
153
+ # return result_file, time_info
154
+
155
+ # 运行转录任务,放在线程里
156
+ import threading
157
+
158
def main_with_progress(audio_file, is_multispeaker, language, model_size):
    """Gradio generator: run the transcription in a worker thread and stream
    an elapsed-time status update once per second until it finishes.

    Yields:
        (result_file, status_text) pairs matching the two outputs wired in
        the click handler. The original sometimes yielded a third value,
        which Gradio rejects as too many outputs.
    """
    start_time = time.time()
    yield None, "⏳ 正在转录,请稍等..."

    # Worker results land here; "error" captures exceptions so a failed
    # transcription no longer reports success with an empty file.
    holder = {"file": None, "error": None}

    def transcribe_task():
        try:
            holder["file"] = (
                transcribe_multi(audio_file, language, model_size)
                if is_multispeaker
                else transcribe_single(audio_file, language, model_size))
        except Exception as exc:
            holder["error"] = exc

    thread = threading.Thread(target=transcribe_task)
    thread.start()

    # Stream a status update every second while the worker runs.
    while thread.is_alive():
        elapsed = time.time() - start_time
        yield None, f"⏳ 正在转录中... 已耗时 {elapsed:.1f} 秒"
        time.sleep(1)
    thread.join()

    elapsed = time.time() - start_time
    if holder["error"] is not None:
        yield None, f"❌ 转录失败:{holder['error']}"
    else:
        yield holder["file"], f"✅ 转录完成,⏱️总耗时:{elapsed:.2f} 秒"
180
+
181
+
182
# UI layout: audio upload + options on top, result file and timing below.
with gr.Blocks() as demo:
    gr.Markdown("# Whisper + PyAnnote 音频转录系统")

    audio_input = gr.Audio(type="filepath", label="上传音频")
    is_multi = gr.Checkbox(label="是否为多人对话音频(启用说话人分离)")
    # (display label, whisper language code) pairs; "auto" is translated to
    # None inside the transcribe functions.
    language = gr.Dropdown(
        choices=[
            ("自动识别", "auto"),
            ("英语 (English)", "en"),
            ("中文 (Chinese)", "zh"),
            ("法语 (French)", "fr"),
            ("德语 (German)", "de"),
            ("西班牙语 (Spanish)", "es"),
            ("日语 (Japanese)", "ja"),
            ("韩语 (Korean)", "ko"),
            ("葡萄牙语 (Portuguese)", "pt"),
            ("俄语 (Russian)", "ru"),
        ],
        value="auto",
        label="音频语言",
    )
    model_size = gr.Dropdown(
        choices=[
            ("tiny (39M)", "tiny"),
            ("base (74M)", "base"),
            ("small (244M)", "small"),
            ("medium (769M)", "medium"),
            ("large (1550M)", "large"),
        ],
        value="base",
        label="Whisper 模型规模",
    )
    output_file = gr.File(label="转录结果(.txt)")
    # NOTE: `live` is not a Textbox constructor argument (it configures
    # Interface/event listeners), so the invalid kwarg was dropped; the
    # generator handler streams updates into this box regardless.
    elapsed_time = gr.Textbox(label="处理用时", interactive=False)

    run_btn = gr.Button("开始转录")
    # main_with_progress is a generator, so Gradio streams each yielded
    # (file, status) pair into these two outputs.
    run_btn.click(
        fn=main_with_progress,
        inputs=[audio_input, is_multi, language, model_size],
        outputs=[output_file, elapsed_time],
    )

if __name__ == "__main__":
    demo.launch()