Ander1 committed on
Commit
5bfef5b
·
verified ·
1 Parent(s): 8eb1ae4

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +150 -296
app.py CHANGED
@@ -1,318 +1,172 @@
1
  import gradio as gr
2
  import os
3
- from elevenlabs_stt import transcribe_audio as transcribe_audio_elevenlabs
4
  from whisper_stt import transcribe_audio_whisper
5
  from transcript_refiner import refine_transcript
6
- from utils import check_file_size, split_large_audio
7
- import logging
8
-
9
- # 設定日誌
10
- logging.basicConfig(level=logging.INFO)
11
- logger = logging.getLogger(__name__)
12
-
13
- # 定義可用的 OpenAI 模型
14
- OPENAI_MODELS = {
15
- "gpt-4o": "gpt-4o",
16
- "gpt-4o-mini": "gpt-4o-mini",
17
- "o3-mini": "o3-mini",
18
- "o1-mini": "o1-mini"
19
- }
20
-
21
- # 模型設定和價格(USD per 1M tokens)
22
- MODEL_CONFIG = {
23
- "gpt-4o": {
24
- "display_name": "gpt-4o",
25
- "input": 2.50,
26
- "cached_input": 1.25,
27
- "output": 10.00
28
- },
29
- "gpt-4o-mini": {
30
- "display_name": "gpt-4o-mini",
31
- "input": 0.15,
32
- "cached_input": 0.075,
33
- "output": 0.60
34
- },
35
- "o1-mini": {
36
- "display_name": "o1-mini",
37
- "input": 1.10,
38
- "cached_input": 0.55,
39
- "output": 4.40
40
- },
41
- "o3-mini": {
42
- "display_name": "o3-mini",
43
- "input": 1.10,
44
- "cached_input": 0.55,
45
- "output": 4.40
46
- }
47
- }
48
 
49
  def process_audio(
50
- audio_file,
51
- transcription_service,
52
  openai_api_key,
53
  elevenlabs_api_key,
54
- whisper_model,
55
  openai_model,
56
- language_mode,
57
- language_code,
58
- context_prompt,
59
- temperature,
60
- enable_diarization
61
  ):
62
- """處理音訊檔案並返回結果"""
63
  try:
64
- # 檢查必要的 API 金鑰
65
- if not openai_api_key or len(openai_api_key.strip()) < 20:
66
- return "請提供有效的 OpenAI API 金鑰", "", "", 0, "NT$ 0.00"
67
-
68
- if transcription_service == "ElevenLabs" and (not elevenlabs_api_key or len(elevenlabs_api_key.strip()) < 20):
69
- return "請提供有效的 ElevenLabs API 金鑰", "", "", 0, "NT$ 0.00"
70
-
71
- # 初始化變數
72
- full_transcript = ""
73
 
74
- # 檢查檔案大小
75
- if check_file_size(audio_file):
76
- # 檔案需要分割
77
- audio_segments = split_large_audio(audio_file)
78
- if not audio_segments:
79
- return "檔案分割失敗", "", "", 0, "NT$ 0.00"
80
-
81
- for segment_path in audio_segments:
82
- if transcription_service == "Whisper":
83
- result = transcribe_audio_whisper(
84
- segment_path,
85
- model_name=whisper_model,
86
- language=language_code if language_mode == "指定語言" else None,
87
- initial_prompt=context_prompt
88
- )
89
- else:
90
- result = transcribe_audio_elevenlabs(
91
- api_key=elevenlabs_api_key,
92
- file_path=segment_path,
93
- diarize=enable_diarization
94
- )
95
-
96
- if result:
97
- full_transcript += result["text"] + "\n"
98
- os.remove(segment_path)
99
- else:
100
- # 直接轉錄
101
- if transcription_service == "Whisper":
102
- result = transcribe_audio_whisper(
103
- audio_file,
104
- model_name=whisper_model,
105
- language=language_code if language_mode == "指定語言" else None,
106
- initial_prompt=context_prompt
107
- )
108
- else:
109
- result = transcribe_audio_elevenlabs(
110
- api_key=elevenlabs_api_key,
111
- file_path=audio_file,
112
- diarize=enable_diarization
113
- )
114
-
115
- if result:
116
- full_transcript = result["text"]
117
-
118
- # 優化文字
119
- if full_transcript:
120
- refined = refine_transcript(
121
- raw_text=full_transcript,
122
- api_key=openai_api_key,
123
- model=openai_model,
124
- temperature=temperature,
125
- context=context_prompt
126
  )
127
-
128
- if refined:
129
- # 計算成本
130
- current_usage = refined.get("usage", {})
131
- input_tokens = current_usage.get("prompt_tokens", 0)
132
- output_tokens = current_usage.get("completion_tokens", 0)
133
- total_tokens = input_tokens + output_tokens
134
-
135
- # 計算費用
136
- model_price = MODEL_CONFIG[openai_model]
137
- input_cost = (input_tokens / 1_000_000) * model_price["input"]
138
- output_cost = (output_tokens / 1_000_000) * model_price["output"]
139
- total_cost_usd = input_cost + output_cost
140
- total_cost_ntd = total_cost_usd * 31.5
141
-
142
- return (
143
- full_transcript,
144
- refined["corrected"],
145
- refined["summary"],
146
- total_tokens,
147
- f"NT$ {total_cost_ntd:.2f}"
148
- )
149
-
150
- return "處理失敗", "", "", 0, "NT$ 0.00"
151
-
152
  except Exception as e:
153
- logger.error(f"處理失敗:{str(e)}")
154
- return f"處理失敗:{str(e)}", "", "", 0, "NT$ 0.00"
155
  finally:
156
  # 清除敏感資訊
157
- del openai_api_key
158
- del elevenlabs_api_key
 
 
159
 
160
- def create_gradio_interface():
161
- """建立 Gradio 介面"""
162
- with gr.Blocks(title="音訊轉文字與優化系統") as app:
163
- gr.Markdown("# 音訊轉文字與優化系統")
164
-
165
- with gr.Row():
166
- with gr.Column(scale=2):
167
- # 音訊輸入
168
- audio_input = gr.Audio(
169
- label="上傳音訊檔案",
170
- type="filepath"
171
- )
172
-
173
- # API 金鑰
174
- with gr.Group():
175
- gr.Markdown("""### API 金鑰設定
176
- > **安全提示:**
177
- > - API 金鑰僅在當前處理中使用,不會被儲存
178
- > - 每次使用需重新輸入以確保安全性
179
- > - 請勿與他人分享您的 API 金鑰
180
- """)
181
- openai_key = gr.Textbox(
182
- label="OpenAI API 金鑰",
183
- type="password",
184
- placeholder="sk-...",
185
- value="",
186
- every=None # 確保不會被快取
187
- )
188
- elevenlabs_key = gr.Textbox(
189
- label="ElevenLabs API 金鑰",
190
- type="password",
191
- placeholder="輸入您的 ElevenLabs API 金鑰",
192
- value="",
193
- every=None # 確保不會被快取
194
- )
195
-
196
- # 模型選擇
197
- with gr.Group():
198
- gr.Markdown("### 模型設定")
199
- service = gr.Radio(
200
- choices=["Whisper", "ElevenLabs"],
201
- label="轉錄服務",
202
- value="Whisper"
203
- )
204
- whisper_model_choice = gr.Dropdown(
205
- choices=["tiny", "base", "small", "medium", "large"],
206
- label="Whisper 模型",
207
- value="small"
208
- )
209
- openai_model_choice = gr.Dropdown(
210
- choices=list(OPENAI_MODELS.keys()),
211
- label="OpenAI 模型",
212
- value="o3-mini"
213
- )
214
-
215
- # 語言設定
216
- with gr.Group():
217
- gr.Markdown("### 語言設定")
218
- lang_mode = gr.Radio(
219
- choices=["自動偵測", "指定語言", "混合語言"],
220
- label="語言模式",
221
- value="自動偵測"
222
- )
223
- lang_code = gr.Textbox(
224
- label="語言代碼",
225
- placeholder="例如:zh-tw",
226
- visible=False
227
- )
228
-
229
- # 其他設定
230
- with gr.Group():
231
- gr.Markdown("### 其他設定")
232
- context = gr.Textbox(
233
- label="背景提示詞",
234
- placeholder="輸入相關背景資訊",
235
- lines=3
236
- )
237
- temp = gr.Slider(
238
- minimum=0,
239
- maximum=1,
240
- value=0.5,
241
- label="創意程度"
242
- )
243
- diarize = gr.Checkbox(
244
- label="啟用說話者辨識",
245
- value=False
246
- )
247
-
248
- # 處理按鈕
249
- process_btn = gr.Button("處理音訊", variant="primary")
250
 
251
- with gr.Column(scale=3):
252
- # 輸出區域
253
- original_text = gr.Textbox(
254
- label="原始轉錄文字",
255
- lines=10
256
- )
257
- refined_text = gr.Textbox(
258
- label="優化後的文字",
259
- lines=10
260
- )
261
- summary_text = gr.Textbox(
262
- label="文字摘要",
263
- lines=5
264
  )
265
- token_count = gr.Number(
266
- label="Token 使用量",
267
- value=0
 
 
 
268
  )
269
- cost_display = gr.Textbox(
270
- label="費用",
271
- value="NT$ 0.00"
272
- )
273
-
274
- # 更新語言代碼輸入框的可見性
275
- lang_mode.change(
276
- fn=lambda x: gr.update(visible=(x == "指定語言")),
277
- inputs=[lang_mode],
278
- outputs=[lang_code]
279
- )
280
-
281
- # 處理按鈕點擊事件
282
- process_btn.click(
283
- fn=process_audio,
284
- inputs=[
285
- audio_input,
286
- service,
287
- openai_key,
288
- elevenlabs_key,
289
- whisper_model_choice,
290
- openai_model_choice,
291
- lang_mode,
292
- lang_code,
293
- context,
294
- temp,
295
- diarize
296
- ],
297
- outputs=[
298
- original_text,
299
- refined_text,
300
- summary_text,
301
- token_count,
302
- cost_display
303
- ]
304
- )
305
-
306
- # 作者資訊
307
- gr.Markdown("""
308
- ### Created by
309
- **Tseng Yao Hsien**
310
- Endocrinologist
311
- Tungs' Taichung MetroHarbor Hospital
312
- """)
313
 
314
- return app
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315
 
316
- if __name__ == "__main__":
317
- app = create_gradio_interface()
318
- app.launch(share=True)
 
1
  import gradio as gr
2
  import os
3
+ from elevenlabs_stt import transcribe_audio_elevenlabs
4
  from whisper_stt import transcribe_audio_whisper
5
  from transcript_refiner import refine_transcript
6
+ from utils import calculate_tokens_and_cost, OPENAI_MODELS, MODEL_PRICES
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
def process_audio(
    audio_file,
    openai_api_key,
    elevenlabs_api_key,
    service_choice,
    openai_model,
    language,
    speaker_detection=False,
    creativity=0.5
):
    """Transcribe an audio file, refine the transcript, and report token usage.

    Args:
        audio_file: Path of the uploaded audio file (the UI supplies a
            filepath); falsy when nothing was uploaded.
        openai_api_key: OpenAI key forwarded to ``refine_transcript``.
        elevenlabs_api_key: ElevenLabs key; only validated and used when
            ``service_choice`` is ``"ElevenLabs"``.
        service_choice: ``"ElevenLabs"`` selects the ElevenLabs transcriber;
            any other value falls back to Whisper.
        openai_model: Model identifier forwarded to the refiner and to the
            token/cost calculation.
        language: Language code forwarded to the selected transcriber.
        speaker_detection: Forwarded to the ElevenLabs transcriber only.
        creativity: Forwarded to ``refine_transcript`` (presumably the
            sampling temperature -- TODO confirm against transcript_refiner).

    Returns:
        A 4-tuple ``(transcript, refined_text, tokens_info, cost_info)``.
        On any failure the first element is a user-facing message and the
        remaining three are empty strings; no exception propagates to the
        caller.
    """
    try:
        # Pasted keys frequently carry stray whitespace or a trailing newline.
        # Normalise before validating so a blank-padded value is rejected and
        # the services receive the clean key.
        openai_api_key = (openai_api_key or "").strip()
        elevenlabs_api_key = (elevenlabs_api_key or "").strip()
        use_elevenlabs = service_choice == "ElevenLabs"

        if len(openai_api_key) < 20:
            return "請輸入有效的 OpenAI API 金鑰", "", "", ""

        if use_elevenlabs and len(elevenlabs_api_key) < 20:
            return "請輸入有效的 ElevenLabs API 金鑰", "", "", ""

        # The button can be clicked before anything is uploaded; fail with a
        # clear message instead of handing None to the transcriber.
        if not audio_file:
            return "請先上傳音訊檔案", "", "", ""

        if isinstance(language, str):
            language = language.strip()

        # Speech to text
        if use_elevenlabs:
            transcript = transcribe_audio_elevenlabs(
                audio_file,
                elevenlabs_api_key,
                language=language,
                speaker_detection=speaker_detection
            )
        else:  # Whisper
            transcript = transcribe_audio_whisper(
                audio_file,
                language=language
            )

        # Nothing was transcribed: skip the (billed) refinement request.
        if not transcript:
            return "轉錄失敗:未取得任何文字", "", "", ""

        # Refine the raw transcript
        refined_text = refine_transcript(
            transcript,
            openai_api_key,
            openai_model,
            creativity
        )

        # Token usage and cost summary
        tokens_info, cost_info = calculate_tokens_and_cost(
            transcript,
            refined_text,
            openai_model
        )

        return transcript, refined_text, tokens_info, cost_info

    except Exception as e:
        # UI boundary: surface the failure in the first output box rather
        # than letting the event handler raise.
        return f"錯誤:{e}", "", "", ""
    # NOTE: the keys only ever live in this call's local variables, which are
    # released on return; the previous `del` in a `finally` block added nothing.
65
 
# Build the Gradio interface.
# The model dropdown is populated from OPENAI_MODELS; its default must be one
# of those choices, so fall back to the first available model when the
# preferred one is not in the table.
_model_choices = list(OPENAI_MODELS.keys())
_preferred_model = "gpt-3.5-turbo"
if _preferred_model in _model_choices:
    _default_model = _preferred_model
else:
    _default_model = _model_choices[0] if _model_choices else None

with gr.Blocks() as demo:
    gr.Markdown("# 音訊轉文字與優化系統")

    with gr.Row():
        # Left column: inputs and settings
        with gr.Column():
            audio_input = gr.Audio(
                label="上傳音訊檔案",
                type="filepath"
            )

            # API keys are masked inputs that start empty on every page load.
            with gr.Row():
                openai_key = gr.Textbox(
                    label="OpenAI API 金鑰",
                    placeholder="輸入您的 OpenAI API 金鑰",
                    type="password",
                    value=""
                )
                elevenlabs_key = gr.Textbox(
                    label="ElevenLabs API 金鑰",
                    placeholder="輸入您的 ElevenLabs API 金鑰(如果使用 ElevenLabs)",
                    type="password",
                    value=""
                )

            service = gr.Radio(
                choices=["Whisper", "ElevenLabs"],
                label="選擇轉錄服務",
                value="Whisper"
            )

            model = gr.Dropdown(
                choices=_model_choices,
                label="選擇 OpenAI 模型",
                value=_default_model
            )

            language = gr.Textbox(
                label="語言(可選)",
                placeholder="輸入語言代碼,例如:zh-TW、en、ja",
                value=""
            )

            speaker = gr.Checkbox(
                label="啟用說話者辨識(僅限 ElevenLabs)",
                value=False
            )

            creativity = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.5,
                label="創意程度"
            )

            process_btn = gr.Button("處理音訊")

        # Right column: results
        with gr.Column():
            original_output = gr.Textbox(
                label="原始轉錄文字",
                lines=10
            )
            refined_output = gr.Textbox(
                label="優化後文字",
                lines=10
            )
            token_info = gr.Textbox(
                label="Token 使用資訊",
                lines=3
            )
            cost_info = gr.Textbox(
                label="費用資訊",
                lines=3
            )

    gr.Markdown("""
    ### 安全性說明
    - API 金鑰僅在當前處理中使用
    - 不會儲存任何敏感資訊
    - 每次使用需重新輸入 API 金鑰
    """)

    # Wire the button to the handler. The input order must match the
    # positional parameters of process_audio, and the output order must match
    # the 4-tuple it returns.
    process_btn.click(
        fn=process_audio,
        inputs=[
            audio_input,
            openai_key,
            elevenlabs_key,
            service,
            model,
            language,
            speaker,
            creativity
        ],
        outputs=[
            original_output,
            refined_output,
            token_info,
            cost_info
        ]
    )

# Start the application
demo.launch()