Ander1 commited on
Commit
bf1a837
·
verified ·
1 Parent(s): 89717fb

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +307 -0
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from dotenv import load_dotenv
3
+ import os
4
+ from elevenlabs_stt import transcribe_audio as transcribe_audio_elevenlabs
5
+ from whisper_stt import transcribe_audio_whisper, get_available_models, get_model_description
6
+ from transcript_refiner import refine_transcript
7
+ from utils import check_file_size, split_large_audio
8
+ import logging
9
+
10
# Load environment variables (API keys, etc.) from a local .env file.
load_dotenv()

# Configure module-wide logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
# Selectable OpenAI models: identity mapping from display name to API model id.
# Insertion order matters — it drives the order of the UI dropdown choices.
OPENAI_MODELS = {
    model_id: model_id
    for model_id in ("gpt-4o", "gpt-4o-mini", "o3-mini", "o1-mini")
}
24
+
25
def _price_entry(name, input_usd, cached_usd, output_usd):
    """Build one model's pricing record (all prices in USD per 1M tokens)."""
    return {
        "display_name": name,
        "input": input_usd,
        "cached_input": cached_usd,
        "output": output_usd,
    }

# Per-model pricing table (USD per 1M tokens).
MODEL_CONFIG = {
    "gpt-4o": _price_entry("gpt-4o", 2.50, 1.25, 10.00),
    "gpt-4o-mini": _price_entry("gpt-4o-mini", 0.15, 0.075, 0.60),
    "o1-mini": _price_entry("o1-mini", 1.10, 0.55, 4.40),
    "o3-mini": _price_entry("o3-mini", 1.10, 0.55, 4.40),
}
52
+
53
def process_audio(
    audio_file,
    transcription_service,
    openai_api_key,
    elevenlabs_api_key,
    whisper_model,
    openai_model,
    language_mode,
    language_code,
    context_prompt,
    temperature,
    enable_diarization
):
    """Transcribe an audio file, refine the transcript with an OpenAI model,
    and estimate the token cost.

    Parameters mirror the Gradio inputs: the uploaded file path, the chosen
    transcription backend ("Whisper" or "ElevenLabs"), API keys, model names,
    language mode/code, an optional context prompt, sampling temperature, and
    a speaker-diarization flag (ElevenLabs only).

    Returns a 5-tuple consumed by the UI:
        (raw transcript, refined text, summary, total tokens, cost string "NT$ x.xx")
    On failure, a Chinese error message occupies the first slot.
    """
    try:
        # Fix: reject a missing upload up front instead of crashing inside
        # the transcription/splitting helpers.
        if not audio_file:
            return "請提供音訊檔案", "", "", 0, "NT$ 0.00"

        # Validate required API keys.
        if not openai_api_key:
            return "請提供 OpenAI API 金鑰", "", "", 0, "NT$ 0.00"

        if transcription_service == "ElevenLabs" and not elevenlabs_api_key:
            return "請提供 ElevenLabs API 金鑰", "", "", 0, "NT$ 0.00"

        def _transcribe(path):
            # Dispatch to the selected backend (was duplicated in both branches).
            if transcription_service == "Whisper":
                return transcribe_audio_whisper(
                    path,
                    model_name=whisper_model,
                    language=language_code if language_mode == "指定語言" else None,
                    initial_prompt=context_prompt
                )
            return transcribe_audio_elevenlabs(
                api_key=elevenlabs_api_key,
                file_path=path,
                diarize=enable_diarization
            )

        full_transcript = ""

        if check_file_size(audio_file):
            # File exceeds the backend size limit: split and transcribe per segment.
            audio_segments = split_large_audio(audio_file)
            if not audio_segments:
                return "檔案分割失敗", "", "", 0, "NT$ 0.00"

            for segment_path in audio_segments:
                try:
                    result = _transcribe(segment_path)
                    if result:
                        full_transcript += result["text"] + "\n"
                finally:
                    # Fix: remove the temporary segment even when transcription
                    # fails — the original only deleted it on success, leaking
                    # files on disk for failed runs.
                    if os.path.exists(segment_path):
                        os.remove(segment_path)
        else:
            # Small enough to transcribe in one call.
            result = _transcribe(audio_file)
            if result:
                full_transcript = result["text"]

        if full_transcript:
            refined = refine_transcript(
                raw_text=full_transcript,
                api_key=openai_api_key,
                model=openai_model,
                temperature=temperature,
                context=context_prompt
            )

            if refined:
                # Token usage reported by the refinement call.
                current_usage = refined.get("usage", {})
                input_tokens = current_usage.get("prompt_tokens", 0)
                output_tokens = current_usage.get("completion_tokens", 0)
                total_tokens = input_tokens + output_tokens

                # Cost: USD per 1M tokens, converted to NT$.
                # NOTE(review): USD→NTD rate is hard-coded at 31.5 — confirm.
                model_price = MODEL_CONFIG[openai_model]
                input_cost = (input_tokens / 1_000_000) * model_price["input"]
                output_cost = (output_tokens / 1_000_000) * model_price["output"]
                total_cost_ntd = (input_cost + output_cost) * 31.5

                return (
                    full_transcript,
                    refined["corrected"],
                    refined["summary"],
                    total_tokens,
                    f"NT$ {total_cost_ntd:.2f}"
                )

            # Fix: refinement failed, but transcription succeeded — return the
            # raw transcript instead of discarding the user's result.
            return full_transcript, "", "", 0, "NT$ 0.00"

        return "處理失敗", "", "", 0, "NT$ 0.00"

    except Exception as e:
        logger.error(f"處理失敗:{str(e)}")
        return f"處理失敗:{str(e)}", "", "", 0, "NT$ 0.00"
159
+
160
def create_gradio_interface():
    """Build and return the Gradio Blocks UI.

    Layout: a two-column page — inputs (audio upload, API keys, model,
    language and misc settings, process button) on the left, outputs
    (raw/refined/summary text, token count, cost) on the right.
    """
    with gr.Blocks(title="音訊轉文字與優化系統") as app:
        gr.Markdown("# 音訊轉文字與優化系統")

        with gr.Row():
            with gr.Column(scale=2):
                # Audio input (filepath is passed straight to process_audio).
                audio_input = gr.Audio(
                    label="上傳音訊檔案",
                    type="filepath"
                )

                # API key inputs (masked).
                with gr.Group():
                    gr.Markdown("### API 金鑰設定")
                    openai_key = gr.Textbox(
                        label="OpenAI API 金鑰",
                        type="password"
                    )
                    elevenlabs_key = gr.Textbox(
                        label="ElevenLabs API 金鑰",
                        type="password"
                    )

                # Model selection: transcription backend, Whisper size,
                # and OpenAI refinement model.
                with gr.Group():
                    gr.Markdown("### 模型設定")
                    service = gr.Radio(
                        choices=["Whisper", "ElevenLabs"],
                        label="轉錄服務",
                        value="Whisper"
                    )
                    whisper_model_choice = gr.Dropdown(
                        choices=["tiny", "base", "small", "medium", "large"],
                        label="Whisper 模型",
                        value="small"
                    )
                    openai_model_choice = gr.Dropdown(
                        choices=list(OPENAI_MODELS.keys()),
                        label="OpenAI 模型",
                        value="o3-mini"
                    )

                # Language settings; the language-code box is hidden unless
                # the "指定語言" (specify language) mode is selected.
                with gr.Group():
                    gr.Markdown("### 語言設定")
                    lang_mode = gr.Radio(
                        choices=["自動偵測", "指定語言", "混合語言"],
                        label="語言模式",
                        value="自動偵測"
                    )
                    lang_code = gr.Textbox(
                        label="語言代碼",
                        placeholder="例如:zh-tw",
                        visible=False
                    )

                # Misc settings: context prompt, temperature, diarization.
                with gr.Group():
                    gr.Markdown("### 其他設定")
                    context = gr.Textbox(
                        label="背景提示詞",
                        placeholder="輸入相關背景資訊",
                        lines=3
                    )
                    temp = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.5,
                        label="創意程度"
                    )
                    diarize = gr.Checkbox(
                        label="啟用說話者辨識",
                        value=False
                    )

                # Process button.
                process_btn = gr.Button("處理音訊", variant="primary")

            with gr.Column(scale=3):
                # Output area: transcripts, summary, token usage, cost.
                original_text = gr.Textbox(
                    label="原始轉錄文字",
                    lines=10
                )
                refined_text = gr.Textbox(
                    label="優化後的文字",
                    lines=10
                )
                summary_text = gr.Textbox(
                    label="文字摘要",
                    lines=5
                )
                token_count = gr.Number(
                    label="Token 使用量",
                    value=0
                )
                cost_display = gr.Textbox(
                    label="費用",
                    value="NT$ 0.00"
                )

        # Toggle the language-code textbox when the language mode changes.
        lang_mode.change(
            fn=lambda x: gr.update(visible=(x == "指定語言")),
            inputs=[lang_mode],
            outputs=[lang_code]
        )

        # Wire the process button to process_audio; input order must match
        # the function's parameter order.
        process_btn.click(
            fn=process_audio,
            inputs=[
                audio_input,
                service,
                openai_key,
                elevenlabs_key,
                whisper_model_choice,
                openai_model_choice,
                lang_mode,
                lang_code,
                context,
                temp,
                diarize
            ],
            outputs=[
                original_text,
                refined_text,
                summary_text,
                token_count,
                cost_display
            ]
        )

        # Author attribution.
        gr.Markdown("""
        ### Created by
        **Tseng Yao Hsien**
        Endocrinologist
        Tungs' Taichung MetroHarbor Hospital
        """)

    return app
304
+
305
if __name__ == "__main__":
    # Build the UI and serve it with a public share link.
    demo = create_gradio_interface()
    demo.launch(share=True)