Ander1
/

audio2text

Model card Files Files and versions

xet

Community

Ander1 committed on Mar 19, 2025

Commit

5bfef5b

verified ·

1 Parent(s): 8eb1ae4

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +150 -296

app.py CHANGED Viewed

@@ -1,318 +1,172 @@
 import gradio as gr
 import os
-from elevenlabs_stt import transcribe_audio as transcribe_audio_elevenlabs
 from whisper_stt import transcribe_audio_whisper
 from transcript_refiner import refine_transcript
-from utils import check_file_size, split_large_audio
-import logging
-# 設定日誌
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-# 定義可用的 OpenAI 模型
-OPENAI_MODELS = {
-    "gpt-4o": "gpt-4o",
-    "gpt-4o-mini": "gpt-4o-mini",
-    "o3-mini": "o3-mini",
-    "o1-mini": "o1-mini"
-}
-# 模型設定和價格（USD per 1M tokens）
-MODEL_CONFIG = {
-    "gpt-4o": {
-        "display_name": "gpt-4o",
-        "input": 2.50,
-        "cached_input": 1.25,
-        "output": 10.00
-    },
-    "gpt-4o-mini": {
-        "display_name": "gpt-4o-mini",
-        "input": 0.15,
-        "cached_input": 0.075,
-        "output": 0.60
-    },
-    "o1-mini": {
-        "display_name": "o1-mini",
-        "input": 1.10,
-        "cached_input": 0.55,
-        "output": 4.40
-    },
-    "o3-mini": {
-        "display_name": "o3-mini",
-        "input": 1.10,
-        "cached_input": 0.55,
-        "output": 4.40
-    }
-}
 def process_audio(
-    audio_file,
-    transcription_service,
     openai_api_key,
     elevenlabs_api_key,
-    whisper_model,
     openai_model,
-    language_mode,
-    language_code,
-    context_prompt,
-    temperature,
-    enable_diarization
 ):
-    """處理音訊檔案並返回結果"""
     try:
-        # 檢查必要的 API 金鑰
-        if not openai_api_key or len(openai_api_key.strip()) < 20:
-            return "請提供有效的 OpenAI API 金鑰", "", "", 0, "NT$ 0.00"
-        if transcription_service == "ElevenLabs" and (not elevenlabs_api_key or len(elevenlabs_api_key.strip()) < 20):
-            return "請提供有效的 ElevenLabs API 金鑰", "", "", 0, "NT$ 0.00"
-        # 初始化變數
-        full_transcript = ""
-        # 檢查檔案大小
-        if check_file_size(audio_file):
-            # 檔案需要分割
-            audio_segments = split_large_audio(audio_file)
-            if not audio_segments:
-                return "檔案分割失敗", "", "", 0, "NT$ 0.00"
-            for segment_path in audio_segments:
-                if transcription_service == "Whisper":
-                    result = transcribe_audio_whisper(
-                        segment_path,
-                        model_name=whisper_model,
-                        language=language_code if language_mode == "指定語言" else None,
-                        initial_prompt=context_prompt
-                    )
-                else:
-                    result = transcribe_audio_elevenlabs(
-                        api_key=elevenlabs_api_key,
-                        file_path=segment_path,
-                        diarize=enable_diarization
-                    )
-                if result:
-                    full_transcript += result["text"] + "\n"
-                os.remove(segment_path)
-        else:
-            # 直接轉錄
-            if transcription_service == "Whisper":
-                result = transcribe_audio_whisper(
-                    audio_file,
-                    model_name=whisper_model,
-                    language=language_code if language_mode == "指定語言" else None,
-                    initial_prompt=context_prompt
-                )
-            else:
-                result = transcribe_audio_elevenlabs(
-                    api_key=elevenlabs_api_key,
-                    file_path=audio_file,
-                    diarize=enable_diarization
-                )
-            if result:
-                full_transcript = result["text"]
-        # 優化文字
-        if full_transcript:
-            refined = refine_transcript(
-                raw_text=full_transcript,
-                api_key=openai_api_key,
-                model=openai_model,
-                temperature=temperature,
-                context=context_prompt
             )
-            if refined:
-                # 計算成本
-                current_usage = refined.get("usage", {})
-                input_tokens = current_usage.get("prompt_tokens", 0)
-                output_tokens = current_usage.get("completion_tokens", 0)
-                total_tokens = input_tokens + output_tokens
-                # 計算費用
-                model_price = MODEL_CONFIG[openai_model]
-                input_cost = (input_tokens / 1_000_000) * model_price["input"]
-                output_cost = (output_tokens / 1_000_000) * model_price["output"]
-                total_cost_usd = input_cost + output_cost
-                total_cost_ntd = total_cost_usd * 31.5
-                return (
-                    full_transcript,
-                    refined["corrected"],
-                    refined["summary"],
-                    total_tokens,
-                    f"NT$ {total_cost_ntd:.2f}"
-                )
-        return "處理失敗", "", "", 0, "NT$ 0.00"
     except Exception as e:
-        logger.error(f"處理失敗：{str(e)}")
-        return f"處理失敗：{str(e)}", "", "", 0, "NT$ 0.00"
     finally:
         # 清除敏感資訊
-        del openai_api_key
-        del elevenlabs_api_key
-def create_gradio_interface():
-    """建立 Gradio 介面"""
-    with gr.Blocks(title="音訊轉文字與優化系統") as app:
-        gr.Markdown("# 音訊轉文字與優化系統")
-        with gr.Row():
-            with gr.Column(scale=2):
-                # 音訊輸入
-                audio_input = gr.Audio(
-                    label="上傳音訊檔案",
-                    type="filepath"
-                )
-                # API 金鑰
-                with gr.Group():
-                    gr.Markdown("""### API 金鑰設定
-                    > **安全提示：**
-                    > - API 金鑰僅在當前處理中使用，不會被儲存
-                    > - 每次使用需重新輸入以確保安全性
-                    > - 請勿與他人分享您的 API 金鑰
-                    """)
-                    openai_key = gr.Textbox(
-                        label="OpenAI API 金鑰",
-                        type="password",
-                        placeholder="sk-...",
-                        value="",
-                        every=None  # 確保不會被快取
-                    )
-                    elevenlabs_key = gr.Textbox(
-                        label="ElevenLabs API 金鑰",
-                        type="password",
-                        placeholder="輸入您的 ElevenLabs API 金鑰",
-                        value="",
-                        every=None  # 確保不會被快取
-                    )
-                # 模型選擇
-                with gr.Group():
-                    gr.Markdown("### 模型設定")
-                    service = gr.Radio(
-                        choices=["Whisper", "ElevenLabs"],
-                        label="轉錄服務",
-                        value="Whisper"
-                    )
-                    whisper_model_choice = gr.Dropdown(
-                        choices=["tiny", "base", "small", "medium", "large"],
-                        label="Whisper 模型",
-                        value="small"
-                    )
-                    openai_model_choice = gr.Dropdown(
-                        choices=list(OPENAI_MODELS.keys()),
-                        label="OpenAI 模型",
-                        value="o3-mini"
-                    )
-                # 語言設定
-                with gr.Group():
-                    gr.Markdown("### 語言設定")
-                    lang_mode = gr.Radio(
-                        choices=["自動偵測", "指定語言", "混合語言"],
-                        label="語言模式",
-                        value="自動偵測"
-                    )
-                    lang_code = gr.Textbox(
-                        label="語言代碼",
-                        placeholder="例如：zh-tw",
-                        visible=False
-                    )
-                # 其他設定
-                with gr.Group():
-                    gr.Markdown("### 其他設定")
-                    context = gr.Textbox(
-                    label="背景提示詞",
-                        placeholder="輸入相關背景資訊",
-                        lines=3
-                    )
-                    temp = gr.Slider(
-                        minimum=0,
-                        maximum=1,
-                        value=0.5,
-                        label="創意程度"
-                    )
-                    diarize = gr.Checkbox(
-                        label="啟用說話者辨識",
-                        value=False
-                    )
-                # 處理按鈕
-                process_btn = gr.Button("處理音訊", variant="primary")
-            with gr.Column(scale=3):
-                # 輸出區域
-                original_text = gr.Textbox(
-                    label="原始轉錄文字",
-                    lines=10
-                )
-                refined_text = gr.Textbox(
-                    label="優化後的文字",
-                    lines=10
-                )
-                summary_text = gr.Textbox(
-                    label="文字摘要",
-                    lines=5
                 )
-                token_count = gr.Number(
-                    label="Token 使用量",
-                    value=0
                 )
-                cost_display = gr.Textbox(
-                    label="費用",
-                    value="NT$ 0.00"
-                )
-        # 更新語言代碼輸入框的可見性
-        lang_mode.change(
-            fn=lambda x: gr.update(visible=(x == "指定語言")),
-            inputs=[lang_mode],
-            outputs=[lang_code]
-        )
-        # 處理按鈕點擊事件
-        process_btn.click(
-            fn=process_audio,
-            inputs=[
-                audio_input,
-                service,
-                openai_key,
-                elevenlabs_key,
-                whisper_model_choice,
-                openai_model_choice,
-                lang_mode,
-                lang_code,
-                context,
-                temp,
-                diarize
-            ],
-            outputs=[
-                original_text,
-                refined_text,
-                summary_text,
-                token_count,
-                cost_display
-            ]
-        )
-        # 作者資訊
-        gr.Markdown("""
-        ### Created by
-        **Tseng Yao Hsien**
-        Endocrinologist
-        Tungs' Taichung MetroHarbor Hospital
-        """)
-    return app
-if __name__ == "__main__":
-    app = create_gradio_interface()
-    app.launch(share=True)

 import gradio as gr
 import os
+from elevenlabs_stt import transcribe_audio_elevenlabs
 from whisper_stt import transcribe_audio_whisper
 from transcript_refiner import refine_transcript
+from utils import calculate_tokens_and_cost, OPENAI_MODELS, MODEL_PRICES
 def process_audio(
+    audio_file,
     openai_api_key,
     elevenlabs_api_key,
+    service_choice,
     openai_model,
+    language,
+    speaker_detection=False,
+    creativity=0.5
 ):
     try:
+        if not openai_api_key or len(openai_api_key) < 20:
+            return "請輸入有效的 OpenAI API 金鑰", "", "", ""
+        if service_choice == "ElevenLabs" and (not elevenlabs_api_key or len(elevenlabs_api_key) < 20):
+            return "請輸入有效的 ElevenLabs API 金鑰", "", "", ""
+        # 音訊轉文字
+        if service_choice == "ElevenLabs":
+            transcript = transcribe_audio_elevenlabs(
+                audio_file,
+                elevenlabs_api_key,
+                language=language,
+                speaker_detection=speaker_detection
             )
+        else:  # Whisper
+            transcript = transcribe_audio_whisper(
+                audio_file,
+                language=language
+            )
+        # 優化文字
+        refined_text = refine_transcript(
+            transcript,
+            openai_api_key,
+            openai_model,
+            creativity
+        )
+        # 計算 token 和費用
+        tokens_info, cost_info = calculate_tokens_and_cost(
+            transcript,
+            refined_text,
+            openai_model
+        )
+        return transcript, refined_text, tokens_info, cost_info
     except Exception as e:
+        return f"錯誤：{str(e)}", "", "", ""
     finally:
         # 清除敏感資訊
+        if 'openai_api_key' in locals():
+            del openai_api_key
+        if 'elevenlabs_api_key' in locals():
+            del elevenlabs_api_key
+# 創建 Gradio 介面
+with gr.Blocks() as demo:
+    gr.Markdown("# 音訊轉文字與優化系統")
+    with gr.Row():
+        with gr.Column():
+            audio_input = gr.Audio(
+                label="上傳音訊檔案",
+                type="filepath"
+            )
+            with gr.Row():
+                openai_key = gr.Textbox(
+                    label="OpenAI API 金鑰",
+                    placeholder="輸入您的 OpenAI API 金鑰",
+                    type="password",
+                    value="",
+                    every=None
                 )
+                elevenlabs_key = gr.Textbox(
+                    label="ElevenLabs API 金鑰",
+                    placeholder="輸入您的 ElevenLabs API 金鑰（如果使用 ElevenLabs）",
+                    type="password",
+                    value="",
+                    every=None
                 )
+            service = gr.Radio(
+                choices=["Whisper", "ElevenLabs"],
+                label="選擇轉錄服務",
+                value="Whisper"
+            )
+            model = gr.Dropdown(
+                choices=list(OPENAI_MODELS.keys()),
+                label="選擇 OpenAI 模型",
+                value="gpt-3.5-turbo"
+            )
+            language = gr.Textbox(
+                label="語言（可選）",
+                placeholder="輸入語言代碼，例如：zh-TW、en、ja",
+                value=""
+            )
+            speaker = gr.Checkbox(
+                label="啟用說話者辨識（僅限 ElevenLabs）",
+                value=False
+            )
+            creativity = gr.Slider(
+                minimum=0,
+                maximum=1,
+                value=0.5,
+                label="創意程度"
+            )
+            process_btn = gr.Button("處理音訊")
+        with gr.Column():
+            original_output = gr.Textbox(
+                label="原始轉錄文字",
+                lines=10
+            )
+            refined_output = gr.Textbox(
+                label="優化後文字",
+                lines=10
+            )
+            token_info = gr.Textbox(
+                label="Token 使用資訊",
+                lines=3
+            )
+            cost_info = gr.Textbox(
+                label="費用資訊",
+                lines=3
+            )
+    gr.Markdown("""
+    ### 安全性說明
+    - API 金鑰僅在當前處理中使用
+    - 不會儲存任何敏感資訊
+    - 每次使用需重新輸入 API 金鑰
+    """)
+    # 設定處理函數
+    process_btn.click(
+        fn=process_audio,
+        inputs=[
+            audio_input,
+            openai_key,
+            elevenlabs_key,
+            service,
+            model,
+            language,
+            speaker,
+            creativity
+        ],
+        outputs=[
+            original_output,
+            refined_output,
+            token_info,
+            cost_info
+        ]
+    )
+# 啟動應用程式
+demo.launch()