Ander1 commited on
Commit
bf1a837
·
verified ·
1 Parent(s): 89717fb

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +307 -0
app.py ADDED
@@ -0,0 +1,307 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from dotenv import load_dotenv
3
+ import os
4
+ from elevenlabs_stt import transcribe_audio as transcribe_audio_elevenlabs
5
+ from whisper_stt import transcribe_audio_whisper, get_available_models, get_model_description
6
+ from transcript_refiner import refine_transcript
7
+ from utils import check_file_size, split_large_audio
8
+ import logging
9
+
10
# Load environment variables (API keys, etc.) from a local .env file.
load_dotenv()

# Configure module-wide logging.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
16
+
17
# Selectable OpenAI models: identity mapping from display name to API model id.
# Insertion order matters — it drives the order of the UI dropdown choices.
OPENAI_MODELS = {
    model_id: model_id
    for model_id in ("gpt-4o", "gpt-4o-mini", "o3-mini", "o1-mini")
}
24
+
25
def _price_entry(name, input_usd, cached_usd, output_usd):
    """Build one model's pricing record (all prices in USD per 1M tokens)."""
    return {
        "display_name": name,
        "input": input_usd,
        "cached_input": cached_usd,
        "output": output_usd,
    }

# Per-model pricing table (USD per 1M tokens).
MODEL_CONFIG = {
    "gpt-4o": _price_entry("gpt-4o", 2.50, 1.25, 10.00),
    "gpt-4o-mini": _price_entry("gpt-4o-mini", 0.15, 0.075, 0.60),
    "o1-mini": _price_entry("o1-mini", 1.10, 0.55, 4.40),
    "o3-mini": _price_entry("o3-mini", 1.10, 0.55, 4.40),
}
52
+
53
def process_audio(
    audio_file,
    transcription_service,
    openai_api_key,
    elevenlabs_api_key,
    whisper_model,
    openai_model,
    language_mode,
    language_code,
    context_prompt,
    temperature,
    enable_diarization
):
    """Transcribe an audio file, refine the transcript with an OpenAI model,
    and estimate the token cost.

    Parameters mirror the Gradio inputs: the uploaded file path, the chosen
    transcription backend ("Whisper" or "ElevenLabs"), API keys, model names,
    language mode/code, an optional context prompt, sampling temperature, and
    a speaker-diarization flag (ElevenLabs only).

    Returns a 5-tuple consumed by the UI:
        (raw transcript, refined text, summary, total tokens, cost string "NT$ x.xx")
    On failure, a Chinese error message occupies the first slot.
    """
    try:
        # Fix: reject a missing upload up front instead of crashing inside
        # the transcription/splitting helpers.
        if not audio_file:
            return "請提供音訊檔案", "", "", 0, "NT$ 0.00"

        # Validate required API keys.
        if not openai_api_key:
            return "請提供 OpenAI API 金鑰", "", "", 0, "NT$ 0.00"

        if transcription_service == "ElevenLabs" and not elevenlabs_api_key:
            return "請提供 ElevenLabs API 金鑰", "", "", 0, "NT$ 0.00"

        def _transcribe(path):
            # Dispatch to the selected backend (was duplicated in both branches).
            if transcription_service == "Whisper":
                return transcribe_audio_whisper(
                    path,
                    model_name=whisper_model,
                    language=language_code if language_mode == "指定語言" else None,
                    initial_prompt=context_prompt
                )
            return transcribe_audio_elevenlabs(
                api_key=elevenlabs_api_key,
                file_path=path,
                diarize=enable_diarization
            )

        full_transcript = ""

        if check_file_size(audio_file):
            # File exceeds the backend size limit: split and transcribe per segment.
            audio_segments = split_large_audio(audio_file)
            if not audio_segments:
                return "檔案分割失敗", "", "", 0, "NT$ 0.00"

            for segment_path in audio_segments:
                try:
                    result = _transcribe(segment_path)
                    if result:
                        full_transcript += result["text"] + "\n"
                finally:
                    # Fix: remove the temporary segment even when transcription
                    # fails — the original only deleted it on success, leaking
                    # files on disk for failed runs.
                    if os.path.exists(segment_path):
                        os.remove(segment_path)
        else:
            # Small enough to transcribe in one call.
            result = _transcribe(audio_file)
            if result:
                full_transcript = result["text"]

        if full_transcript:
            refined = refine_transcript(
                raw_text=full_transcript,
                api_key=openai_api_key,
                model=openai_model,
                temperature=temperature,
                context=context_prompt
            )

            if refined:
                # Token usage reported by the refinement call.
                current_usage = refined.get("usage", {})
                input_tokens = current_usage.get("prompt_tokens", 0)
                output_tokens = current_usage.get("completion_tokens", 0)
                total_tokens = input_tokens + output_tokens

                # Cost: USD per 1M tokens, converted to NT$.
                # NOTE(review): USD→NTD rate is hard-coded at 31.5 — confirm.
                model_price = MODEL_CONFIG[openai_model]
                input_cost = (input_tokens / 1_000_000) * model_price["input"]
                output_cost = (output_tokens / 1_000_000) * model_price["output"]
                total_cost_ntd = (input_cost + output_cost) * 31.5

                return (
                    full_transcript,
                    refined["corrected"],
                    refined["summary"],
                    total_tokens,
                    f"NT$ {total_cost_ntd:.2f}"
                )

            # Fix: refinement failed, but transcription succeeded — return the
            # raw transcript instead of discarding the user's result.
            return full_transcript, "", "", 0, "NT$ 0.00"

        return "處理失敗", "", "", 0, "NT$ 0.00"

    except Exception as e:
        logger.error(f"處理失敗:{str(e)}")
        return f"處理失敗:{str(e)}", "", "", 0, "NT$ 0.00"
159
+
160
def create_gradio_interface():
    """Build and return the Gradio Blocks UI.

    Layout: a two-column page — inputs (audio upload, API keys, model,
    language and misc settings, process button) on the left, outputs
    (raw/refined/summary text, token count, cost) on the right.
    """
    with gr.Blocks(title="音訊轉文字與優化系統") as app:
        gr.Markdown("# 音訊轉文字與優化系統")

        with gr.Row():
            with gr.Column(scale=2):
                # Audio input (filepath is passed straight to process_audio).
                audio_input = gr.Audio(
                    label="上傳音訊檔案",
                    type="filepath"
                )

                # API key inputs (masked).
                with gr.Group():
                    gr.Markdown("### API 金鑰設定")
                    openai_key = gr.Textbox(
                        label="OpenAI API 金鑰",
                        type="password"
                    )
                    elevenlabs_key = gr.Textbox(
                        label="ElevenLabs API 金鑰",
                        type="password"
                    )

                # Model selection: transcription backend, Whisper size,
                # and OpenAI refinement model.
                with gr.Group():
                    gr.Markdown("### 模型設定")
                    service = gr.Radio(
                        choices=["Whisper", "ElevenLabs"],
                        label="轉錄服務",
                        value="Whisper"
                    )
                    whisper_model_choice = gr.Dropdown(
                        choices=["tiny", "base", "small", "medium", "large"],
                        label="Whisper 模型",
                        value="small"
                    )
                    openai_model_choice = gr.Dropdown(
                        choices=list(OPENAI_MODELS.keys()),
                        label="OpenAI 模型",
                        value="o3-mini"
                    )

                # Language settings; the language-code box is hidden unless
                # the "指定語言" (specify language) mode is selected.
                with gr.Group():
                    gr.Markdown("### 語言設定")
                    lang_mode = gr.Radio(
                        choices=["自動偵測", "指定語言", "混合語言"],
                        label="語言模式",
                        value="自動偵測"
                    )
                    lang_code = gr.Textbox(
                        label="語言代碼",
                        placeholder="例如:zh-tw",
                        visible=False
                    )

                # Misc settings: context prompt, temperature, diarization.
                with gr.Group():
                    gr.Markdown("### 其他設定")
                    context = gr.Textbox(
                        label="背景提示詞",
                        placeholder="輸入相關背景資訊",
                        lines=3
                    )
                    temp = gr.Slider(
                        minimum=0,
                        maximum=1,
                        value=0.5,
                        label="創意程度"
                    )
                    diarize = gr.Checkbox(
                        label="啟用說話者辨識",
                        value=False
                    )

                # Process button.
                process_btn = gr.Button("處理音訊", variant="primary")

            with gr.Column(scale=3):
                # Output area: transcripts, summary, token usage, cost.
                original_text = gr.Textbox(
                    label="原始轉錄文字",
                    lines=10
                )
                refined_text = gr.Textbox(
                    label="優化後的文字",
                    lines=10
                )
                summary_text = gr.Textbox(
                    label="文字摘要",
                    lines=5
                )
                token_count = gr.Number(
                    label="Token 使用量",
                    value=0
                )
                cost_display = gr.Textbox(
                    label="費用",
                    value="NT$ 0.00"
                )

        # Toggle the language-code textbox when the language mode changes.
        lang_mode.change(
            fn=lambda x: gr.update(visible=(x == "指定語言")),
            inputs=[lang_mode],
            outputs=[lang_code]
        )

        # Wire the process button to process_audio; input order must match
        # the function's parameter order.
        process_btn.click(
            fn=process_audio,
            inputs=[
                audio_input,
                service,
                openai_key,
                elevenlabs_key,
                whisper_model_choice,
                openai_model_choice,
                lang_mode,
                lang_code,
                context,
                temp,
                diarize
            ],
            outputs=[
                original_text,
                refined_text,
                summary_text,
                token_count,
                cost_display
            ]
        )

        # Author attribution.
        gr.Markdown("""
        ### Created by
        **Tseng Yao Hsien**
        Endocrinologist
        Tungs' Taichung MetroHarbor Hospital
        """)

    return app
304
+
305
if __name__ == "__main__":
    # Build the UI and serve it with a public share link.
    demo = create_gradio_interface()
    demo.launch(share=True)