"""Whisper V3 transcription app: record/upload audio or video, get a .txt transcript."""

import os
import tempfile

import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# -----------------------------
# CONFIG
# -----------------------------
ASR_MODEL = "openai/whisper-large-v3"
BATCH_SIZE = 8
HAS_CUDA = torch.cuda.is_available()
DEVICE = 0 if HAS_CUDA else "cpu"  # pipeline() accepts a CUDA device index or "cpu"
DTYPE = torch.float16 if HAS_CUDA else torch.float32  # fp16 only makes sense on GPU

asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # long-form audio is processed in 30-second chunks
)


def _save_text_file(text: str, suffix: str = ".txt") -> str:
    """Write *text* to a fresh temp file and return its path (served via gr.File)."""
    fd, path = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(text)
    return path


def _resolve_path(x):
    """Normalize the various payload shapes Gradio may deliver into a filepath.

    Accepts:
      - str (already a filepath)
      - dict with a 'path' or 'name' key (Gradio v4/v5 sometimes sends this)
      - a Gradio FileData object (exposes a ``.path`` attribute)

    Returns a filepath string, or None when *x* is None.
    """
    if x is None:
        return None
    if isinstance(x, str):
        return x
    # Gradio v4/v5 occasionally hands components over as plain dicts
    if isinstance(x, dict):
        return x.get("path") or x.get("name")
    # Gradio FileData object
    path = getattr(x, "path", None)
    if path:
        return path
    # Last-resort fallback for unexpected edge cases
    return str(x)


def _transcribe_from_any(file_like, task: str):
    """Run Whisper on any uploaded media file.

    Returns a ``(text, txt_path)`` tuple for the Textbox/File outputs.
    Raises gr.Error when the resolved path is empty or the file no longer exists.
    """
    path = _resolve_path(file_like)
    if not path or not os.path.exists(path):
        raise gr.Error("ไม่พบไฟล์ที่อัปโหลด (path ว่างหรือไฟล์หาย)")

    # Read raw bytes and let ffmpeg decode them to mono float32 at the model's
    # sampling rate — this also extracts the audio track from video containers.
    with open(path, "rb") as f:
        payload = f.read()
    sampling_rate = asr_pipe.feature_extractor.sampling_rate  # hoisted: used twice
    audio = ffmpeg_read(payload, sampling_rate)
    inputs = {"array": audio, "sampling_rate": sampling_rate}

    out = asr_pipe(
        inputs,
        batch_size=BATCH_SIZE,
        # 'transcribe' keeps the source language; 'translate' outputs English
        generate_kwargs={"task": task},
        return_timestamps=True,  # required by Whisper for long-form generation
    )
    text = out["text"]
    return text, _save_text_file(text, ".txt")


def transcribe_mic(mic_path: str, task: str):
    """Handler for the microphone tab."""
    return _transcribe_from_any(mic_path, task)


def transcribe_audio(file_path: str, task: str):
    """Handler for the audio-upload tab."""
    return _transcribe_from_any(file_path, task)


def transcribe_video(video_file, task: str):
    """Handler for the video tab (mp4, mov, webm, ...).

    ffmpeg_read pulls the audio track out of the container, so the same
    transcription path works unchanged for video uploads.
    """
    return _transcribe_from_any(video_file, task)


# -----------------------------
# UI
# -----------------------------
with gr.Blocks(title="Whisper V3 – Transcriber (Audio + MP4)") as demo:
    gr.Markdown("## 🎙️ Whisper V3 – Record/Upload Audio or MP4 → Transcript → Download (.txt)")

    with gr.Tab("🎤 Microphone"):
        # `sources` is documented as a list in Gradio 4+
        mic_audio = gr.Audio(sources=["microphone"], type="filepath", label="Record")
        mic_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        mic_text = gr.Textbox(label="Transcript", lines=10)
        mic_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(
            transcribe_mic, inputs=[mic_audio, mic_task], outputs=[mic_text, mic_file]
        )

    with gr.Tab("📁 Audio file"):
        up_audio = gr.Audio(sources=["upload"], type="filepath", label="Upload audio")
        up_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        up_text = gr.Textbox(label="Transcript", lines=10)
        up_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(
            transcribe_audio, inputs=[up_audio, up_task], outputs=[up_text, up_file]
        )

    with gr.Tab("🎬 Video MP4"):
        # gr.File yields the most stable filepath for arbitrary video uploads
        up_video = gr.File(file_count="single", file_types=["video"], label="Upload MP4 / Video")
        vd_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        vd_text = gr.Textbox(label="Transcript", lines=10)
        vd_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(
            transcribe_video, inputs=[up_video, vd_task], outputs=[vd_text, vd_file]
        )


if __name__ == "__main__":
    demo.queue().launch()