# app.py — Whisper large-v3 speech-to-text demo (Gradio UI)
import os, tempfile
import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
# -----------------------------
# CONFIG
# -----------------------------
# Whisper large-v3 checkpoint, used for both same-language transcription
# and translate-to-English (selected per request via generate_kwargs).
ASR_MODEL = "openai/whisper-large-v3"
# Number of 30-second audio chunks decoded per forward pass.
BATCH_SIZE = 8
HAS_CUDA = torch.cuda.is_available()
# transformers pipelines take a CUDA device index (int) or the string "cpu".
DEVICE = 0 if HAS_CUDA else "cpu"
# fp16 on GPU to halve memory; fp32 on CPU where fp16 is slow/unsupported.
DTYPE = torch.float16 if HAS_CUDA else torch.float32
# Built once at import time; model weights are downloaded on first run.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # split long-form audio into 30-second windows
)
def _save_text_file(text: str, suffix: str = ".txt") -> str:
fd, path = tempfile.mkstemp(suffix=suffix)
with os.fdopen(fd, "w", encoding="utf-8") as f:
f.write(text)
return path
def _resolve_path(x):
"""
รับค่า input ได้ทั้ง:
- str (filepath)
- dict ที่มี key 'name' หรือ 'path'
- gradio FileData (มี .path)
คืนค่าเป็น filepath เสมอ
"""
if x is None:
return None
if isinstance(x, str):
return x
# Gradio v4/v5 บางทีให้เป็น dict
if isinstance(x, dict):
return x.get("path") or x.get("name")
# Gradio FileData
path = getattr(x, "path", None)
if path:
return path
# เผื่อกรณี edge
return str(x)
def _transcribe_from_any(file_like, task: str):
    """Decode an uploaded audio/video file and run Whisper on its audio.

    Returns a (transcript_text, txt_file_path) tuple matching the two
    Gradio outputs. Raises gr.Error if the resolved path does not exist.
    """
    path = _resolve_path(file_like)
    if not path or not os.path.exists(path):
        raise gr.Error("ไม่พบไฟล์ที่อัปโหลด (path ว่างหรือไฟล์หาย)")
    # Read the raw bytes and let ffmpeg decode them to mono float32 at the
    # sampling rate the Whisper feature extractor expects.
    with open(path, "rb") as src:
        raw_bytes = src.read()
    sample_rate = asr_pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sample_rate)
    result = asr_pipe(
        {"array": waveform, "sampling_rate": sample_rate},
        batch_size=BATCH_SIZE,
        # 'transcribe' keeps the source language; 'translate' outputs English.
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]
    return transcript, _save_text_file(transcript, ".txt")
def transcribe_mic(mic_path: str, task: str):
    """Gradio handler for the Microphone tab: transcribe a recorded clip."""
    return _transcribe_from_any(mic_path, task)
def transcribe_audio(file_path: str, task: str):
    """Gradio handler for the Audio-file tab: transcribe an uploaded file."""
    return _transcribe_from_any(file_path, task)
def transcribe_video(video_file, task: str):
    """Gradio handler for the Video tab.

    Accepts a video file (mp4, mov, webm, ...); ffmpeg_read in the shared
    helper extracts and decodes the audio track.
    """
    return _transcribe_from_any(video_file, task)
# -----------------------------
# UI
# -----------------------------
# Three tabs share the same pattern: input widget + task radio -> transcript
# textbox + downloadable .txt file.
with gr.Blocks(title="Whisper V3 – Transcriber (Audio + MP4)") as demo:
    gr.Markdown("## 🎙️ Whisper V3 – Record/Upload Audio or MP4 → Transcript → Download (.txt)")
    with gr.Tab("🎤 Microphone"):
        mic_audio = gr.Audio(sources="microphone", type="filepath", label="Record")
        mic_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        mic_text = gr.Textbox(label="Transcript", lines=10)
        mic_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_mic, inputs=[mic_audio, mic_task], outputs=[mic_text, mic_file])
    with gr.Tab("📁 Audio file"):
        up_audio = gr.Audio(sources="upload", type="filepath", label="Upload audio")
        up_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        up_text = gr.Textbox(label="Transcript", lines=10)
        up_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_audio, inputs=[up_audio, up_task], outputs=[up_text, up_file])
    with gr.Tab("🎬 Video MP4"):
        # gr.File gives the most reliable filepath across Gradio versions.
        up_video = gr.File(file_count="single", file_types=["video"], label="Upload MP4 / Video")
        vd_task = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        vd_text = gr.Textbox(label="Transcript", lines=10)
        vd_file = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_video, inputs=[up_video, vd_task], outputs=[vd_text, vd_file])
# queue() serializes GPU work across concurrent users before launching.
demo.queue().launch()