File size: 4,457 Bytes
e8939bc
64644c3
6c69ff9
 
 
 
64644c3
6c69ff9
64644c3
7ffbf27
6c69ff9
 
 
 
 
 
 
 
 
 
7ffbf27
6c69ff9
 
64644c3
 
 
 
 
 
7ffbf27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e8939bc
64644c3
 
 
6c69ff9
64644c3
 
 
7ffbf27
64644c3
6c69ff9
64644c3
e8939bc
6c69ff9
64644c3
7ffbf27
6c69ff9
e8939bc
7ffbf27
6c69ff9
7ffbf27
 
 
e8939bc
64644c3
6c69ff9
64644c3
e8939bc
 
6c69ff9
64644c3
 
 
 
 
 
6c69ff9
 
64644c3
 
 
 
e8939bc
 
 
7ffbf27
 
e8939bc
 
 
 
6c69ff9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os, tempfile
import torch
import gradio as gr
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read

# -----------------------------
# CONFIG
# -----------------------------
ASR_MODEL = "openai/whisper-large-v3"
BATCH_SIZE = 8
HAS_CUDA = torch.cuda.is_available()
# pipeline() accepts a CUDA device ordinal (int) or the string "cpu".
DEVICE = 0 if HAS_CUDA else "cpu"
# fp16 is only safe on GPU; fall back to fp32 on CPU.
DTYPE = torch.float16 if HAS_CUDA else torch.float32

# Build the Whisper ASR pipeline once at import time.
# NOTE(review): this downloads/loads the model as a module-level side effect.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model=ASR_MODEL,
    device=DEVICE,
    torch_dtype=DTYPE,
    chunk_length_s=30,  # split long audio into 30-second chunks for Whisper
)

def _save_text_file(text: str, suffix: str = ".txt") -> str:
    fd, path = tempfile.mkstemp(suffix=suffix)
    with os.fdopen(fd, "w", encoding="utf-8") as f:
        f.write(text)
    return path

def _resolve_path(x):
    """
    รับค่า input ได้ทั้ง:
    - str (filepath)
    - dict ที่มี key 'name' หรือ 'path'
    - gradio FileData (มี .path)
    คืนค่าเป็น filepath เสมอ
    """
    if x is None:
        return None
    if isinstance(x, str):
        return x
    # Gradio v4/v5 บางทีให้เป็น dict
    if isinstance(x, dict):
        return x.get("path") or x.get("name")
    # Gradio FileData
    path = getattr(x, "path", None)
    if path:
        return path
    # เผื่อกรณี edge
    return str(x)

def _transcribe_from_any(file_like, task: str):
    """Run Whisper ASR on any Gradio file input and return (text, txt_path).

    *file_like* may be a str path, dict, or FileData (see `_resolve_path`).
    *task* is forwarded to Whisper: 'transcribe' keeps the source language,
    'translate' produces English output.
    Raises gr.Error when the resolved path is missing or the file is gone.
    """
    src = _resolve_path(file_like)
    if not src or not os.path.exists(src):
        raise gr.Error("ไม่พบไฟล์ที่อัปโหลด (path ว่างหรือไฟล์หาย)")

    # Read raw bytes and let ffmpeg decode them to mono float32 at the
    # model's expected sampling rate (works for audio AND video containers).
    with open(src, "rb") as fh:
        raw_bytes = fh.read()
    sr = asr_pipe.feature_extractor.sampling_rate
    waveform = ffmpeg_read(raw_bytes, sr)

    result = asr_pipe(
        {"array": waveform, "sampling_rate": sr},
        batch_size=BATCH_SIZE,
        generate_kwargs={"task": task},
        return_timestamps=True,
    )
    transcript = result["text"]
    return transcript, _save_text_file(transcript, ".txt")

def transcribe_mic(mic_path: str, task: str):
    """Microphone-tab handler: transcribe a recorded clip via the shared path."""
    return _transcribe_from_any(mic_path, task)

def transcribe_audio(file_path: str, task: str):
    """Audio-upload-tab handler: transcribe an uploaded audio file."""
    return _transcribe_from_any(file_path, task)

def transcribe_video(video_file, task: str):
    """Video-tab handler: transcribe the audio track of an uploaded video."""
    # Accepts a video file (mp4, mov, webm, ...); ffmpeg_read extracts the audio.
    return _transcribe_from_any(video_file, task)

# -----------------------------
# UI
# -----------------------------
# Three tabs share the same output shape (transcript textbox + downloadable
# .txt), each wired to its own thin handler above.
with gr.Blocks(title="Whisper V3 – Transcriber (Audio + MP4)") as demo:
    gr.Markdown("## 🎙️ Whisper V3 – Record/Upload Audio or MP4 → Transcript → Download (.txt)")

    with gr.Tab("🎤 Microphone"):
        mic_audio = gr.Audio(sources="microphone", type="filepath", label="Record")
        mic_task  = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        mic_text  = gr.Textbox(label="Transcript", lines=10)
        mic_file  = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_mic, inputs=[mic_audio, mic_task], outputs=[mic_text, mic_file])

    with gr.Tab("📁 Audio file"):
        up_audio  = gr.Audio(sources="upload", type="filepath", label="Upload audio")
        up_task   = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        up_text   = gr.Textbox(label="Transcript", lines=10)
        up_file   = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_audio, inputs=[up_audio, up_task], outputs=[up_text, up_file])

    with gr.Tab("🎬 Video MP4"):
        # Use gr.File (not gr.Video) to get the most stable filepath handling.
        up_video  = gr.File(file_count="single", file_types=["video"], label="Upload MP4 / Video")
        vd_task   = gr.Radio(["transcribe", "translate"], value="transcribe", label="Task")
        vd_text   = gr.Textbox(label="Transcript", lines=10)
        vd_file   = gr.File(label="Download Transcript (.txt)")
        gr.Button("Run").click(transcribe_video, inputs=[up_video, vd_task], outputs=[vd_text, vd_file])

# queue() enables request queuing so long transcriptions don't time out.
demo.queue().launch()