videxam / app.py
aiqtech's picture
Update app.py
7980a34 verified
import os
import re
import shutil
import tempfile
import time

import gradio as gr
import whisper
import yt_dlp
# Whisper model cache ("medium" is recommended on a Spaces GPU, "base" on CPU).
model = None
# Size name of the checkpoint currently held in `model` (None until first load).
_loaded_model_size = None


def load_model(model_size="base"):
    """Return a Whisper model of the requested size, loading it on demand.

    A single model is kept in the module-level ``model`` variable. It is
    reused while the requested size matches the loaded one and replaced when
    a different size is requested, so the size chosen by the caller is
    always honoured.

    Args:
        model_size: Whisper checkpoint name passed to ``whisper.load_model``.

    Returns:
        The loaded Whisper model.
    """
    global model, _loaded_model_size
    if model is None or _loaded_model_size != model_size:
        print(f"Loading Whisper {model_size} model...")
        # Drop the reference to the previous checkpoint before loading the
        # next one so both are not kept alive at the same time.
        model = None
        model = whisper.load_model(model_size)
        _loaded_model_size = model_size
        print("Model loaded!")
    return model
def extract_audio_from_youtube(url, progress=gr.Progress()):
    """Download a YouTube video's audio track and convert it to MP3.

    Args:
        url: Video URL passed to yt-dlp.
        progress: Gradio progress tracker used to report the download step.

    Returns:
        A ``(audio_file, title, duration)`` tuple: the path of the extracted
        audio inside a fresh temporary directory, the video title
        (``'Unknown'`` when unavailable) and the duration in seconds
        (``0`` when unavailable).

    Raises:
        gr.Error: If the download fails or no audio file is produced.
    """
    progress(0.1, desc="YouTube ์˜ค๋””์˜ค ๋‹ค์šด๋กœ๋“œ ์ค‘...")
    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "audio")
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        'outtmpl': output_path,
        'quiet': True,
        'no_warnings': True,
    }
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
            # `or` also covers a key that is present but set to None, which
            # a plain .get(key, default) would pass through to the caller.
            title = info.get('title') or 'Unknown'
            duration = info.get('duration') or 0
    except Exception as e:
        # Do not leave the temporary directory behind on a failed download.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error(f"YouTube ๋‹ค์šด๋กœ๋“œ ์‹คํŒจ: {str(e)}") from e

    # The post-processor is asked for MP3, but the extension may differ,
    # so probe the likely candidates in order of preference.
    candidates = (
        output_path + ext for ext in ('.mp3', '.m4a', '.wav', '.webm', '.opus')
    )
    audio_file = next((path for path in candidates if os.path.exists(path)), None)
    if audio_file is None:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error("์˜ค๋””์˜ค ํŒŒ์ผ ์ถ”์ถœ ์‹คํŒจ")
    return audio_file, title, duration
def format_timestamp(seconds):
    """Format a duration in seconds as ``MM:SS``, or ``HH:MM:SS`` from one hour up.

    Args:
        seconds: Duration in seconds (int or float). Fractions are truncated;
            ``None`` and negative values are treated as zero.

    Returns:
        The zero-padded timestamp string.
    """
    # A missing duration (None) used to raise TypeError and a negative one
    # rendered a meaningless value, so clamp both to zero.
    total = max(int(seconds or 0), 0)
    minutes, s = divmod(total, 60)
    h, m = divmod(minutes, 60)
    if h > 0:
        return f"{h:02d}:{m:02d}:{s:02d}"
    return f"{m:02d}:{s:02d}"
def _srt_timestamp(seconds):
    """Render a position in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round to whole milliseconds once; truncating the float fraction instead
    # can lose a millisecond (e.g. 1.001 s would render as ",000").
    total_ms = max(int(round(seconds * 1000)), 0)
    rest, millis = divmod(total_ms, 1000)
    rest, secs = divmod(rest, 60)
    hours, mins = divmod(rest, 60)
    return f"{hours:02d}:{mins:02d}:{secs:02d},{millis:03d}"


def _transcript_filename(title, max_length=50):
    """Build the download file name from the (truncated, sanitized) video title."""
    # Path separators and other reserved characters in a title would make
    # open() fail or point outside the temporary directory.
    stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(title or "Unknown")[:max_length])
    return f"{stem}_transcript.txt"


def _discard_audio(audio_file):
    """Best-effort removal of the downloaded audio file and its directory."""
    try:
        os.remove(audio_file)
        # rmdir only succeeds on an empty directory, so nothing else is lost.
        os.rmdir(os.path.dirname(audio_file))
    except OSError:
        # Cleanup is deliberately best-effort; a leftover temp file must not
        # fail the request.
        pass


def transcribe_youtube(url, model_size, language, output_format, progress=gr.Progress()):
    """Transcribe the speech of a YouTube video to text.

    Args:
        url: YouTube video URL.
        model_size: Whisper checkpoint name handed to ``load_model``.
        language: Language code forwarded to Whisper, or ``"auto"`` to let
            Whisper detect the language.
        output_format: One of the output-format choices offered by the UI:
            plain text, timestamped lines, or (any other value) SRT subtitles.
        progress: Gradio progress tracker.

    Returns:
        A ``(info_text, transcript, txt_path)`` tuple: a summary of the video
        and the run, the formatted transcript, and the path of a UTF-8 text
        file containing the transcript.

    Raises:
        gr.Error: If the URL is empty or the audio cannot be extracted.
    """
    if not url or not url.strip():
        raise gr.Error("YouTube URL์„ ์ž…๋ ฅํ•ด์ฃผ์„ธ์š”.")
    start_time = time.time()

    # 1) Extract the audio
    audio_file, title, duration = extract_audio_from_youtube(url.strip(), progress)

    try:
        # 2) Load the Whisper model
        progress(0.3, desc=f"Whisper {model_size} ๋ชจ๋ธ ๋กœ๋”ฉ ์ค‘...")
        whisper_model = load_model(model_size)

        # 3) Speech recognition
        progress(0.5, desc="์Œ์„ฑ ์ธ์‹ ์ค‘... (์˜์ƒ ๊ธธ์ด์— ๋”ฐ๋ผ ์‹œ๊ฐ„์ด ์†Œ์š”๋ฉ๋‹ˆ๋‹ค)")
        transcribe_opts = {
            "verbose": False,
            "fp16": False,
        }
        # Only force a language when the user picked one; otherwise Whisper
        # is left to detect it.
        if language != "auto":
            transcribe_opts["language"] = language
        result = whisper_model.transcribe(audio_file, **transcribe_opts)
    finally:
        # The audio is no longer needed once transcription has finished or
        # failed, so it is removed on both paths.
        _discard_audio(audio_file)

    progress(0.9, desc="๊ฒฐ๊ณผ ์ •๋ฆฌ ์ค‘...")

    # 4) Format the result
    detected_lang = result.get("language", "unknown")
    segments = result.get("segments", [])
    if output_format == "ํ…์ŠคํŠธ๋งŒ":
        transcript = result["text"].strip()
    elif output_format == "ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ":
        transcript = "\n".join(
            f"[{format_timestamp(seg['start'])}] {seg['text'].strip()}"
            for seg in segments
        )
    else:  # SRT subtitles
        srt_lines = []
        for i, seg in enumerate(segments, 1):
            srt_lines += [
                f"{i}",
                f"{_srt_timestamp(seg['start'])} --> {_srt_timestamp(seg['end'])}",
                seg["text"].strip(),
                "",
            ]
        transcript = "\n".join(srt_lines)
    elapsed = time.time() - start_time

    # 5) Meta information (a missing duration is shown as zero)
    info_text = f"""๐Ÿ“น ์ œ๋ชฉ: {title}
โฑ๏ธ ์˜์ƒ ๊ธธ์ด: {format_timestamp(duration or 0)}
๐ŸŒ ๊ฐ์ง€๋œ ์–ธ์–ด: {detected_lang}
๐Ÿ“ ์„ธ๊ทธ๋จผํŠธ ์ˆ˜: {len(segments)}
โšก ์ฒ˜๋ฆฌ ์‹œ๊ฐ„: {elapsed:.1f}์ดˆ"""

    # 6) Save the transcript to a text file for download
    txt_path = os.path.join(tempfile.mkdtemp(), _transcript_filename(title))
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(transcript)

    progress(1.0, desc="์™„๋ฃŒ!")
    return info_text, transcript, txt_path
# ==================== Gradio UI ====================
# Stylesheet handed to demo.launch(): styles the #title / #subtitle elements
# and the textarea of components tagged with the "output-text" class.
css = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }
.output-text textarea { font-size: 14px !important; line-height: 1.6 !important; }
"""
# Components are laid out in the order they are created inside the Blocks
# context, so the statement order below defines the page layout.
with gr.Blocks(
    title="YouTube Speech-to-Text"
) as demo:
    # Heading and subtitle; their ids are targeted by the stylesheet above.
    gr.HTML("<h1 id='title'>๐ŸŽฌ YouTube Speech-to-Text</h1>")
    gr.HTML("<p id='subtitle'>YouTube ์˜์ƒ์˜ ์Œ์„ฑ์„ ํ…์ŠคํŠธ๋กœ ๋ณ€ํ™˜ํ•ฉ๋‹ˆ๋‹ค</p>")
    # First row: wide URL field next to a narrower model selector (3:1 scale).
    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=... ๋˜๋Š” https://youtu.be/...",
                lines=1,
            )
        with gr.Column(scale=1):
            model_size = gr.Dropdown(
                choices=["tiny", "base", "small", "medium", "large"],
                value="base",
                label="Whisper ๋ชจ๋ธ",
                info="ํฌ๊ธฐ๊ฐ€ ํด์ˆ˜๋ก ์ •ํ™•ํ•˜์ง€๋งŒ ๋А๋ฆฝ๋‹ˆ๋‹ค"
            )
    # Second row: language and output-format selectors.
    with gr.Row():
        # (label, value) pairs; transcribe_youtube treats the value "auto" as
        # "do not force a language".
        language = gr.Dropdown(
            choices=[
                ("์ž๋™ ๊ฐ์ง€", "auto"),
                ("ํ•œ๊ตญ์–ด", "ko"),
                ("์˜์–ด", "en"),
                ("์ผ๋ณธ์–ด", "ja"),
                ("์ค‘๊ตญ์–ด", "zh"),
            ],
            value="auto",
            label="์–ธ์–ด ์„ค์ •",
        )
        # These choice strings are compared literally in transcribe_youtube
        # to pick the transcript format; keep both places in sync.
        output_format = gr.Dropdown(
            choices=["ํ…์ŠคํŠธ๋งŒ", "ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ", "SRT ์ž๋ง‰"],
            value="ํƒ€์ž„์Šคํƒฌํ”„ ํฌํ•จ",
            label="์ถœ๋ ฅ ํ˜•์‹",
        )
    run_btn = gr.Button("๐Ÿš€ ๋ณ€ํ™˜ ์‹œ์ž‘", variant="primary", size="lg")
    # Result area: read-only run summary and the editable transcript.
    # NOTE(review): the nesting of this row was reconstructed; confirm whether
    # transcript_output belongs inside the Row or directly below it.
    with gr.Row():
        info_output = gr.Textbox(label="๐Ÿ“‹ ์˜์ƒ ์ •๋ณด", lines=5, interactive=False)
        transcript_output = gr.Textbox(
            label="๐Ÿ“ ๋ณ€ํ™˜ ๊ฒฐ๊ณผ",
            lines=15,
            interactive=True,
            elem_classes=["output-text"],
        )
    # Download slot for the transcript file path returned by transcribe_youtube.
    file_output = gr.File(label="๐Ÿ’พ ํ…์ŠคํŠธ ํŒŒ์ผ ๋‹ค์šด๋กœ๋“œ")
    # Inputs and outputs are positional: they map onto transcribe_youtube's
    # first four parameters and its three return values, in this order.
    run_btn.click(
        fn=transcribe_youtube,
        inputs=[url_input, model_size, language, output_format],
        outputs=[info_output, transcript_output, file_output],
    )
    # Usage tips rendered below the form (model-size trade-offs, language hint).
    gr.Markdown("""
---
**์‚ฌ์šฉ ํŒ:**
- `tiny`/`base`: ๋น ๋ฅด์ง€๋งŒ ์ •ํ™•๋„ ๋‚ฎ์Œ (CPU ํ™˜๊ฒฝ ๊ถŒ์žฅ)
- `small`/`medium`: ๊ท ํ˜• ์žกํžŒ ์„ ํƒ
- `large`: ์ตœ๊ณ  ์ •ํ™•๋„ (GPU ํ•„์ˆ˜, ์‹œ๊ฐ„ ์†Œ์š”)
- ํ•œ๊ตญ์–ด ์˜์ƒ์€ ์–ธ์–ด๋ฅผ `ํ•œ๊ตญ์–ด`๋กœ ์ง€์ •ํ•˜๋ฉด ๋” ์ •ํ™•ํ•ฉ๋‹ˆ๋‹ค
""")
# Start the app only when the file is executed as a script; the theme and the
# stylesheet defined above are supplied at launch time.
if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=css)