# Scraped page header (not part of the program):
# Spaces: aiqtech / videxam | status: Paused | file size: 7,337 bytes

import os
import shutil
import tempfile
import time

import gradio as gr
import whisper
import yt_dlp

# Cached Whisper model and the size it was loaded with.
# ("medium" is recommended on a Spaces GPU environment, "base" on CPU.)
model = None
_loaded_model_size = None


def load_model(model_size="base"):
    """Return a Whisper model of the requested size, loading it on demand.

    The most recently loaded model is cached in the module-level ``model``
    variable. The cache is keyed on ``model_size``: asking for a different
    size than the cached one triggers a reload, so the size selected by the
    caller is always the size that is returned.

    Args:
        model_size: Whisper checkpoint name passed to ``whisper.load_model``
            (for example "tiny", "base", "small", "medium", "large").

    Returns:
        The loaded Whisper model object.
    """
    global model, _loaded_model_size
    if model is None or _loaded_model_size != model_size:
        print(f"Loading Whisper {model_size} model...")
        # Release the previous model before loading the next one so two
        # checkpoints are not held at once. If loading fails, ``model`` stays
        # None and the next call retries.
        model = None
        model = whisper.load_model(model_size)
        _loaded_model_size = model_size
        print("Model loaded!")
    return model

def extract_audio_from_youtube(url, progress=gr.Progress()):
    """Download a YouTube video's audio track into a fresh temp directory.

    The audio is fetched with yt-dlp and converted to MP3 by the
    ``FFmpegExtractAudio`` post-processor. On any failure the temp directory
    created for the download is removed before the error is raised.

    Args:
        url: Video URL handed to yt-dlp.
        progress: Gradio progress tracker used to report the download step.

    Returns:
        A tuple ``(audio_file, title, duration)``: the path of the extracted
        audio file, the video title ("Unknown" when yt-dlp reports none), and
        the duration in seconds (0 when yt-dlp reports none).

    Raises:
        gr.Error: If the download fails or no audio file can be found
            afterwards.
    """
    progress(0.1, desc="YouTube 오디오 다운로드 중...")

    temp_dir = tempfile.mkdtemp()
    output_path = os.path.join(temp_dir, "audio")

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # Explicit extension placeholder so the downloaded file, and the
        # converted file derived from it, get predictable names.
        'outtmpl': output_path + '.%(ext)s',
        # A watch URL that also carries a playlist id should yield one video.
        'noplaylist': True,
        'quiet': True,
        'no_warnings': True,
    }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True) or {}
    except Exception as e:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error(f"YouTube 다운로드 실패: {str(e)}") from e

    # The keys can be present with a None value, so fall back on falsy values
    # rather than only on missing keys.
    title = info.get('title') or 'Unknown'
    duration = info.get('duration') or 0

    # MP3 is the expected result; the other extensions cover the case where
    # the post-processor left the audio in a different container.
    candidates = (
        output_path + ext
        for ext in ('.mp3', '.m4a', '.wav', '.webm', '.opus')
    )
    audio_file = next((path for path in candidates if os.path.exists(path)), None)

    if audio_file is None:
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error("오디오 파일 추출 실패")

    return audio_file, title, duration

def format_timestamp(seconds):
    """Render a duration in seconds as ``MM:SS``, or ``HH:MM:SS`` from one hour up."""
    hours = int(seconds // 3600)
    minutes = int(seconds % 3600 // 60)
    secs = int(seconds % 60)
    # The hour field is only shown when it is positive.
    fields = (hours, minutes, secs) if hours > 0 else (minutes, secs)
    return ":".join(f"{field:02d}" for field in fields)

def _srt_timestamp(seconds):
    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Round once to whole milliseconds; truncating the float fraction instead
    # can land 1 ms low (e.g. 2.3 % 1 is 0.2999...).
    total_ms = max(0, int(round(seconds * 1000)))
    hours, rest = divmod(total_ms, 3_600_000)
    minutes, rest = divmod(rest, 60_000)
    secs, millis = divmod(rest, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def _safe_filename_stem(title, max_len=50):
    """Reduce a video title to at most ``max_len`` characters usable in a file name."""
    cleaned = "".join(
        "_" if ch in '\\/:*?"<>|' or not ch.isprintable() else ch
        for ch in str(title or "")
    )
    return cleaned[:max_len].strip(" .") or "video"


def transcribe_youtube(url, model_size, language, output_format, progress=gr.Progress()):
    """Transcribe the speech of a YouTube video to text.

    Steps: download the audio, load the Whisper model, run speech
    recognition, format the result, and write it to a text file.

    Args:
        url: YouTube video URL; surrounding whitespace is ignored.
        model_size: Whisper checkpoint name passed to ``load_model``.
        language: Language code passed to Whisper, or "auto" to let Whisper
            detect the language.
        output_format: "텍스트만" for plain text, "타임스탬프 포함" for one
            ``[timestamp] text`` line per segment; any other value produces
            SRT subtitles.
        progress: Gradio progress tracker.

    Returns:
        A tuple ``(info_text, transcript, txt_path)``: a summary of the video
        and the run, the formatted transcript, and the path of a UTF-8 text
        file containing the transcript.

    Raises:
        gr.Error: If ``url`` is empty, or if the audio download fails.
    """
    if not url or not url.strip():
        raise gr.Error("YouTube URL을 입력해주세요.")

    start_time = time.time()

    # 1) Extract the audio
    audio_file, title, duration = extract_audio_from_youtube(url.strip(), progress)

    try:
        # 2) Load the Whisper model
        progress(0.3, desc=f"Whisper {model_size} 모델 로딩 중...")
        whisper_model = load_model(model_size)

        # 3) Speech recognition
        progress(0.5, desc="음성 인식 중... (영상 길이에 따라 시간이 소요됩니다)")

        transcribe_opts = {
            "verbose": False,
            "fp16": False,
        }
        if language != "auto":
            transcribe_opts["language"] = language

        result = whisper_model.transcribe(audio_file, **transcribe_opts)
    finally:
        # The audio is no longer needed whether or not transcription
        # succeeded. Cleanup is best-effort: rmdir only succeeds on an empty
        # directory, so nothing else is ever deleted.
        try:
            os.remove(audio_file)
            os.rmdir(os.path.dirname(audio_file))
        except OSError:
            pass

    progress(0.9, desc="결과 정리 중...")

    # 4) Format the result
    detected_lang = result.get("language", "unknown")
    segments = result.get("segments", [])

    if output_format == "텍스트만":
        transcript = result["text"].strip()
    elif output_format == "타임스탬프 포함":
        transcript = "\n".join(
            f"[{format_timestamp(seg['start'])}] {seg['text'].strip()}"
            for seg in segments
        )
    else:  # SRT subtitles
        srt_lines = []
        for index, seg in enumerate(segments, 1):
            srt_lines.append(f"{index}")
            srt_lines.append(
                f"{_srt_timestamp(seg['start'])} --> {_srt_timestamp(seg['end'])}"
            )
            srt_lines.append(seg["text"].strip())
            srt_lines.append("")
        transcript = "\n".join(srt_lines)

    elapsed = time.time() - start_time

    # 5) Summary information (title/duration may be missing for some videos)
    display_title = title or "Unknown"
    info_text = f"""📹 제목: {display_title}
⏱️ 영상 길이: {format_timestamp(duration or 0)}
🌐 감지된 언어: {detected_lang}
📝 세그먼트 수: {len(segments)}
⚡ 처리 시간: {elapsed:.1f}초"""

    # 6) Save the transcript; the title is sanitized because characters such
    #    as "/" would otherwise be read as path separators by open().
    txt_path = os.path.join(
        tempfile.mkdtemp(),
        f"{_safe_filename_stem(display_title)}_transcript.txt",
    )
    with open(txt_path, "w", encoding="utf-8") as f:
        f.write(transcript)

    progress(1.0, desc="완료!")

    return info_text, transcript, txt_path


# ==================== Gradio UI ====================
# Custom stylesheet for the page: centers the elements with ids "title" and
# "subtitle", and sets font size and line height for textareas inside elements
# carrying the "output-text" class.
css = """
#title { text-align: center; margin-bottom: 0.5em; }
#subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }
.output-text textarea { font-size: 14px !important; line-height: 1.6 !important; }
"""

# Page layout. Components are declared top to bottom inside the Blocks context.
with gr.Blocks(
    title="YouTube Speech-to-Text"
) as demo:
    
    # Page heading and subtitle; the ids match the selectors in `css`.
    gr.HTML("<h1 id='title'>🎬 YouTube Speech-to-Text</h1>")
    gr.HTML("<p id='subtitle'>YouTube 영상의 음성을 텍스트로 변환합니다</p>")
    
    # First row: URL field (scale 3) next to the model-size selector (scale 1).
    with gr.Row():
        with gr.Column(scale=3):
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=... 또는 https://youtu.be/...",
                lines=1,
            )
        with gr.Column(scale=1):
            model_size = gr.Dropdown(
                choices=["tiny", "base", "small", "medium", "large"],
                value="base",
                label="Whisper 모델",
                info="크기가 클수록 정확하지만 느립니다"
            )
    
    # Second row: language and output-format selectors.
    with gr.Row():
        # Choices are (label, value) pairs; transcribe_youtube treats the
        # value "auto" as "do not pass a language to Whisper".
        language = gr.Dropdown(
            choices=[
                ("자동 감지", "auto"),
                ("한국어", "ko"),
                ("영어", "en"),
                ("일본어", "ja"),
                ("중국어", "zh"),
            ],
            value="auto",
            label="언어 설정",
        )
        # These strings are compared literally in transcribe_youtube to pick
        # the output format, so they must stay in sync with that function.
        output_format = gr.Dropdown(
            choices=["텍스트만", "타임스탬프 포함", "SRT 자막"],
            value="타임스탬프 포함",
            label="출력 형식",
        )
    
    run_btn = gr.Button("🚀 변환 시작", variant="primary", size="lg")
    
    # Read-only summary of the video and the run.
    with gr.Row():
        info_output = gr.Textbox(label="📋 영상 정보", lines=5, interactive=False)
    
    # Editable transcript; the "output-text" class is styled in `css`.
    transcript_output = gr.Textbox(
        label="📝 변환 결과",
        lines=15,
        interactive=True,
        elem_classes=["output-text"],
    )
    
    file_output = gr.File(label="💾 텍스트 파일 다운로드")
    
    # Clicking the button runs transcribe_youtube; its three return values map
    # onto the three output components in order.
    run_btn.click(
        fn=transcribe_youtube,
        inputs=[url_input, model_size, language, output_format],
        outputs=[info_output, transcript_output, file_output],
    )
    
    # Usage tips shown at the bottom of the page.
    gr.Markdown("""
    ---
    **사용 팁:**
    - `tiny`/`base`: 빠르지만 정확도 낮음 (CPU 환경 권장)
    - `small`/`medium`: 균형 잡힌 선택
    - `large`: 최고 정확도 (GPU 필수, 시간 소요)
    - 한국어 영상은 언어를 `한국어`로 지정하면 더 정확합니다
    """)

# Start the app only when this file is run as a script.
# NOTE(review): theme and css are passed to launch() here rather than to
# gr.Blocks(); confirm the installed Gradio version accepts them on launch().
if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=css)