Spaces:

aiqtech
/

videxam

Paused

App Files Files Community

videxam / app.py

aiqtech

Update app.py

7980a34 verified 27 days ago

raw

history blame contribute delete

7.34 kB

	import os
	import shutil
	import tempfile
	import time

	import gradio as gr
	import whisper
	import yt_dlp

	# Lazily loaded Whisper model shared by all requests ("medium" is recommended
	# on Spaces GPU hardware, "base" on CPU).
	model = None
	# Size name of the checkpoint currently held in `model` (None until loaded).
	_loaded_model_size = None


	def load_model(model_size="base"):
	    """Return a Whisper model of the requested size, loading it on demand.

	    The most recently loaded model is kept in the module-level ``model``
	    variable and reused while the requested size stays the same. Asking for
	    a different size replaces the cached model, so the size chosen by the
	    caller is always the one that is returned.

	    Args:
	        model_size: Checkpoint name passed to ``whisper.load_model``.

	    Returns:
	        The object returned by ``whisper.load_model(model_size)``.
	    """
	    global model, _loaded_model_size
	    if model is None or _loaded_model_size != model_size:
	        print(f"Loading Whisper {model_size} model...")
	        # Drop the reference to the previous checkpoint before loading the
	        # next one so the old model can be freed first.
	        model = None
	        model = whisper.load_model(model_size)
	        _loaded_model_size = model_size
	        print("Model loaded!")
	    return model

	def extract_audio_from_youtube(url, progress=gr.Progress()):
	    """Download a YouTube video's audio track into a fresh temp directory.

	    yt-dlp is asked for the best audio stream and its ``FFmpegExtractAudio``
	    post-processor is configured for MP3 at quality ``192``.

	    Args:
	        url: Video URL handed to yt-dlp.
	        progress: Gradio progress tracker used to report status.

	    Returns:
	        A ``(audio_file, title, duration)`` tuple: the path of the extracted
	        audio file, the video title (``'Unknown'`` when missing) and the
	        duration reported by yt-dlp (``0`` when missing).

	    Raises:
	        gr.Error: If the download fails or no audio file can be found
	            afterwards. The temp directory is removed in both cases.
	    """
	    progress(0.1, desc="YouTube 오디오 다운로드 중...")

	    temp_dir = tempfile.mkdtemp()
	    output_path = os.path.join(temp_dir, "audio")

	    ydl_opts = {
	        'format': 'bestaudio/best',
	        'postprocessors': [{
	            'key': 'FFmpegExtractAudio',
	            'preferredcodec': 'mp3',
	            'preferredquality': '192',
	        }],
	        'outtmpl': output_path,
	        'quiet': True,
	        'no_warnings': True,
	    }

	    try:
	        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
	            info = ydl.extract_info(url, download=True)
	            # `or` instead of a .get() default: a key that is present but
	            # None would otherwise leak None to the callers.
	            title = info.get('title') or 'Unknown'
	            duration = info.get('duration') or 0
	    except Exception as e:
	        # Do not leave a partially filled temp directory behind.
	        shutil.rmtree(temp_dir, ignore_errors=True)
	        raise gr.Error(f"YouTube 다운로드 실패: {str(e)}") from e

	    # The output template has no extension; MP3 is expected, the other
	    # extensions are probed as a fallback in case the file was not converted.
	    candidates = (
	        output_path + ext
	        for ext in ('.mp3', '.m4a', '.wav', '.webm', '.opus')
	    )
	    audio_file = next((path for path in candidates if os.path.exists(path)), None)

	    if audio_file is None:
	        shutil.rmtree(temp_dir, ignore_errors=True)
	        raise gr.Error("오디오 파일 추출 실패")

	    return audio_file, title, duration

	def format_timestamp(seconds):
	    """Convert a number of seconds to ``MM:SS`` or ``HH:MM:SS``.

	    The hour field is only included when it is non-zero. Fractional seconds
	    are truncated. ``None`` and negative values are rendered as ``00:00``
	    instead of raising or producing a wrapped-around time.

	    Args:
	        seconds: Duration or offset in seconds (int, float or ``None``).

	    Returns:
	        The zero-padded timestamp string.
	    """
	    total_seconds = max(int(seconds or 0), 0)
	    minutes, secs = divmod(total_seconds, 60)
	    hours, minutes = divmod(minutes, 60)
	    if hours > 0:
	        return f"{hours:02d}:{minutes:02d}:{secs:02d}"
	    return f"{minutes:02d}:{secs:02d}"

	def transcribe_youtube(url, model_size, language, output_format, progress=gr.Progress()):
	    """Main pipeline: YouTube URL -> transcript text and downloadable file.

	    Args:
	        url: YouTube video URL; surrounding whitespace is ignored.
	        model_size: Whisper checkpoint name passed to ``load_model``.
	        language: Language code forwarded to Whisper, or ``"auto"`` to omit
	            the option and let Whisper decide.
	        output_format: ``"텍스트만"`` for plain text, ``"타임스탬프 포함"``
	            for one ``[timestamp] text`` line per segment; any other value
	            produces SRT subtitles.
	        progress: Gradio progress tracker used to report status.

	    Returns:
	        A ``(info_text, transcript, txt_path)`` tuple: a summary of the
	        video and the run, the formatted transcript, and the path of a
	        UTF-8 text file containing the transcript.

	    Raises:
	        gr.Error: If the URL is empty, or propagated from
	            ``extract_audio_from_youtube``.
	    """
	    if not url or not url.strip():
	        raise gr.Error("YouTube URL을 입력해주세요.")
	    url = url.strip()

	    start_time = time.time()

	    # 1) Extract the audio track
	    audio_file, title, duration = extract_audio_from_youtube(url, progress)

	    try:
	        # 2) Load the Whisper model
	        progress(0.3, desc=f"Whisper {model_size} 모델 로딩 중...")
	        whisper_model = load_model(model_size)

	        # 3) Speech recognition
	        progress(0.5, desc="음성 인식 중... (영상 길이에 따라 시간이 소요됩니다)")

	        transcribe_opts = {
	            "verbose": False,
	            "fp16": False,
	        }
	        if language != "auto":
	            transcribe_opts["language"] = language

	        result = whisper_model.transcribe(audio_file, **transcribe_opts)
	    finally:
	        # The audio is only needed for transcription; remove it even when
	        # loading the model or transcribing fails.
	        _remove_downloaded_audio(audio_file)

	    progress(0.9, desc="결과 정리 중...")

	    # 4) Format the result
	    detected_lang = result.get("language", "unknown")
	    segments = result.get("segments", [])
	    transcript = _render_transcript(result, segments, output_format)

	    elapsed = time.time() - start_time

	    # 5) Summary shown next to the transcript. Joined from separate lines so
	    # that source indentation never ends up in the displayed text.
	    info_text = "\n".join([
	        f"📹 제목: {title}",
	        f"⏱️ 영상 길이: {format_timestamp(duration or 0)}",
	        f"🌐 감지된 언어: {detected_lang}",
	        f"📝 세그먼트 수: {len(segments)}",
	        f"⚡ 처리 시간: {elapsed:.1f}초",
	    ])

	    # 6) Save the transcript to a text file. This directory is not removed
	    # here because the returned path is handed to the file output.
	    txt_path = os.path.join(
	        tempfile.mkdtemp(),
	        f"{_safe_filename_stem(title)}_transcript.txt",
	    )
	    with open(txt_path, "w", encoding="utf-8") as f:
	        f.write(transcript)

	    progress(1.0, desc="완료!")

	    return info_text, transcript, txt_path


	def _render_transcript(result, segments, output_format):
	    """Render the transcription result in the requested output format."""
	    if output_format == "텍스트만":
	        return result["text"].strip()

	    if output_format == "타임스탬프 포함":
	        return "\n".join(
	            f"[{format_timestamp(seg['start'])}] {seg['text'].strip()}"
	            for seg in segments
	        )

	    # Every other value is treated as SRT subtitles: index line, time range,
	    # text, then a blank separator line per segment.
	    srt_lines = []
	    for index, seg in enumerate(segments, 1):
	        start_ts = _format_srt_timestamp(seg["start"])
	        end_ts = _format_srt_timestamp(seg["end"])
	        srt_lines.extend([
	            f"{index}",
	            f"{start_ts} --> {end_ts}",
	            seg["text"].strip(),
	            "",
	        ])
	    return "\n".join(srt_lines)


	def _format_srt_timestamp(seconds):
	    """Format a time offset in seconds as an SRT ``HH:MM:SS,mmm`` stamp."""
	    # Convert to whole milliseconds once; rounding avoids losing a
	    # millisecond to float error when the fraction is scaled separately.
	    total_ms = max(int(round(seconds * 1000)), 0)
	    total_seconds, millis = divmod(total_ms, 1000)
	    minutes, secs = divmod(total_seconds, 60)
	    hours, minutes = divmod(minutes, 60)
	    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


	def _safe_filename_stem(title, max_length=50):
	    """Turn a video title into a string that is safe inside a file name."""
	    # Path separators and other reserved characters in a title would make
	    # open() fail or write outside the intended directory.
	    cleaned = "".join(
	        "_" if ch in '\\/:*?"<>|' or not ch.isprintable() else ch
	        for ch in str(title or "")[:max_length]
	    ).strip(" .")
	    return cleaned or "video"


	def _remove_downloaded_audio(audio_file):
	    """Best-effort removal of the downloaded audio file and its directory."""
	    # Assumes the file sits alone in a dedicated temp directory, as created
	    # by extract_audio_from_youtube; os.rmdir only removes an empty directory.
	    try:
	        os.remove(audio_file)
	        os.rmdir(os.path.dirname(audio_file))
	    except OSError:
	        # Cleanup failures must not turn a finished transcription into an error.
	        pass


	# ==================== Gradio UI ====================
	# Page-level styles; passed to demo.launch() at the bottom of the file.
	css = """
	#title { text-align: center; margin-bottom: 0.5em; }
	#subtitle { text-align: center; color: #666; margin-bottom: 1.5em; }
	.output-text textarea { font-size: 14px !important; line-height: 1.6 !important; }
	"""

	# Dropdown options, kept next to each other so they are easy to compare
	# with the values transcribe_youtube checks for.
	_MODEL_CHOICES = ["tiny", "base", "small", "medium", "large"]
	_LANGUAGE_CHOICES = [
	    ("자동 감지", "auto"),
	    ("한국어", "ko"),
	    ("영어", "en"),
	    ("일본어", "ja"),
	    ("중국어", "zh"),
	]
	_OUTPUT_FORMAT_CHOICES = ["텍스트만", "타임스탬프 포함", "SRT 자막"]

	# Usage tips rendered as Markdown at the bottom of the page.
	_USAGE_TIPS = """
	---
	사용 팁:
	- `tiny`/`base`: 빠르지만 정확도 낮음 (CPU 환경 권장)
	- `small`/`medium`: 균형 잡힌 선택
	- `large`: 최고 정확도 (GPU 필수, 시간 소요)
	- 한국어 영상은 언어를 `한국어`로 지정하면 더 정확합니다
	"""

	# NOTE(review): the nesting of the layout below was reconstructed from a
	# source whose indentation was lost; blank lines were taken as the end of
	# each Row — confirm against the deployed layout.
	with gr.Blocks(title="YouTube Speech-to-Text") as demo:

	    # Page header
	    gr.HTML("<h1 id='title'>🎬 YouTube Speech-to-Text</h1>")
	    gr.HTML("<p id='subtitle'>YouTube 영상의 음성을 텍스트로 변환합니다</p>")

	    # First row: wide URL field next to the model selector
	    with gr.Row():
	        with gr.Column(scale=3):
	            url_input = gr.Textbox(
	                label="YouTube URL",
	                placeholder="https://www.youtube.com/watch?v=... 또는 https://youtu.be/...",
	                lines=1,
	            )
	        with gr.Column(scale=1):
	            model_size = gr.Dropdown(
	                choices=_MODEL_CHOICES,
	                value="base",
	                label="Whisper 모델",
	                info="크기가 클수록 정확하지만 느립니다",
	            )

	    # Second row: language and output format
	    with gr.Row():
	        language = gr.Dropdown(
	            choices=_LANGUAGE_CHOICES,
	            value="auto",
	            label="언어 설정",
	        )
	        output_format = gr.Dropdown(
	            choices=_OUTPUT_FORMAT_CHOICES,
	            value="타임스탬프 포함",
	            label="출력 형식",
	        )

	    run_btn = gr.Button("🚀 변환 시작", variant="primary", size="lg")

	    # Outputs: video summary, editable transcript, downloadable text file
	    with gr.Row():
	        info_output = gr.Textbox(label="📋 영상 정보", lines=5, interactive=False)

	    transcript_output = gr.Textbox(
	        label="📝 변환 결과",
	        lines=15,
	        interactive=True,
	        elem_classes=["output-text"],
	    )

	    file_output = gr.File(label="💾 텍스트 파일 다운로드")

	    # The three return values of transcribe_youtube map onto the three
	    # output components in this order.
	    run_btn.click(
	        fn=transcribe_youtube,
	        inputs=[url_input, model_size, language, output_format],
	        outputs=[info_output, transcript_output, file_output],
	    )

	    gr.Markdown(_USAGE_TIPS)

	if __name__ == "__main__":
	    demo.launch(
	        theme=gr.themes.Soft(),
	        css=css,
	    )