# nihongoMiniteto / app.py
# (Hugging Face Spaces page residue preserved as comments:
#  uploaded by machinelearnAn, "Update app.py", commit 91c6a66 verified)
import os
import gradio as gr
from faster_whisper import WhisperModel
import soundfile as sf
# Fixed config for this app (overridable via environment variables)
MODEL_NAME = os.getenv("MODEL_NAME", "Systran/faster-whisper-small")  # fast & accurate enough for short clips
LANGUAGE = "ja"  # force Japanese
VAD = os.getenv("VAD_FILTER", "1") == "1"  # voice-activity-detection filter on unless VAD_FILTER=0
MAX_SECONDS = int(os.getenv("MAX_SECONDS", "120"))  # 2 minutes

# Lazily-initialized model singleton; populated on first get_model() call.
_model = None
def get_model():
    """Return the shared WhisperModel, loading it on first use.

    Tries GPU configurations first (float16, then int8_float16) and
    falls back to CPU int8; raises RuntimeError if every attempt fails.
    """
    global _model
    if _model is None:
        candidates = (
            ("cuda", "float16"),
            ("cuda", "int8_float16"),
            ("cpu", "int8"),
        )
        for device, compute_type in candidates:
            try:
                _model = WhisperModel(MODEL_NAME, device=device, compute_type=compute_type)
            except Exception as e:
                # Keep trying the next device/precision combination.
                print(f"[load-failed] {device}/{compute_type}: {e}")
            else:
                print(f"[load] {MODEL_NAME} on {device}/{compute_type}")
                return _model
        raise RuntimeError("Unable to load model.")
    return _model
def transcribe_upload(audio_path):
    """Transcribe an uploaded Japanese audio file and return its text.

    Returns a Japanese error message when no file was selected or when
    the clip exceeds MAX_SECONDS; the duration check is best-effort and
    is skipped (with a warning) if the file cannot be read.
    """
    if not audio_path:
        return "ファイルが選択されていません。"
    # Best-effort duration guard: reject clips longer than MAX_SECONDS.
    try:
        samples, sample_rate = sf.read(audio_path)
        seconds = len(samples) / float(sample_rate)
        if seconds > MAX_SECONDS:
            return f"音声が長すぎます({seconds:.1f}秒)。最大{MAX_SECONDS}秒のファイルのみ対応しています。"
    except Exception as e:
        print(f"[warn] duration check failed: {e}")
    segments, _info = get_model().transcribe(
        audio_path,
        language=LANGUAGE,  # fixed: Japanese
        task="transcribe",
        vad_filter=VAD,
    )
    pieces = [seg.text for seg in segments]
    return "".join(pieces).strip()
# Build the upload-only Gradio UI and start serving it.
with gr.Blocks() as demo:
    gr.Markdown(
        "# 🇯🇵 日本語 音声→テキスト(アップロードのみ)\n"
        "- 日本語の音声ファイル(最大2分)をアップロードしてください。\n"
        "- 変換後のテキストが下に表示されます。"
    )
    audio = gr.Audio(sources=["upload"], type="filepath", label="音声ファイルをアップロード(<2分)")
    out = gr.Textbox(lines=8, label="テキスト")
    run_btn = gr.Button("文字起こし")
    run_btn.click(transcribe_upload, inputs=[audio], outputs=[out])
demo.launch()