# app.py — Whisper transcription/translation Gradio demo
# (Hugging Face Space page residue — "AiCoderv2 / Update app.py / 494e59f verified" —
# converted to a comment so the file parses as Python.)
from transformers import pipeline
import gradio as gr
import torch
# Display label -> Hugging Face model id for the ASR pipeline.
# Ordered smallest/fastest to largest/most accurate; the label is what the
# user picks in the dropdown, the id is what `pipeline(model=...)` loads.
MODEL_OPTIONS = {
    "Whisper Tiny (Fastest)": "openai/whisper-tiny",
    "Whisper Base (Balanced)": "openai/whisper-base",
    "Whisper Small (Better Accuracy)": "openai/whisper-small",
    "Whisper Medium (High Accuracy)": "openai/whisper-medium",
    "Whisper Large (Highest Accuracy)": "openai/whisper-large",  # New model
    "Whisper Large-v2 (Latest)": "openai/whisper-large-v2"  # New model
}
# Display label -> language code passed to Whisper's generate_kwargs.
# None means "Auto-detect": no language hint is sent and Whisper infers
# the spoken language itself (see transcribe_audio).
LANGUAGE_CODES = {
    "Auto-detect": None,
    "English": "en",
    "Spanish": "es",
    "French": "fr",
    "German": "de",
    "Italian": "it",
    "Portuguese": "pt",
    "Russian": "ru",
    "Chinese": "zh",
    "Japanese": "ja",
    "Korean": "ko",
    "Arabic": "ar",
    "Hindi": "hi",
    "Dutch": "nl"
}
def transcribe_audio(audio_file, model_choice, task_choice, language_choice, timestamp_choice, beam_size):
# Initialize the pipeline with selected model
model_name = MODEL_OPTIONS[model_choice]
task = "translate" if task_choice == "Translate to English" else "transcribe"
language = LANGUAGE_CODES[language_choice]
# Create pipeline
pipe = pipeline(
"automatic-speech-recognition",
model=model_name,
chunk_length_s=30,
device=0 if torch.cuda.is_available() else -1
)
# Generate kwargs for the pipeline
generate_kwargs = {
"task": task,
"num_beams": beam_size
}
if language and task == "transcribe":
generate_kwargs["language"] = language
# Process audio file
if timestamp_choice:
result = pipe(
audio_file,
generate_kwargs=generate_kwargs,
return_timestamps=True
)
timestamp_text = "\n".join([
f"[{chunk['timestamp'][0]:.2f}s -> {chunk['timestamp'][1]:.2f}s] {chunk['text']}"
for chunk in result.get("chunks", [])
])
return result["text"], timestamp_text, gr.update(visible=True)
else:
result = pipe(
audio_file,
generate_kwargs=generate_kwargs,
return_timestamps=False
)
return result["text"], "", gr.update(visible=False)
# --- UI layout: inputs on the left, transcription outputs on the right ---
with gr.Blocks() as demo:
    gr.Markdown("# 🎵 Audio Transcription & Translation")
    gr.Markdown("Upload an audio file or use your microphone to transcribe or translate speech.")
    with gr.Row():
        with gr.Column():
            # filepath mode: transcribe_audio receives a path string, not raw samples
            audio_input = gr.Audio(
                label="Audio Input",
                type="filepath"
            )
            # Updated model selection with new models
            model_choice = gr.Dropdown(
                choices=list(MODEL_OPTIONS.keys()),
                value="Whisper Tiny (Fastest)",
                label="Model Selection"
            )
            task_choice = gr.Radio(
                choices=["Transcribe", "Translate to English"],
                value="Transcribe",
                label="Task"
            )
            # Extended language options; only used for the transcribe task
            language_choice = gr.Dropdown(
                choices=list(LANGUAGE_CODES.keys()),
                value="Auto-detect",
                label="Language (for transcription)"
            )
            # New features
            timestamp_choice = gr.Checkbox(
                label="Include Timestamps",
                value=False
            )
            beam_size = gr.Slider(
                minimum=1,
                maximum=10,
                value=1,
                step=1,
                label="Beam Size (Higher = Better Accuracy but Slower)"
            )
        with gr.Column():
            text_output = gr.Textbox(
                lines=15,
                label="Transcription",
                interactive=False
            )
            # New output for timestamps; hidden until the handler makes it visible
            timestamp_output = gr.Textbox(
                lines=8,
                label="Timestamps (if enabled)",
                interactive=False,
                visible=False
            )
    transcribe_btn = gr.Button("Transcribe Audio", variant="primary")
    # NOTE(review): timestamp_output appears twice — the 2nd return value fills
    # its text and the 3rd (a gr.update) toggles its visibility. Some Gradio
    # versions reject duplicate output components; confirm against the pinned
    # gradio version.
    transcribe_btn.click(
        transcribe_audio,
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
        outputs=[text_output, timestamp_output, timestamp_output]
    )
    # NOTE(review): these example audio files must exist alongside app.py in
    # the Space repo, otherwise the Examples panel fails to load — verify.
    gr.Examples(
        examples=[
            ["example_audio_1.wav", "Whisper Tiny (Fastest)", "Transcribe", "Auto-detect", False, 1],
            ["example_audio_2.wav", "Whisper Base (Balanced)", "Transcribe", "English", False, 1],
            ["example_audio_3.wav", "Whisper Small (Better Accuracy)", "Translate to English", "Auto-detect", False, 1],
            ["example_audio_4.wav", "Whisper Large (Highest Accuracy)", "Transcribe", "Spanish", True, 3]
        ],
        inputs=[audio_input, model_choice, task_choice, language_choice, timestamp_choice, beam_size],
    )
    gr.Markdown("### Features")
    gr.Markdown("- **Model Selection**: Choose from 6 different Whisper models with speed/accuracy tradeoffs")
    gr.Markdown("- **Task Options**: Transcribe audio in original language or translate to English")
    gr.Markdown("- **Language Selection**: Auto-detect or specify input language for better accuracy")
    gr.Markdown("- **Multiple Input Methods**: Upload audio files or record with microphone")
    gr.Markdown("- **Timestamps**: Option to include word-level timestamps")
    gr.Markdown("- **Beam Search**: Adjustable beam size for better accuracy")
    gr.Markdown("### Model Information")
    gr.Markdown("""
| Model | Parameters | Speed | Best For |
|-------|------------|-------|----------|
| Whisper Tiny | 39M | Fastest | Quick transcriptions, low resources |
| Whisper Base | 74M | Fast | Balanced performance |
| Whisper Small | 244M | Medium | Better accuracy |
| Whisper Medium | 769M | Slow | High accuracy transcriptions |
| Whisper Large | 1.5B | Slower | Very high accuracy |
| Whisper Large-v2 | 1.5B | Slower | Latest improvements |
""")
    gr.Markdown("- **Supported Formats**: WAV, MP3, M4A, FLAC")
    gr.Markdown("- **Note**: First transcription may take 10-60 seconds (model loading)")
if __name__ == "__main__":
    demo.launch()