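"""Voice-to-text Gradio Space.

Streams microphone audio into openai/whisper-large-v3 for live
transcription, and also transcribes uploaded audio files.
"""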
import gradio as gr
import numpy as np
import torch
from transformers import pipeline, AutoModelForSpeechSeq2Seq, AutoProcessor

model_id = "openai/whisper-large-v3"
device = "cuda:0" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, torch_dtype=torch_dtype).to(device)
processor = AutoProcessor.from_pretrained(model_id)

pipe_asr = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=16,
    torch_dtype=torch_dtype,
    device=device,
    return_timestamps=False,
)


def ensure_mono(y):
    # Down-mix multi-channel audio to a single channel; Whisper expects 1-D input.
    if y.ndim > 1 and y.shape[1] > 1:
        y = np.mean(y, axis=1)
    return y


def normalize(y):
    # Scale to [-1, 1]; skip the division for all-zero (silent) chunks to avoid NaNs.
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    return y / peak if peak > 0 else y


def transcribe_function(new_chunk, state):
    # Each stream event delivers only the newest (sample_rate, samples) chunk,
    # so the full recording is accumulated in `state` and re-transcribed.
    try:
        sr, y = new_chunk
    except TypeError:
        print(f"Error chunk structure: {type(new_chunk)}, content: {new_chunk}")
        return state, ""  # two outputs, matching outputs=[state, output_text]
    y = normalize(ensure_mono(y))
    state = y if state is None else np.concatenate([state, y])
    result = pipe_asr({"array": state, "sampling_rate": sr})
    return state, result.get("text", "")


def upload_transcribe(file):
    if file is None:  # .change also fires when the upload is cleared
        return ""
    sr, y = file
    y = normalize(ensure_mono(y))
    result = pipe_asr({"array": y, "sampling_rate": sr})
    return result.get("text", "")


with gr.Blocks() as demo:
    gr.Markdown("# Voice to Text Transcription")
    state = gr.State(None)
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(sources=["microphone"], streaming=True, type="numpy", label="Microphone Input")
            audio_upload = gr.Audio(sources=["upload"], type="numpy", label="Upload Audio File")
        with gr.Column():
            output_text = gr.Textbox(label="Transcription")
            upload_text = gr.Textbox(label="Uploaded Audio Transcription")
    audio_input.stream(
        transcribe_function,
        inputs=[audio_input, state],
        outputs=[state, output_text],
        api_name="SAMLOne_real_time",
    )
    audio_upload.change(upload_transcribe, inputs=audio_upload, outputs=upload_text)

demo.launch(show_error=True)
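# To run locally (assumed setup; the Space's own requirements.txt is not shown):
#   pip install gradio numpy torch torchaudio transformers
#   python app.py
# torchaudio is included because the transformers ASR pipeline generally needs it
# to resample raw arrays whose sampling rate differs from Whisper's 16 kHz.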