Spaces:

ShashankSS1205
/

ml_fiesta

Sleeping

App Files Files Community

ml_fiesta / app.py

ShashankSS1205

dataset added

d675cd3 over 1 year ago

raw

history blame contribute delete

4.79 kB

	import gradio as gr
	from pydub import AudioSegment
	import numpy as np
	import pandas as pd
	from sentence_transformers import SentenceTransformer
	import whisper

	# Speech-to-text model: the Whisper "large" checkpoint, pinned to the CPU.
	# Other checkpoint names are "tiny", "base", "small" and "medium".
	audio_model = whisper.load_model(name="large", device="cpu")

	# Sentence encoder used to embed the transcription of the query audio.
	sentence_model = SentenceTransformer(
	    model_name_or_path="sentence-transformers/all-MiniLM-L6-v2"
	)

	# Lookup table loaded from CSV. process_and_find_audio reads its
	# "chunk_path" and "transcription" columns and treats every column from
	# position 4 onward as embedding values.
	df_mapping = pd.read_csv(
	    filepath_or_buffer="audio_chunk_mapping_with_transcription_embeddings.csv"
	)

	# Function to process the input audio and retrieve the most similar audio chunk
	def process_and_find_audio(audio_file):
	    """Transcribe a recording and return the closest stored audio chunk.

	    The recording is written to a WAV file, transcribed by Whisper with
	    ``task="translate"``, embedded with the sentence model, and compared
	    against the embeddings stored in ``df_mapping``.

	    Args:
	        audio_file: ``(sample_rate, samples)`` tuple as delivered by
	            ``gr.Audio(type="numpy")``, or ``None`` when the component is
	            empty. ``samples`` is 1-D for mono audio or shaped
	            ``(samples, channels)`` for multi-channel audio.

	    Returns:
	        A ``(transcription, chunk_path)`` tuple taken from the best-matching
	        row of ``df_mapping``, or ``("", None)`` when there is no audio to
	        process.
	    """
	    # The change event also fires when the input is cleared; there is
	    # nothing to unpack in that case.
	    if audio_file is None:
	        return "", None

	    sample_rate, audio_np = audio_file
	    samples = np.asarray(audio_np)
	    if samples.size == 0:
	        return "", None

	    # Raw PCM needs integer samples; scale float audio in [-1, 1] to int16.
	    if np.issubdtype(samples.dtype, np.floating):
	        samples = (np.clip(samples, -1.0, 1.0) * 32767).astype(np.int16)

	    # Derive the layout from the array instead of assuming 16-bit mono.
	    # A 2-D array serialises row by row, i.e. as interleaved frames.
	    channels = 1 if samples.ndim == 1 else samples.shape[1]
	    audio_segment = AudioSegment(
	        samples.tobytes(),
	        frame_rate=sample_rate,
	        sample_width=samples.dtype.itemsize,
	        channels=channels,
	    )

	    # Whisper is given a file path, so write the recording to disk first.
	    # The same path is overwritten on every call.
	    audio_path = "./temp_audio.wav"
	    audio_segment.export(audio_path, format="wav")

	    transcription = audio_model.transcribe(audio_path, task="translate")['text']

	    # Columns from position 4 onward are used as the stored embedding
	    # vectors (one row per chunk); compare them with the query embedding.
	    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype('float32')
	    embedding_query = sentence_model.encode(transcription)
	    similarities = sentence_model.similarity(embeddings, embedding_query)
	    best_row = int(similarities.argmax())

	    # argmax yields a row position, not an index label, so select
	    # positionally with iloc.
	    matched_chunk_path = df_mapping["chunk_path"].iloc[best_row]
	    matched_chunk_text = df_mapping["transcription"].iloc[best_row]
	    print(matched_chunk_path, matched_chunk_text)

	    return matched_chunk_text, matched_chunk_path

	# Set up the Gradio interface
	with gr.Blocks() as demo:
	    gr.Markdown(
	        value="### Upload an audio file and retrieve the most similar database audio."
	    )

	    # Input: audio handed to the callback as a (sample_rate, samples) pair.
	    mic = gr.Audio(type="numpy", label="Record Your Audio")

	    # Outputs: the matched transcription and the matched audio chunk.
	    output_text = gr.Textbox(label="Matched Transcription")
	    output_audio = gr.Audio(label="Matched Audio Playback")

	    # Run the lookup whenever the audio input changes.
	    mic.change(
	        fn=process_and_find_audio,
	        inputs=mic,
	        outputs=[output_text, output_audio],
	    )

	# Start the app; share=True asks Gradio for a public share link.
	demo.launch(share=True)













	# import gradio as gr
	# from pydub import AudioSegment
	# import numpy as np
	# from io import BytesIO

	# # Simulated function to fetch audio from a "database"
	# def get_audio_from_database():
	# # Replace with actual database retrieval logic
	# return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3" # Example path to an audio file

	# # Define the function that takes the user-recorded audio and returns database audio
	# def process_audio(user_audio):
	# # Process the user audio if needed
	# # Here we’re just passing it through without saving for demonstration purposes

	# # Get the database audio
	# db_audio_path = get_audio_from_database()
	# db_audio_segment = AudioSegment.from_file(db_audio_path)

	# # Convert db audio to numpy array and sample rate
	# buffer = BytesIO()
	# db_audio_segment.export(buffer, format="wav")
	# buffer.seek(0)
	# db_audio = np.frombuffer(buffer.read(), dtype=np.int16)

	# # Return the database audio as a response
	# return (db_audio_segment.frame_rate, db_audio)

	# # Set up the Gradio interface
	# with gr.Blocks() as demo:
	# gr.Markdown("### Record your audio and play a sample from the database.")

	# # Define microphone input and audio output without `source` argument
	# mic = gr.Audio(type="numpy", label="Record Your Audio")
	# output_audio = gr.Audio(label="Database Audio Response")

	# # Connect the function with Gradio components
	# mic.change(process_audio, inputs=mic, outputs=output_audio)

	# # Launch the app
	# demo.launch(share=True)