Spaces:

ShashankSS1205
/

ml_fiesta

Build error

App Files Files Community

ShashankSS1205 committed on Nov 9, 2024

Commit

076147f

1 Parent(s): da72d94

app.py added

Browse files

Files changed (3) hide show

app.py +125 -0
audio_chunk_mapping_with_transcription_embeddings.csv +0 -0
requirements.txt +6 -0

app.py ADDED Viewed

	@@ -0,0 +1,125 @@

+import gradio as gr
+from pydub import AudioSegment
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import whisper
+# Speech-to-text model: the Whisper "large" checkpoint, loaded on the CPU.
+# (Other checkpoint names: "tiny", "base", "small", "medium".)
+audio_model = whisper.load_model("large", device='cpu')
+# Sentence-embedding model; process_and_find_audio uses it to embed the
+# transcription and to score it against the stored embeddings.
+sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+# Lookup table of stored audio chunks, read once at import time. The path is
+# relative, so it is resolved against the current working directory.
+# process_and_find_audio reads its "chunk_path" and "transcription" columns and
+# treats every column from position 4 onward as embedding values.
+# NOTE(review): the embedding columns were presumably produced with the same
+# sentence model loaded above -- confirm against the script that built the CSV.
+df_mapping = pd.read_csv('audio_chunk_mapping_with_transcription_embeddings.csv')
+# Function to process the input audio and retrieve the most similar audio chunk
+def process_and_find_audio(audio_file):
+    # Load audio from the file path
+    audio_path = "/teamspace/studios/this_studio/temp_audio.wav"  # Path to temporarily save audio if needed
+    sample_rate,audio_np  = audio_file
+    # Save the numpy array as an audio file if you need to pass it to the Whisper model
+    audio_segment = AudioSegment(
+        audio_np.tobytes(),
+        frame_rate=sample_rate,  # Set the frame rate as appropriate
+        sample_width=2,  # Assuming 16-bit samples (adjust if necessary)
+        channels=1  # Assuming mono channel (adjust if necessary)
+    )
+    # audio_file = audio_path
+    # audio_segment.export(audio_file, format="wav")
+    # Save the audio to a temporary file
+    audio_segment.export(audio_path, format="wav")
+    # audio_path = audio_file.name
+    transcription = audio_model.transcribe(audio_path, task="translate")['text']
+    # Compute embeddings for database transcriptions and user transcription
+    # sentences = df_mapping['transcription'].tolist()
+    # embeddings = model.encode(sentences)
+    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype('float32')
+    embedding_query = sentence_model.encode(transcription)
+    # Find the most similar transcription
+    similarities = sentence_model.similarity(embeddings, embedding_query)
+    index_of_most_similar_item = int(similarities.argmax())
+    # Retrieve the matching audio chunk path and transcription
+    matched_chunk_path = df_mapping.loc[index_of_most_similar_item, "chunk_path"]
+    matched_chunk_text = df_mapping.loc[index_of_most_similar_item, "transcription"]
+    print(matched_chunk_path, matched_chunk_text)
+    # Return the text and audio data
+    return matched_chunk_text, matched_chunk_path
+    # return matched_chunk_text, (matched_audio_segment.frame_rate, matched_audio_np)
+# Set up the Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("### Upload an audio file and retrieve the most similar database audio.")
+    # Use gr.File for file upload and define text + audio outputs
+    # audio_input = gr.File(label="Upload Your Audio")
+    mic = gr.Audio(type="numpy", label="Record Your Audio")
+    output_text = gr.Textbox(label="Matched Transcription")
+    output_audio = gr.Audio(label="Matched Audio Playback")
+    # Link the function to Gradio inputs and outputs
+    # audio_input.change(process_and_find_audio, inputs=audio_input, outputs=[output_text, output_audio])
+    mic.change(process_and_find_audio, inputs=mic, outputs=[output_text, output_audio])
+# Launch the app
+demo.launch(share=True)
+# import gradio as gr
+# from pydub import AudioSegment
+# import numpy as np
+# from io import BytesIO
+# # Simulated function to fetch audio from a "database"
+# def get_audio_from_database():
+#     # Replace with actual database retrieval logic
+#     return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3"  # Example path to an audio file
+# # Define the function that takes the user-recorded audio and returns database audio
+# def process_audio(user_audio):
+#     # Process the user audio if needed
+#     # Here we’re just passing it through without saving for demonstration purposes
+#     # Get the database audio
+#     db_audio_path = get_audio_from_database()
+#     db_audio_segment = AudioSegment.from_file(db_audio_path)
+#     # Convert db audio to numpy array and sample rate
+#     buffer = BytesIO()
+#     db_audio_segment.export(buffer, format="wav")
+#     buffer.seek(0)
+#     db_audio = np.frombuffer(buffer.read(), dtype=np.int16)
+#     # Return the database audio as a response
+#     return (db_audio_segment.frame_rate, db_audio)
+# # Set up the Gradio interface
+# with gr.Blocks() as demo:
+#     gr.Markdown("### Record your audio and play a sample from the database.")
+#     # Define microphone input and audio output without `source` argument
+#     mic = gr.Audio(type="numpy", label="Record Your Audio")
+#     output_audio = gr.Audio(label="Database Audio Response")
+#     # Connect the function with Gradio components
+#     mic.change(process_audio, inputs=mic, outputs=output_audio)
+# # Launch the app
+# demo.launch(share=True)

audio_chunk_mapping_with_transcription_embeddings.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+pydub
+gradio
+numpy
+pandas
+sentence_transformers
+openai-whisper