Spaces:
Build error
Build error
Commit ·
076147f
1
Parent(s): da72d94
app.py added
Browse files- app.py +125 -0
- audio_chunk_mapping_with_transcription_embeddings.csv +0 -0
- requirements.txt +6 -0
app.py
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from pydub import AudioSegment
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from sentence_transformers import SentenceTransformer
|
| 6 |
+
import whisper
|
| 7 |
+
|
| 8 |
+
# Speech-to-text model. Whisper ships "tiny", "base", "small", "medium" and
# "large" checkpoints; the "large" one is loaded here and pinned to the CPU.
audio_model = whisper.load_model("large", device="cpu")

# Sentence encoder used to embed the transcription of the incoming recording.
sentence_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Lookup table of database audio chunks, read from the CSV shipped next to
# this file. process_and_find_audio reads its "chunk_path" and "transcription"
# columns and treats every column from position 4 onward as embedding values.
df_mapping = pd.read_csv("audio_chunk_mapping_with_transcription_embeddings.csv")
|
| 17 |
+
|
| 18 |
+
# Function to process the input audio and retrieve the most similar audio chunk
|
| 19 |
+
def process_and_find_audio(audio_file):
    """Transcribe a recording and return the closest matching database chunk.

    The recording is written to a temporary WAV file, translated to text with
    the module-level Whisper model, embedded with the module-level sentence
    encoder, and compared against the precomputed embeddings stored in
    ``df_mapping`` (every column from position 4 onward).

    Args:
        audio_file: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when the component has
            been cleared.

    Returns:
        A ``(transcription, chunk_path)`` tuple taken from the most similar
        row of ``df_mapping``, or ``("", None)`` when there is no audio to
        process.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    # Gradio fires `change` with None when the recorder is cleared.
    if audio_file is None:
        return "", None

    sample_rate, audio_np = audio_file
    audio_np = np.asarray(audio_np)
    if audio_np.size == 0:
        return "", None

    # pydub needs integer PCM. Float input is assumed to be normalised to
    # [-1, 1] (TODO confirm against the recorder) and is rescaled to int16.
    if np.issubdtype(audio_np.dtype, np.floating):
        audio_np = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

    # A 2-D array is treated as (samples, channels); its C-order bytes are
    # then already interleaved frames, which is the layout pydub expects.
    channels = audio_np.shape[1] if audio_np.ndim > 1 else 1
    audio_segment = AudioSegment(
        audio_np.tobytes(),
        frame_rate=sample_rate,
        sample_width=audio_np.dtype.itemsize,
        channels=channels,
    )

    # A per-call temporary directory avoids the machine-specific hard-coded
    # path and keeps concurrent requests from overwriting each other's file.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = os.path.join(tmp_dir, "temp_audio.wav")
        # export() returns the open output handle; close it before reading.
        audio_segment.export(audio_path, format="wav").close()
        transcription = audio_model.transcribe(audio_path, task="translate")["text"]

    # Database embeddings are precomputed in the CSV; only the query is encoded.
    embeddings = df_mapping.iloc[:, 4:].to_numpy().astype("float32")
    embedding_query = sentence_model.encode(transcription)

    # Pick the database row whose embedding is most similar to the query.
    similarities = sentence_model.similarity(embeddings, embedding_query)
    index_of_most_similar_item = int(similarities.argmax())

    # argmax yields a position, so select the row positionally, not by label.
    matched_row = df_mapping.iloc[index_of_most_similar_item]
    matched_chunk_path = matched_row["chunk_path"]
    matched_chunk_text = matched_row["transcription"]
    print(matched_chunk_path, matched_chunk_text)

    return matched_chunk_text, matched_chunk_path
|
| 54 |
+
# return matched_chunk_text, (matched_audio_segment.frame_rate, matched_audio_np)
|
| 55 |
+
|
| 56 |
+
# Set up the Gradio interface
|
| 57 |
+
with gr.Blocks() as demo:
    # Page heading shown above the widgets.
    gr.Markdown("### Upload an audio file and retrieve the most similar database audio.")

    # Query input: the recording is handed to the callback as a
    # (sample_rate, numpy array) tuple.
    mic = gr.Audio(label="Record Your Audio", type="numpy")

    # Result widgets: matched transcription text and matched chunk playback.
    output_text = gr.Textbox(label="Matched Transcription")
    output_audio = gr.Audio(label="Matched Audio Playback")

    # Re-run the search whenever the recorded audio changes.
    mic.change(
        fn=process_and_find_audio,
        inputs=mic,
        outputs=[output_text, output_audio],
    )

# Start the app.
demo.launch(share=True)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
# import gradio as gr
|
| 86 |
+
# from pydub import AudioSegment
|
| 87 |
+
# import numpy as np
|
| 88 |
+
# from io import BytesIO
|
| 89 |
+
|
| 90 |
+
# # Simulated function to fetch audio from a "database"
|
| 91 |
+
# def get_audio_from_database():
|
| 92 |
+
# # Replace with actual database retrieval logic
|
| 93 |
+
# return "/home/shashank/Desktop/ml_fiest/Dataset/SandalWoodNewsStories_2.mp3" # Example path to an audio file
|
| 94 |
+
|
| 95 |
+
# # Define the function that takes the user-recorded audio and returns database audio
|
| 96 |
+
# def process_audio(user_audio):
|
| 97 |
+
# # Process the user audio if needed
|
| 98 |
+
# # Here we’re just passing it through without saving for demonstration purposes
|
| 99 |
+
|
| 100 |
+
# # Get the database audio
|
| 101 |
+
# db_audio_path = get_audio_from_database()
|
| 102 |
+
# db_audio_segment = AudioSegment.from_file(db_audio_path)
|
| 103 |
+
|
| 104 |
+
# # Convert db audio to numpy array and sample rate
|
| 105 |
+
# buffer = BytesIO()
|
| 106 |
+
# db_audio_segment.export(buffer, format="wav")
|
| 107 |
+
# buffer.seek(0)
|
| 108 |
+
# db_audio = np.frombuffer(buffer.read(), dtype=np.int16)
|
| 109 |
+
|
| 110 |
+
# # Return the database audio as a response
|
| 111 |
+
# return (db_audio_segment.frame_rate, db_audio)
|
| 112 |
+
|
| 113 |
+
# # Set up the Gradio interface
|
| 114 |
+
# with gr.Blocks() as demo:
|
| 115 |
+
# gr.Markdown("### Record your audio and play a sample from the database.")
|
| 116 |
+
|
| 117 |
+
# # Define microphone input and audio output without `source` argument
|
| 118 |
+
# mic = gr.Audio(type="numpy", label="Record Your Audio")
|
| 119 |
+
# output_audio = gr.Audio(label="Database Audio Response")
|
| 120 |
+
|
| 121 |
+
# # Connect the function with Gradio components
|
| 122 |
+
# mic.change(process_audio, inputs=mic, outputs=output_audio)
|
| 123 |
+
|
| 124 |
+
# # Launch the app
|
| 125 |
+
# demo.launch(share=True)
|
audio_chunk_mapping_with_transcription_embeddings.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
pydub
|
| 2 |
+
gradio
|
| 3 |
+
numpy
|
| 4 |
+
pandas
|
| 5 |
+
sentence_transformers
|
| 6 |
+
openai-whisper
|