Rivalcoder committed on
Commit 5a416b3 · 1 Parent(s): 9d97df3
Add Files
- alm_pipeline.py +50 -0
- app.py +29 -0
- reasoning.py +18 -0
- requirements.txt +10 -0
alm_pipeline.py
ADDED
@@ -0,0 +1,50 @@
import csv

import whisper
import librosa
import numpy as np
import tensorflow_hub as hub

# Load ASR
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

# YAMNet ships its class labels as a CSV alongside the model;
# build an index -> display name lookup from it
class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
with open(class_map_path) as f:
    class_names = [row["display_name"] for row in csv.DictReader(f)]

# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    mean_val = activation.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    elif mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"


def speech_to_text(audio):
    result = asr_model.transcribe(audio)
    return result["text"]


def detect_sound(audio):
    # YAMNet expects a mono 16 kHz float32 waveform of shape (num_samples,)
    waveform, sr = librosa.load(audio, sr=16000)
    scores, embeddings, _ = yamnet(waveform)
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = np.argmax(mean_scores)
    return class_names[top_idx], mean_scores.max()


def analyze_audio(audio_file):
    summary = {}

    summary["transcription"] = speech_to_text(audio_file)

    event, confidence = detect_sound(audio_file)
    summary["sound_event"] = event
    summary["sound_confidence"] = float(confidence)

    # Placeholder: estimate_emotion() is not wired into this version
    summary["emotion"] = "Neutral (approx)"

    summary["speakers"] = "Not available in HF-free version"

    return summary
app.py
ADDED
@@ -0,0 +1,29 @@
import gradio as gr
from alm_pipeline import analyze_audio
from reasoning import generate_reasoning

def full_pipeline(audio, question):
    if audio is None:
        return "No audio uploaded", ""

    summary = analyze_audio(audio)
    answer = generate_reasoning(summary, question)

    return summary, answer


ui = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath"),
        gr.Textbox(label="Ask a Question About the Audio")
    ],
    outputs=[
        gr.JSON(label="Audio Understanding Summary"),
        gr.Textbox(label="Reasoning Answer")
    ],
    title="ALM Prototype – Hugging Face Edition",
    description="Upload audio → detect speech, sounds → ask any question → get AI reasoning."
)

ui.launch()
reasoning.py
ADDED
@@ -0,0 +1,18 @@
from transformers import pipeline

# Load lightweight reasoning model
reasoner = pipeline("text2text-generation", model="google/flan-t5-large")

def generate_reasoning(summary, question):
    prompt = f"""
Audio Summary:
Speech: {summary['transcription']}
Main Sound Event: {summary['sound_event']}
Emotion: {summary['emotion']}
Speakers: {summary['speakers']}

Question: {question}
Provide a detailed reasoning-based answer using the audio cues.
"""
    result = reasoner(prompt, max_length=200)[0]["generated_text"]
    return result
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio
git+https://github.com/openai/whisper.git
torch
torchaudio
tensorflow
tensorflow_hub
librosa
transformers
accelerate
sentencepiece