Rivalcoder committed on
Commit
5a416b3
·
1 Parent(s): 9d97df3
Files changed (4)
  1. alm_pipeline.py +59 -0
  2. app.py +30 -0
  3. reasoning.py +18 -0
  4. requirements.txt +10 -0
alm_pipeline.py ADDED
@@ -0,0 +1,59 @@
+ import csv
+
+ import whisper
+ import librosa
+ import numpy as np
+ import tensorflow as tf
+ import tensorflow_hub as hub
+
+ # Load ASR
+ asr_model = whisper.load_model("small")
+
+ # Load YAMNet for sound classification
+ yamnet = hub.load("https://tfhub.dev/google/yamnet/1")
+
+ # class_map_path() returns the path to a CSV of class labels; parse it into
+ # a list of names (indexing the raw path bytes would yield single characters)
+ class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
+ with tf.io.gfile.GFile(class_map_path) as f:
+     class_names = [row["display_name"] for row in csv.DictReader(f)]
+
+ # Simple emotion estimator: a crude heuristic over the mean YAMNet embedding
+ def estimate_emotion(embedding):
+     mean_val = embedding.mean()
+     if mean_val > 0.3:
+         return "Happy / Excited"
+     elif mean_val < -0.3:
+         return "Sad / Depressed"
+     return "Neutral"
+
+
+ def speech_to_text(audio):
+     result = asr_model.transcribe(audio)
+     return result["text"]
+
+
+ def detect_sound(audio):
+     # YAMNet expects a mono 16 kHz waveform as a 1-D float array,
+     # so no reshape is needed after librosa.load
+     waveform, _ = librosa.load(audio, sr=16000)
+     scores, embeddings, _ = yamnet(waveform)
+     mean_scores = np.mean(scores.numpy(), axis=0)
+     top_idx = np.argmax(mean_scores)
+     return class_names[top_idx], float(mean_scores[top_idx]), embeddings.numpy()
+
+
+ def analyze_audio(audio_file):
+     summary = {}
+
+     summary["transcription"] = speech_to_text(audio_file)
+
+     event, confidence, embeddings = detect_sound(audio_file)
+     summary["sound_event"] = event
+     summary["sound_confidence"] = confidence
+
+     summary["emotion"] = estimate_emotion(embeddings)
+
+     summary["speakers"] = "Not available in HF-free version"
+
+     return summary
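
For a quick sanity check of the analysis step outside the UI, a minimal sketch (sample.wav is a hypothetical local file, not part of this commit):

    # smoke-test the pipeline directly; sample.wav is a placeholder path
    from alm_pipeline import analyze_audio

    summary = analyze_audio("sample.wav")
    print(summary["transcription"])
    print(summary["sound_event"], summary["sound_confidence"])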
app.py ADDED
@@ -0,0 +1,30 @@
+ import gradio as gr
+ from alm_pipeline import analyze_audio
+ from reasoning import generate_reasoning
+
+ def full_pipeline(audio, question):
+     if audio is None:
+         # return a dict so the gr.JSON output renders cleanly
+         return {"error": "No audio uploaded"}, ""
+
+     summary = analyze_audio(audio)
+     answer = generate_reasoning(summary, question)
+
+     return summary, answer
+
+
+ ui = gr.Interface(
+     fn=full_pipeline,
+     inputs=[
+         gr.Audio(sources=["upload"], type="filepath"),
+         gr.Textbox(label="Ask a Question About the Audio")
+     ],
+     outputs=[
+         gr.JSON(label="Audio Understanding Summary"),
+         gr.Textbox(label="Reasoning Answer")
+     ],
+     title="ALM Prototype – Hugging Face Edition",
+     description="Upload audio → detect speech, sounds → ask any question → get AI reasoning."
+ )
+
+ ui.launch()
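
Whisper plus FLAN-T5 inference can take tens of seconds per request, so one small tweak worth considering on Spaces (a sketch, not part of this commit) is enabling Gradio's request queue before launching:

    # queue requests so long-running inference does not hit connection timeouts
    ui.queue().launch()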
reasoning.py ADDED
@@ -0,0 +1,18 @@
+ from transformers import pipeline
+
+ # Load lightweight reasoning model
+ reasoner = pipeline("text2text-generation", model="google/flan-t5-large")
+
+ def generate_reasoning(summary, question):
+     prompt = f"""
+ Audio Summary:
+ Speech: {summary['transcription']}
+ Main Sound Event: {summary['sound_event']}
+ Emotion: {summary['emotion']}
+ Speakers: {summary['speakers']}
+
+ Question: {question}
+ Provide a detailed reasoning-based answer using the audio cues.
+ """
+     result = reasoner(prompt, max_new_tokens=200)[0]["generated_text"]
+     return result
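
generate_reasoning depends only on the four summary keys used in the prompt, so it can be exercised with a hand-built dict; a minimal sketch with made-up values:

    from reasoning import generate_reasoning

    # hypothetical summary matching the schema produced by analyze_audio
    fake_summary = {
        "transcription": "Hello, is anyone there?",
        "sound_event": "Speech",
        "emotion": "Neutral",
        "speakers": "Not available in HF-free version",
    }
    print(generate_reasoning(fake_summary, "Does the speaker sound distressed?"))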
requirements.txt ADDED
@@ -0,0 +1,10 @@
+ gradio
+ git+https://github.com/openai/whisper.git
+ torch
+ torchaudio
+ tensorflow
+ tensorflow_hub
+ librosa
+ transformers
+ accelerate
+ sentencepiece