Rivalcoder committed on
Commit 5a416b3 · 1 Parent(s): 9d97df3
Add Files
- alm_pipeline.py +50 -0
- app.py +29 -0
- reasoning.py +18 -0
- requirements.txt +10 -0
alm_pipeline.py
ADDED
@@ -0,0 +1,50 @@
import csv

import whisper
import librosa
import numpy as np
import tensorflow_hub as hub

# Load ASR
asr_model = whisper.load_model("small")

# Load YAMNet for sound classification
yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

# YAMNet ships its class labels as a CSV alongside the model;
# build an index -> display name lookup from it
class_map_path = yamnet.class_map_path().numpy().decode("utf-8")
with open(class_map_path) as f:
    class_names = [row["display_name"] for row in csv.DictReader(f)]

# Simple Emotion Estimator (from YAMNet embedding)
def estimate_emotion(activation):
    mean_val = activation.mean()
    if mean_val > 0.3:
        return "Happy / Excited"
    elif mean_val < -0.3:
        return "Sad / Depressed"
    return "Neutral"


def speech_to_text(audio):
    result = asr_model.transcribe(audio)
    return result["text"]


def detect_sound(audio):
    # YAMNet expects a mono 16 kHz float32 waveform of shape (num_samples,)
    waveform, sr = librosa.load(audio, sr=16000)
    scores, embeddings, _ = yamnet(waveform)
    mean_scores = np.mean(scores.numpy(), axis=0)
    top_idx = np.argmax(mean_scores)
    return class_names[top_idx], mean_scores.max()


def analyze_audio(audio_file):
    summary = {}

    summary["transcription"] = speech_to_text(audio_file)

    event, confidence = detect_sound(audio_file)
    summary["sound_event"] = event
    summary["sound_confidence"] = float(confidence)

    # Placeholder: estimate_emotion() is not wired into this version
    summary["emotion"] = "Neutral (approx)"

    summary["speakers"] = "Not available in HF-free version"

    return summary
app.py
ADDED
@@ -0,0 +1,29 @@
import gradio as gr
from alm_pipeline import analyze_audio
from reasoning import generate_reasoning

def full_pipeline(audio, question):
    if audio is None:
        return "No audio uploaded", ""

    summary = analyze_audio(audio)
    answer = generate_reasoning(summary, question)

    return summary, answer


ui = gr.Interface(
    fn=full_pipeline,
    inputs=[
        gr.Audio(sources=["upload"], type="filepath"),
        gr.Textbox(label="Ask a Question About the Audio")
    ],
    outputs=[
        gr.JSON(label="Audio Understanding Summary"),
        gr.Textbox(label="Reasoning Answer")
    ],
    title="ALM Prototype – Hugging Face Edition",
    description="Upload audio → detect speech, sounds → ask any question → get AI reasoning."
)

ui.launch()
reasoning.py
ADDED
@@ -0,0 +1,18 @@
from transformers import pipeline

# Load lightweight reasoning model
reasoner = pipeline("text2text-generation", model="google/flan-t5-large")

def generate_reasoning(summary, question):
    prompt = f"""
Audio Summary:
Speech: {summary['transcription']}
Main Sound Event: {summary['sound_event']}
Emotion: {summary['emotion']}
Speakers: {summary['speakers']}

Question: {question}
Provide a detailed reasoning-based answer using the audio cues.
"""
    result = reasoner(prompt, max_length=200)[0]["generated_text"]
    return result
requirements.txt
ADDED
@@ -0,0 +1,10 @@
gradio
git+https://github.com/openai/whisper.git
torch
torchaudio
tensorflow
tensorflow_hub
librosa
transformers
accelerate
sentencepiece