--- |
|
|
pipeline_tag: audio-classification |
|
|
--- |
|
|
This model is an ONNX version of a YAMNet-based classifier trained to recognize environmental and human-made sounds such as speech, music, silence, and barking.
|
|
|
|
|
- Format: ONNX (`.onnx`)
- Input: float32 mono audio, sampled at 16,000 Hz
- Output: per-class score matrix of shape `[num_frames, num_classes]`
- Classes: see the mapping in `yamnet_class_map.csv`
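A minimal sketch of single-shot inference on a pre-recorded buffer (the file name `yamnet.onnx` is an assumption, matching the real-time example below; substitute your own 16 kHz mono samples for the silent buffer):

```python
import numpy as np
import onnxruntime

# Load the model with the CPU execution provider
session = onnxruntime.InferenceSession("yamnet.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name

# One second of 16 kHz mono audio (silence here; replace with real samples)
waveform = np.zeros(16000, dtype=np.float32)

scores = session.run(None, {input_name: waveform})[0]  # shape: [num_frames, num_classes]
top_class = int(scores.mean(axis=0).argmax())          # highest-scoring class ID
print(top_class)  # Look the ID up in yamnet_class_map.csv for the class name
```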
|
|
|
|
|
Usage example
|
|
```python
import csv
import threading
import time
from collections import Counter, deque

import numpy as np
import onnxruntime
import scipy.signal
import sounddevice as sd

# Path to the ONNX model file
MODEL_PATH = "./yamnet.onnx"

# Create an inference session with ONNX Runtime using the CPU provider
session = onnxruntime.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])

# Name of the model's input node
input_name = session.get_inputs()[0].name


def load_class_map(csv_path="yamnet_class_map.csv"):
    """Load class names from a CSV file into a dictionary mapping class IDs to names."""
    class_map = {}
    with open(csv_path, newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header row
        for row in reader:
            class_id = int(row[0])
            class_name = row[2]  # The display name is in the third column
            class_map[class_id] = class_name
    return class_map


# Load the class mapping from CSV
class_map = load_class_map()

# Audio buffer settings (1.5 seconds at a 16 kHz sample rate)
TARGET_SR = 16000
BUFFER_SIZE = int(TARGET_SR * 1.5)
# Circular buffer storing the most recent audio samples
audio_buffer = deque(maxlen=BUFFER_SIZE)

# Queues for storing and consolidating detection results
detections_queue = deque(maxlen=3)      # Recent raw detections
consolidated_results = deque(maxlen=3)  # Consolidated (majority-vote) results
last_printed_result = None              # Last result printed to the console
last_inference_time = 0                 # Timestamp of the last inference

# Configuration for sleep mode (temporarily pausing inference)
sleep_triggers = {"music", "speech", "silence"}  # Classes that trigger sleep mode
sleep_duration = 3.0        # How long to sleep after detecting a trigger class
same_class_count = 0        # Counter for consecutive same-class detections
last_detected_class = None  # Most recently detected class
is_sleeping_until = 0       # Timestamp until which inference is paused


def resample_if_needed(audio, original_sr, target_sr=TARGET_SR):
    """Resample audio to the target sample rate if needed."""
    if original_sr != target_sr:
        audio = scipy.signal.resample_poly(audio, target_sr, int(original_sr))
    return audio


def run_inference(audio_chunk):
    """Run the ONNX model on an audio chunk and process the results."""
    global last_printed_result, is_sleeping_until
    global same_class_count, last_detected_class

    try:
        # Run the ONNX model
        outputs = session.run(None, {input_name: audio_chunk.astype(np.float32)})
        scores = outputs[0]
        mean_scores = np.mean(scores, axis=0)  # Average scores across frames

        # Get the class with the highest score
        class_id = int(np.argmax(mean_scores))
        confidence = mean_scores[class_id]
        class_name = class_map.get(class_id, f"Class {class_id}")

        # Track consecutive detections of the same class
        if class_name == last_detected_class:
            same_class_count += 1
        else:
            same_class_count = 1
            last_detected_class = class_name

        # Enter sleep mode if a trigger class is detected several times in a row.
        # The comparison is case-insensitive because YAMNet display names are
        # capitalized ("Speech", "Music", "Silence").
        if same_class_count >= 3 and class_name.lower() in sleep_triggers:
            is_sleeping_until = time.time() + sleep_duration
            print(f"Pausing inference for {sleep_duration} seconds (detected: {class_name})")
            same_class_count = 0

        # Add the detection to the queue
        detections_queue.append((class_name, confidence))

        # When the queue is full, consolidate its results
        if len(detections_queue) == detections_queue.maxlen:
            most_common_name, _ = Counter(d[0] for d in detections_queue).most_common(1)[0]
            relevant_conf = [conf for name, conf in detections_queue if name == most_common_name]
            avg_conf = np.mean(relevant_conf) * 100
            if avg_conf > 20:  # Only keep results with >20% confidence
                consolidated_results.append((most_common_name, avg_conf))
            detections_queue.clear()

        # When enough consolidated results have accumulated, print the final result
        if len(consolidated_results) == consolidated_results.maxlen:
            names = [name for name, _ in consolidated_results]
            most_common_name, _ = Counter(names).most_common(1)[0]
            avg_conf = np.mean([c for n, c in consolidated_results if n == most_common_name])
            msg = f"Consolidated result: {most_common_name} with average confidence {avg_conf:.2f}%"
            if msg != last_printed_result:
                print(msg)
                last_printed_result = msg

    except Exception as e:
        print(f"ONNX error: {e}")


def audio_callback(indata, frames, time_info, status):
    """Callback for the audio input stream."""
    global last_inference_time

    if status:
        print(f"Status: {status}")

    # Take the mono channel and resample to 16 kHz if needed
    audio = indata[:, 0]
    audio = resample_if_needed(audio, sd.default.samplerate)

    # Normalize the audio
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Append the audio to the circular buffer
    audio_buffer.extend(audio)

    # Run inference once the buffer is full and we are not in sleep mode
    if len(audio_buffer) >= BUFFER_SIZE:
        now = time.time()

        if now < is_sleeping_until:
            return

        # Throttle inference to at most once per second
        if now - last_inference_time > 1.0:
            last_inference_time = now
            audio_chunk = np.array(audio_buffer)
            # Run inference in a separate thread to keep the audio callback responsive
            threading.Thread(target=run_inference, args=(audio_chunk,), daemon=True).start()


def main():
    """Start audio streaming and processing."""
    print("Starting real-time listening (Ctrl+C to stop)...")

    # List the available input devices
    devices = sd.query_devices()
    input_devices = [i for i, d in enumerate(devices) if d["max_input_channels"] > 0]

    print("\nAvailable input devices:")
    for i in input_devices:
        print(f"{i}: {devices[i]['name']}")

    # Let the user select an input device
    device_id = int(input("Select input device ID: "))
    sd.default.device = (device_id, None)

    # Configure the audio settings
    samplerate = int(sd.query_devices(device_id, "input")["default_samplerate"])
    sd.default.samplerate = samplerate
    sd.default.channels = 1  # Mono audio

    print("🎧 Listening...")

    # Start the audio stream with the callback
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=samplerate):
        try:
            while True:
                time.sleep(0.1)  # Keep the main thread alive
        except KeyboardInterrupt:
            print("\nStopping listener...")


if __name__ == "__main__":
    main()
```
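The real-time example depends on the `sounddevice`, `numpy`, `scipy`, and `onnxruntime` packages, and expects `yamnet_class_map.csv` next to the model file. It lists the available input devices, prompts for a device ID, and then prints consolidated predictions built from majority votes over recent detections.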
|
|
|
|
|
|