--- |
|
|
pipeline_tag: audio-classification |
|
|
--- |
|
|
This model is an ONNX version of a YAMNet-based classifier trained to recognize environmental and human-made sounds such as speech, music, silence, and barking.
|
|
|
|
|
- Format: ONNX (`.onnx`)
- Input: float32 mono audio, sampled at 16,000 Hz
- Output: per-class score matrix of shape `[num_frames, num_classes]`
- Classes: see the mapping in `yamnet_class_map.csv`
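A minimal sketch of single-shot inference on a pre-recorded buffer (the file name `yamnet.onnx` is an assumption, matching the real-time example below; substitute your own 16 kHz mono samples for the silent buffer):

```python
import numpy as np
import onnxruntime

# Load the model with the CPU execution provider
session = onnxruntime.InferenceSession("yamnet.onnx", providers=["CPUExecutionProvider"])
input_name = session.get_inputs()[0].name

# One second of 16 kHz mono audio (silence here; replace with real samples)
waveform = np.zeros(16000, dtype=np.float32)

scores = session.run(None, {input_name: waveform})[0]  # shape: [num_frames, num_classes]
top_class = int(scores.mean(axis=0).argmax())          # highest-scoring class ID
print(top_class)  # Look the ID up in yamnet_class_map.csv for the class name
```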
|
|
|
|
|
Usage example
|
|
```python
import csv
import threading
import time
from collections import Counter, deque

import numpy as np
import onnxruntime
import scipy.signal
import sounddevice as sd

# Path to the ONNX model file
MODEL_PATH = "./yamnet.onnx"

# Create an inference session with ONNX Runtime using the CPU provider
session = onnxruntime.InferenceSession(MODEL_PATH, providers=["CPUExecutionProvider"])

# Name of the model's input node
input_name = session.get_inputs()[0].name


def load_class_map(csv_path="yamnet_class_map.csv"):
    """Load class names from a CSV file into a dictionary mapping class IDs to names."""
    class_map = {}
    with open(csv_path, newline="") as csvfile:
        reader = csv.reader(csvfile)
        next(reader)  # Skip the header row
        for row in reader:
            class_id = int(row[0])
            class_name = row[2]  # The display name is in the third column
            class_map[class_id] = class_name
    return class_map


# Load the class mapping from CSV
class_map = load_class_map()

# Audio buffer settings (1.5 seconds at a 16 kHz sample rate)
TARGET_SR = 16000
BUFFER_SIZE = int(TARGET_SR * 1.5)
# Circular buffer storing the most recent audio samples
audio_buffer = deque(maxlen=BUFFER_SIZE)

# Queues for storing and consolidating detection results
detections_queue = deque(maxlen=3)      # Recent raw detections
consolidated_results = deque(maxlen=3)  # Consolidated (majority-vote) results
last_printed_result = None              # Last result printed to the console
last_inference_time = 0                 # Timestamp of the last inference

# Configuration for sleep mode (temporarily pausing inference)
sleep_triggers = {"music", "speech", "silence"}  # Classes that trigger sleep mode
sleep_duration = 3.0        # How long to sleep after detecting a trigger class
same_class_count = 0        # Counter for consecutive same-class detections
last_detected_class = None  # Most recently detected class
is_sleeping_until = 0       # Timestamp until which inference is paused


def resample_if_needed(audio, original_sr, target_sr=TARGET_SR):
    """Resample audio to the target sample rate if needed."""
    if original_sr != target_sr:
        audio = scipy.signal.resample_poly(audio, target_sr, int(original_sr))
    return audio


def run_inference(audio_chunk):
    """Run the ONNX model on an audio chunk and process the results."""
    global last_printed_result, is_sleeping_until
    global same_class_count, last_detected_class

    try:
        # Run the ONNX model
        outputs = session.run(None, {input_name: audio_chunk.astype(np.float32)})
        scores = outputs[0]
        mean_scores = np.mean(scores, axis=0)  # Average scores across frames

        # Get the class with the highest score
        class_id = int(np.argmax(mean_scores))
        confidence = mean_scores[class_id]
        class_name = class_map.get(class_id, f"Class {class_id}")

        # Track consecutive detections of the same class
        if class_name == last_detected_class:
            same_class_count += 1
        else:
            same_class_count = 1
            last_detected_class = class_name

        # Enter sleep mode if a trigger class is detected several times in a row.
        # The comparison is case-insensitive because YAMNet display names are
        # capitalized ("Speech", "Music", "Silence").
        if same_class_count >= 3 and class_name.lower() in sleep_triggers:
            is_sleeping_until = time.time() + sleep_duration
            print(f"Pausing inference for {sleep_duration} seconds (detected: {class_name})")
            same_class_count = 0

        # Add the detection to the queue
        detections_queue.append((class_name, confidence))

        # When the queue is full, consolidate its results
        if len(detections_queue) == detections_queue.maxlen:
            most_common_name, _ = Counter(d[0] for d in detections_queue).most_common(1)[0]
            relevant_conf = [conf for name, conf in detections_queue if name == most_common_name]
            avg_conf = np.mean(relevant_conf) * 100
            if avg_conf > 20:  # Only keep results with >20% confidence
                consolidated_results.append((most_common_name, avg_conf))
            detections_queue.clear()

        # When enough consolidated results have accumulated, print the final result
        if len(consolidated_results) == consolidated_results.maxlen:
            names = [name for name, _ in consolidated_results]
            most_common_name, _ = Counter(names).most_common(1)[0]
            avg_conf = np.mean([c for n, c in consolidated_results if n == most_common_name])
            msg = f"Consolidated result: {most_common_name} with average confidence {avg_conf:.2f}%"
            if msg != last_printed_result:
                print(msg)
                last_printed_result = msg

    except Exception as e:
        print(f"ONNX error: {e}")


def audio_callback(indata, frames, time_info, status):
    """Callback for the audio input stream."""
    global last_inference_time

    if status:
        print(f"Status: {status}")

    # Take the mono channel and resample to 16 kHz if needed
    audio = indata[:, 0]
    audio = resample_if_needed(audio, sd.default.samplerate)

    # Normalize the audio
    max_val = np.max(np.abs(audio))
    if max_val > 0:
        audio = audio / max_val

    # Append the audio to the circular buffer
    audio_buffer.extend(audio)

    # Run inference once the buffer is full and we are not in sleep mode
    if len(audio_buffer) >= BUFFER_SIZE:
        now = time.time()

        if now < is_sleeping_until:
            return

        # Throttle inference to at most once per second
        if now - last_inference_time > 1.0:
            last_inference_time = now
            audio_chunk = np.array(audio_buffer)
            # Run inference in a separate thread to keep the audio callback responsive
            threading.Thread(target=run_inference, args=(audio_chunk,), daemon=True).start()


def main():
    """Start audio streaming and processing."""
    print("Starting real-time listening (Ctrl+C to stop)...")

    # List the available input devices
    devices = sd.query_devices()
    input_devices = [i for i, d in enumerate(devices) if d["max_input_channels"] > 0]

    print("\nAvailable input devices:")
    for i in input_devices:
        print(f"{i}: {devices[i]['name']}")

    # Let the user select an input device
    device_id = int(input("Select input device ID: "))
    sd.default.device = (device_id, None)

    # Configure the audio settings
    samplerate = int(sd.query_devices(device_id, "input")["default_samplerate"])
    sd.default.samplerate = samplerate
    sd.default.channels = 1  # Mono audio

    print("🎧 Listening...")

    # Start the audio stream with the callback
    with sd.InputStream(callback=audio_callback, channels=1, samplerate=samplerate):
        try:
            while True:
                time.sleep(0.1)  # Keep the main thread alive
        except KeyboardInterrupt:
            print("\nStopping listener...")


if __name__ == "__main__":
    main()
```
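The real-time example depends on the `sounddevice`, `numpy`, `scipy`, and `onnxruntime` packages, and expects `yamnet_class_map.csv` next to the model file. It lists the available input devices, prompts for a device ID, and then prints consolidated predictions built from majority votes over recent detections.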
|
|
|
|
|
|