Spaces:

Rivalcoder
/

Lite

Sleeping

Lite / alm_pipeline.py

Rivalcoder

Add Files -Update New

ba773e9 15 days ago

2.76 kB

	# Standard-library imports come first: `os` and `warnings` are needed to
	# configure the process before the heavy ML libraries are imported.
	import csv
	import os
	import warnings

	# Reduce TensorFlow log noise and avoid attempting GPU / oneDNN on CPU-only envs.
	# NOTE: These variables are set *before* `import tensorflow` below, because they
	# only influence TensorFlow's startup behaviour if they are already present in
	# the environment when the library is imported. `setdefault` keeps any value the
	# caller exported explicitly. Some platforms may still print a few startup logs.
	os.environ.setdefault("TF_CPP_MIN_LOG_LEVEL", "2")  # hide INFO/WARNING logs
	os.environ.setdefault("TF_ENABLE_ONEDNN_OPTS", "0")  # disable oneDNN custom ops
	os.environ.setdefault("CUDA_VISIBLE_DEVICES", "-1")  # don't try to use CUDA GPUs

	# Suppress specific library warnings that are expected in this setup. The filters
	# are installed before the third-party imports so that warnings raised while
	# those packages are being imported are covered as well.
	warnings.filterwarnings(
	    "ignore",
	    category=UserWarning,
	    message="FP16 is not supported on CPU; using FP32 instead",
	)
	warnings.filterwarnings(
	    "ignore",
	    category=FutureWarning,
	    module="librosa",
	)

	# Third-party imports deliberately follow the configuration above (hence the
	# E402 suppressions). Their relative order is unchanged.
	import whisper  # noqa: E402
	import librosa  # noqa: E402
	import numpy as np  # noqa: E402
	import tensorflow as tf  # noqa: E402
	import tensorflow_hub as hub  # noqa: E402

	# Speech recognition: Whisper "small" checkpoint, loaded once at import time.
	asr_model = whisper.load_model("small")

	# Sound classification: YAMNet from TensorFlow Hub, also loaded once.
	yamnet = hub.load("https://tfhub.dev/google/yamnet/1")

	# The model reports where its class-map CSV lives. The value may come back as
	# raw bytes, in which case it is decoded to a str path before being opened.
	class_map_path = yamnet.class_map_path().numpy()
	class_map_path = (
	    class_map_path.decode("utf-8")
	    if isinstance(class_map_path, bytes)
	    else class_map_path
	)

	# Collect the "display_name" column of the class-map CSV, in file order, so a
	# class index can be turned into a human-readable label.
	with tf.io.gfile.GFile(class_map_path) as f:
	    reader = csv.DictReader(f)
	    yamnet_labels = [row["display_name"] for row in reader]

	# Simple Emotion Estimator (from YAMNet embedding)
	def estimate_emotion(activation, *, threshold=0.3):
	    """Map the mean of an activation array to a coarse emotion label.

	    This is a simple heuristic: only the arithmetic mean of all values in
	    ``activation`` is inspected.

	    Args:
	        activation: Numeric array-like. Anything ``np.asarray`` can convert is
	            accepted (NumPy arrays, nested lists, ...), not just objects that
	            happen to expose a ``.mean()`` method.
	        threshold: Distance from zero the mean must strictly exceed for a
	            non-neutral label. Defaults to 0.3.

	    Returns:
	        ``"Happy / Excited"`` if the mean is greater than ``threshold``,
	        ``"Sad / Depressed"`` if it is less than ``-threshold``, and
	        ``"Neutral"`` otherwise (including for empty input).
	    """
	    values = np.asarray(activation)
	    # The mean of an empty array is NaN and triggers a RuntimeWarning; treat
	    # "no data" as neutral explicitly instead.
	    if values.size == 0:
	        return "Neutral"

	    mean_val = values.mean()
	    if mean_val > threshold:
	        return "Happy / Excited"
	    if mean_val < -threshold:
	        return "Sad / Depressed"
	    return "Neutral"


	def speech_to_text(audio):
	    """Transcribe ``audio`` with the module-level Whisper model.

	    ``fp16=False`` is passed explicitly so the model runs in FP32, which avoids
	    the FP16 warning on CPU and keeps the call compatible with CPU-only hosts.

	    Args:
	        audio: Input forwarded unchanged to ``asr_model.transcribe``.

	    Returns:
	        The ``"text"`` entry of the transcription result.
	    """
	    return asr_model.transcribe(audio, fp16=False)["text"]


	def detect_sound(audio):
	    """Classify the dominant sound event in ``audio`` using YAMNet.

	    Args:
	        audio: Input forwarded to ``librosa.load``.

	    Returns:
	        A ``(label, confidence)`` tuple. ``label`` is the display name of the
	        class whose score, averaged over the first axis of YAMNet's score
	        output, is highest, or ``"Unknown"`` if that index falls outside the
	        loaded label list. ``confidence`` is that maximum averaged score as a
	        float.
	    """
	    # YAMNet is fed a mono, 16 kHz, 1-D float32 waveform.
	    samples, _sample_rate = librosa.load(audio, sr=16000, mono=True)
	    samples = samples.astype(np.float32)

	    # Only the first of the model's three outputs (the scores) is used here.
	    raw_scores, _embeddings, _ = yamnet(samples)
	    avg_scores = np.mean(raw_scores.numpy(), axis=0)
	    best = int(np.argmax(avg_scores))

	    # Translate the winning class index into its human-readable name.
	    if 0 <= best < len(yamnet_labels):
	        label = yamnet_labels[best]
	    else:
	        label = "Unknown"
	    return label, float(avg_scores.max())


	def analyze_audio(audio_file):
	    """Run transcription and sound-event detection on a single audio file.

	    Args:
	        audio_file: Audio input passed to both ``speech_to_text`` and
	            ``detect_sound``.

	    Returns:
	        A dict with the keys ``"transcription"``, ``"sound_event"``,
	        ``"sound_confidence"``, ``"emotion"`` and ``"speakers"``. The last two
	        are fixed placeholder strings; no emotion or speaker analysis is run.
	    """
	    # Transcription runs first, then sound classification.
	    transcript = speech_to_text(audio_file)
	    event, confidence = detect_sound(audio_file)

	    return {
	        "transcription": transcript,
	        "sound_event": event,
	        "sound_confidence": float(confidence),
	        "emotion": "Neutral (approx)",  # placeholder value, not computed
	        "speakers": "Not available in HF-free version",
	    }