Spaces:
Sleeping
Sleeping
| import numpy as np | |
| import librosa | |
| import sounddevice as sd | |
| import queue | |
| import threading | |
| import time | |
| import argparse | |
| import sys | |
| from collections import deque | |
# Sample rate (Hz) used both when loading files and when opening the mic stream.
SAMPLE_RATE = 22050
# Length of one analysis chunk, in seconds.
CHUNK_DURATION = 2
# Number of samples in one analysis chunk.
CHUNK_SAMPLES = CHUNK_DURATION * SAMPLE_RATE
# RMS frames below this level are counted as silence for the pause ratio.
SILENCE_THRESHOLD = 0.01
# How many recent chunk scores are averaged into the reported (smoothed) score.
SCORE_WINDOW = 5

# Rolling window of the most recent raw chunk scores (shared module state).
score_history = deque(maxlen=SCORE_WINDOW)
# Filled by audio_callback, drained by analyze_mic.
audio_queue = queue.Queue()
def extract_features(audio: np.ndarray, sr: float = SAMPLE_RATE) -> "dict | None":
    """Extract the acoustic features used for confidence scoring.

    Args:
        audio: Audio samples to analyse (callers in this file pass mono float data).
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        A dict with the keys ``rms_mean``, ``rms_std``, ``zcr_mean``,
        ``pitch_mean``, ``pitch_std``, ``speech_rate`` and ``pause_ratio``,
        or ``None`` when ``audio`` holds fewer than 512 samples.
    """
    if len(audio) < 512:
        return None

    features = {}

    # RMS energy - volume/projection
    rms = librosa.feature.rms(y=audio)[0]
    features["rms_mean"] = float(np.mean(rms))
    features["rms_std"] = float(np.std(rms))

    # Zero crossing rate - voice steadiness
    zcr = librosa.feature.zero_crossing_rate(y=audio)[0]
    features["zcr_mean"] = float(np.mean(zcr))

    # Pitch (F0) - monotone vs varied pitch.
    # Only bins whose magnitude exceeds the median magnitude are kept.
    pitches, magnitudes = librosa.piptrack(y=audio, sr=sr)
    pitch_values = pitches[magnitudes > np.median(magnitudes)]
    if len(pitch_values) > 0:
        features["pitch_mean"] = float(np.mean(pitch_values))
        features["pitch_std"] = float(np.std(pitch_values))
    else:
        features["pitch_mean"] = 0.0
        features["pitch_std"] = 0.0

    # Speech rate proxy - detected onsets per second.
    # Divide by the clip's actual duration rather than the global
    # CHUNK_DURATION so chunks of any length get a correct rate; for a
    # standard 2-second chunk the result is unchanged.
    onset_frames = librosa.onset.onset_detect(y=audio, sr=sr)
    duration_seconds = len(audio) / sr
    features["speech_rate"] = len(onset_frames) / duration_seconds

    # Pause detection - ratio of silent frames
    silent_frames = np.sum(rms < SILENCE_THRESHOLD)
    features["pause_ratio"] = float(silent_frames / len(rms))

    return features
def compute_audio_score(features: dict) -> dict:
    """Turn a feature dict from ``extract_features`` into a 0-100 score.

    Four components are worth up to 25 points each: volume, pitch
    variation, speech rate and pauses. The raw total is appended to the
    module-level ``score_history`` and the reported ``score`` is the mean
    of that rolling window.

    Args:
        features: Output of ``extract_features``; ``None`` means there was
            no usable audio.

    Returns:
        A dict with ``score`` (smoothed), ``raw_score``, ``breakdown``
        (points per component), ``tips`` (never empty) and ``features``
        (rounded inputs). For ``None`` input the same keys are returned
        with zero/empty values and ``score_history`` is left untouched.
    """
    if features is None:
        # Same key set as the normal result so consumers can index it
        # uniformly.
        return {
            "score": 0,
            "raw_score": 0,
            "breakdown": {},
            "tips": ["No audio detected"],
            "features": {},
        }

    tips = []
    breakdown = {}

    # 1. Volume/Energy (25 pts)
    rms = features["rms_mean"]
    if rms < 0.02:
        vol_score = 10
        tips.append("Speak louder — your voice is too soft")
    elif rms < 0.05:
        vol_score = 18
        tips.append("Try projecting your voice more confidently")
    elif rms > 0.3:
        vol_score = 18
        tips.append("Slightly lower your volume for a calmer tone")
    else:
        vol_score = 25
    breakdown["volume"] = vol_score

    # 2. Pitch variation (25 pts) - monotone = low confidence
    pitch_std = features["pitch_std"]
    if pitch_std < 10:
        pitch_score = 10
        tips.append("Avoid monotone — vary your pitch to sound engaging")
    elif pitch_std < 30:
        pitch_score = 18
    else:
        pitch_score = 25
    breakdown["pitch_variation"] = pitch_score

    # 3. Speech rate (25 pts)
    rate = features["speech_rate"]
    if rate < 1.5:
        rate_score = 12
        tips.append("You're speaking too slowly — pick up the pace slightly")
    elif rate > 6:
        rate_score = 12
        tips.append("Slow down — speaking too fast signals nervousness")
    else:
        rate_score = 25
    breakdown["speech_rate"] = rate_score

    # 4. Pause ratio (25 pts)
    pause = features["pause_ratio"]
    if pause > 0.6:
        pause_score = 10
        tips.append("Too many pauses - try to maintain a steady flow")
    elif pause > 0.4:
        pause_score = 18
    else:
        pause_score = 25
    breakdown["pauses"] = pause_score

    # Raw score is the sum of the four components; the reported score is
    # smoothed over the last SCORE_WINDOW chunks.
    score = sum(breakdown.values())
    score_history.append(score)
    smoothed = round(float(np.mean(score_history)), 1)

    if not tips:
        tips.append("Voice confidence is good - keep it up!")

    return {
        "score": smoothed,
        "raw_score": score,
        "breakdown": breakdown,
        "tips": tips,
        "features": {
            "rms": round(rms, 4),
            "pitch_std": round(pitch_std, 2),
            "speech_rate": round(rate, 2),
            "pause_ratio": round(pause, 2),
        },
    }
def get_label(score: float) -> str:
    """Map a confidence score to its descriptive label.

    Scores of 75 and above are "Confident", 50 and above "Moderate",
    anything else "Needs Improvement".
    """
    # Ordered from the highest threshold down; the first match wins.
    thresholds = ((75, "Confident"), (50, "Moderate"))
    for minimum, label in thresholds:
        if score >= minimum:
            return label
    return "Needs Improvement"
def analyze_file(path: str):
    """Score an audio file chunk by chunk and print a report.

    The file is loaded as mono at ``SAMPLE_RATE`` and split into
    consecutive chunks of ``CHUNK_SAMPLES`` samples; a trailing partial
    chunk is ignored. Each chunk's smoothed score and breakdown are
    printed, followed by the mean of the per-chunk scores.

    Args:
        path: Path of the file to analyse.

    Exits the process with status 1 if the file cannot be loaded or holds
    less than one full chunk.
    """
    print(f"\nLoading: {path}")
    try:
        audio, sr = librosa.load(path, sr=SAMPLE_RATE, mono=True)
    except Exception as e:
        print(f"Error loading file: {e}")
        sys.exit(1)

    total_chunks = len(audio) // CHUNK_SAMPLES
    if total_chunks == 0:
        print("Audio too short (need at least 2 seconds)")
        sys.exit(1)

    print(f"Duration: {len(audio)/sr:.1f}s | Chunks: {total_chunks}\n")

    # Start from an empty smoothing window so scores left over from an
    # earlier analysis in the same process cannot leak into this file's
    # first chunks.
    score_history.clear()

    all_scores = []
    for i in range(total_chunks):
        chunk = audio[i * CHUNK_SAMPLES : (i + 1) * CHUNK_SAMPLES]
        features = extract_features(chunk, sr)
        result = compute_audio_score(features)
        all_scores.append(result["score"])

        print(f"[Chunk {i+1}/{total_chunks}]")
        print(f" Score : {result['score']} — {get_label(result['score'])}")
        print(f" Volume : {result['breakdown'].get('volume', 0)}/25")
        print(f" Pitch Var : {result['breakdown'].get('pitch_variation', 0)}/25")
        print(f" Rate : {result['breakdown'].get('speech_rate', 0)}/25")
        print(f" Pauses : {result['breakdown'].get('pauses', 0)}/25")
        print(f" Tip : {result['tips'][0]}")
        print()

    final = round(float(np.mean(all_scores)), 1)
    print("=" * 45)
    print(f"FINAL AUDIO CONFIDENCE SCORE: {final}/100")
    print(f"Overall: {get_label(final)}")
    print("=" * 45)
def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: queue a copy of each captured audio block.

    Passed as ``callback`` to ``sd.InputStream`` in ``analyze_mic``.

    Args:
        indata: Captured samples for this block.
        frames: Number of frames in ``indata`` (unused).
        time_info: Stream timing information (unused).
        status: Stream status flags for this block; truthy when the
            backend reported a problem such as an input overflow.
    """
    if status:
        # Report dropped/garbled audio instead of silently scoring it.
        print(status, file=sys.stderr)
    # Copy before queueing so the consumer gets its own array, not the
    # stream's buffer.
    audio_queue.put(indata.copy())
def analyze_mic():
    """Score live microphone input until interrupted with Ctrl+C.

    Audio blocks delivered through ``audio_queue`` are accumulated until a
    full chunk of ``CHUNK_SAMPLES`` samples is available; each chunk is
    scored and a single status line is refreshed in place. On Ctrl+C the
    average of every chunk score from this session is printed.
    """
    print("\nMic mode started. Press Ctrl+C to stop.\n")

    # Start from an empty smoothing window so an earlier analysis in the
    # same process does not influence this session's first scores.
    score_history.clear()
    # score_history only keeps the last SCORE_WINDOW entries, so track the
    # whole session separately for the final average.
    session_scores = []
    buffer = np.array([], dtype=np.float32)

    with sd.InputStream(
        samplerate=SAMPLE_RATE,
        channels=1,
        dtype="float32",
        callback=audio_callback,
    ):
        try:
            while True:
                chunk_data = audio_queue.get()
                buffer = np.append(buffer, chunk_data.flatten())
                if len(buffer) < CHUNK_SAMPLES:
                    continue

                # Peel one full chunk off the front; keep the remainder.
                chunk = buffer[:CHUNK_SAMPLES]
                buffer = buffer[CHUNK_SAMPLES:]

                features = extract_features(chunk)
                result = compute_audio_score(features)
                session_scores.append(result["score"])
                print(f"\rScore: {result['score']:5.1f} | {get_label(result['score']):<20} | Tip: {result['tips'][0][:50]}", end="", flush=True)
        except KeyboardInterrupt:
            print("\n\nSession ended.")
            if session_scores:
                final = round(float(np.mean(session_scores)), 1)
                print(f"Session Avg Score: {final}/100 — {get_label(final)}")
def get_latest_result() -> dict:
    """Return the current smoothed audio score.

    Called by fusion_scoring.py or Streamlit to get current audio score.
    With at least one score recorded, the result contains only ``score``
    (the mean of ``score_history`` rounded to one decimal). Before any
    score exists, a placeholder with ``score``, ``tips`` and ``breakdown``
    is returned instead.
    """
    if score_history:
        current = round(float(np.mean(score_history)), 1)
        return {"score": current}
    return {"score": 0, "tips": ["No audio data yet"], "breakdown": {}}
def process_frame_audio(audio_chunk: np.ndarray) -> dict:
    """Score a single audio chunk for real-time integration.

    Called per-frame from main.py. Extracts features from ``audio_chunk``
    and returns the result of ``compute_audio_score`` for them.
    """
    return compute_audio_score(extract_features(audio_chunk))
if __name__ == "__main__":
    # Command-line entry point: --mic or --file selects the mode directly;
    # with neither flag an interactive menu is shown.
    cli = argparse.ArgumentParser(description="Audio Confidence Analyzer")
    cli.add_argument("--mic", action="store_true", help="Live mic analysis")
    cli.add_argument("--file", type=str, help="Path to audio/video file")
    options = cli.parse_args()

    if options.mic:
        analyze_mic()
    elif options.file:
        analyze_file(options.file)
    else:
        for menu_line in ("\nSelect mode:", "1. Live Microphone", "2. Audio File"):
            print(menu_line)
        selection = input("Enter choice (1/2): ").strip()

        if selection == "1":
            analyze_mic()
        elif selection == "2":
            analyze_file(input("Enter file path: ").strip())
        else:
            print("Invalid choice")
            sys.exit(1)