Spaces:
Sleeping
Sleeping
| """ | |
| Voice Break API - FastAPI Backend | |
| Endpoints per analisi vocale, ML classification e dataset management | |
| """ | |
| from fastapi import FastAPI, File, UploadFile, HTTPException, Form | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import JSONResponse | |
| from pydantic import BaseModel | |
| from typing import List, Dict, Optional | |
| import librosa | |
| import numpy as np | |
| import io | |
| import base64 | |
| from datetime import datetime | |
| import json | |
| import os | |
| import tempfile | |
# Application object: metadata shown in the auto-generated OpenAPI docs.
app = FastAPI(
    title="Voice Break API",
    description="API per analisi prosodica e classificazione vocale",
    version="2.0.0"
)

# CORS - allow cross-origin requests from the Lovable frontend.
# NOTE(review): allow_origins=["*"] is wide open; restrict it to the Lovable
# domain in production (as the original comment already advises).
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # in production, pin to the Lovable domain
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
| # ============================================================================ | |
| # MODELS | |
| # ============================================================================ | |
class VoiceAnalysisResult(BaseModel):
    """Response payload for the voice-analysis endpoint."""
    element: str  # dominant vocal element: "aria", "acqua", "terra" or "fuoco"
    confidence: float  # normalized score of the dominant element, 0.0-1.0
    features: Dict[str, float]  # raw prosodic features extracted from the audio
    prosody_score: Dict[str, float]  # per-element scores (normalized to sum 100)
    timestamp: str  # ISO-8601 timestamp of when the analysis ran
class BaselineCalibration(BaseModel):
    """Response payload for the baseline-calibration endpoint."""
    rms_baseline: float  # reference RMS energy of the user's "normal" voice
    pitch_baseline: float  # reference mean pitch (F0) in Hz
    duration_ms: float  # duration of the calibration recording, in milliseconds
    success: bool  # whether calibration completed successfully
    message: str  # human-readable status message (Italian, user-facing)
class TrainingRequest(BaseModel):
    """Request payload for submitting a labeled training sample.

    NOTE(review): no endpoint visible in this file consumes this model —
    /api/train-sample uses multipart Form/File parameters instead. Confirm
    it is still needed.
    """
    element: str  # target label: "aria", "acqua", "terra" or "fuoco"
    audio_base64: str  # base64-encoded audio payload
    duration_ms: float  # recording duration in milliseconds
| # ============================================================================ | |
| # ANALISI AUDIO - Core Functions | |
| # ============================================================================ | |
def extract_features(audio_data: bytes) -> Dict[str, float]:
    """Extract prosodic features from raw encoded audio bytes.

    The bytes are written to a temporary ``.webm`` file (the common browser
    recording format) so librosa's backend can sniff and decode them, then a
    set of scalar prosodic features is computed.

    Args:
        audio_data: encoded audio exactly as received from the client.

    Returns:
        Dict with keys: ``rms``, ``zcr``, ``pitch`` (0.0 when no voiced
        frames), ``spectral_centroid``, ``spectral_rolloff``, ``mfcc_mean``
        (first MFCC only), ``duration`` (seconds).

    Raises:
        HTTPException: 500 when decoding or feature extraction fails.
    """
    # Bug fix: track the path, initialized up front, so the finally-block is
    # safe even when the temp file is created but writing fails (the old code
    # could hit NameError on an unbound temp_path there).
    temp_path = None
    try:
        print(f"[DEBUG] Received audio data: {len(audio_data)} bytes")
        # Persist to disk with an extension so the decoder can sniff the format.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.webm') as tmp:
            tmp.write(audio_data)
            temp_path = tmp.name
        print(f"[DEBUG] Saved to temp file: {temp_path}")
        # Decode and resample to a fixed rate so features are comparable.
        audio_array, sr = librosa.load(temp_path, sr=22050)
        print(f"[DEBUG] Loaded audio: shape={audio_array.shape}, sr={sr}")
        # Energy and noisiness.
        rms = float(np.sqrt(np.mean(audio_array**2)))
        zcr = float(np.mean(librosa.feature.zero_crossing_rate(audio_array)))
        # Pitch (F0): mean over voiced bins only; 0.0 when nothing is voiced.
        # (Computed once instead of re-indexing pitches twice.)
        pitches, magnitudes = librosa.piptrack(y=audio_array, sr=sr)
        voiced = pitches[pitches > 0]
        pitch = float(np.mean(voiced)) if voiced.size > 0 else 0.0
        # Spectral shape.
        spectral_centroid = float(np.mean(librosa.feature.spectral_centroid(y=audio_array, sr=sr)))
        spectral_rolloff = float(np.mean(librosa.feature.spectral_rolloff(y=audio_array, sr=sr)))
        # Timbre: 13 MFCCs are computed but only the first mean is exposed.
        mfccs = librosa.feature.mfcc(y=audio_array, sr=sr, n_mfcc=13)
        mfcc_mean = [float(np.mean(mfcc)) for mfcc in mfccs]
        print(f"[DEBUG] Features extracted successfully")
        return {
            "rms": rms,
            "zcr": zcr,
            "pitch": pitch,
            "spectral_centroid": spectral_centroid,
            "spectral_rolloff": spectral_rolloff,
            "mfcc_mean": mfcc_mean[0] if mfcc_mean else 0.0,
            "duration": float(len(audio_array) / sr)
        }
    except Exception as e:
        print(f"[ERROR] Feature extraction failed: {str(e)}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Feature extraction error: {str(e)}")
    finally:
        # Best-effort cleanup of the temp file; narrowed from a bare except.
        if temp_path and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
                print(f"[DEBUG] Cleaned up temp file")
            except OSError:
                pass
def classify_voice_element(features: Dict[str, float], baseline: Optional[Dict] = None) -> Dict:
    """Classify the vocal "element" from prosodic features.

    Simplified rule-based scoring — in production the trained ML model
    replaces this. Each element accumulates points from threshold rules on
    RMS energy, pitch and zero-crossing rate; scores are normalized to sum
    to 100 and the highest-scoring element wins.

    Args:
        features: must contain "rms", "pitch" and "zcr".
        baseline: accepted for interface compatibility; the rule-based
            scorer does not use it.

    Returns:
        Dict with "element" (winner), "confidence" (0.0-1.0) and "scores"
        (per-element normalized scores).
    """
    rms = features["rms"]
    pitch = features["pitch"]
    zcr = features["zcr"]

    # (condition, points) rules per element. Dict order also fixes the
    # tie-break priority used by max() below.
    rules = {
        "aria": [  # high energy, variable pitch, medium-high ZCR
            (rms > 0.05, 30),
            (zcr > 0.1, 30),
            (pitch > 200, 40),
        ],
        "acqua": [  # fluid energy, medium-low pitch, smooth variations
            (0.02 < rms < 0.06, 40),
            (150 < pitch < 250, 35),
            (zcr < 0.12, 25),
        ],
        "terra": [  # stable energy, low pitch, low ZCR
            (rms < 0.04, 35),
            (pitch < 180, 40),
            (zcr < 0.08, 25),
        ],
        "fuoco": [  # high energy, high pitch, very high ZCR
            (rms > 0.06, 35),
            (pitch > 250, 35),
            (zcr > 0.15, 30),
        ],
    }
    scores = {
        name: float(sum(points for hit, points in checks if hit))
        for name, checks in rules.items()
    }

    # Normalize so the scores sum to 100 (skipped when nothing matched).
    total = sum(scores.values())
    if total > 0:
        scores = {name: value / total * 100 for name, value in scores.items()}

    # Dominant element; its normalized score doubles as the confidence.
    winner = max(scores, key=scores.get)
    return {
        "element": winner,
        "confidence": scores[winner] / 100.0,
        "scores": scores,
    }
| # ============================================================================ | |
| # ENDPOINTS | |
| # ============================================================================ | |
async def root():
    """Health check: report service status and the advertised endpoints.

    NOTE(review): the route decorator (likely ``@app.get("/")``) is not
    present in this file as seen — confirm the route registration.
    """
    advertised_endpoints = [
        "/api/calibrate-baseline",
        "/api/analyze-voice",
        "/api/train-sample",
        "/api/dataset/list",
        "/api/dataset/upload",
    ]
    return {
        "status": "online",
        "service": "Voice Break API",
        "version": "2.0.0",
        "endpoints": advertised_endpoints,
    }
async def calibrate_baseline(audio: UploadFile = File(...)):
    """Endpoint 1: baseline calibration.

    The user records their "normal" voice; its RMS energy and mean pitch
    become the personal reference for later analyses.

    NOTE(review): the route decorator (likely
    ``@app.post("/api/calibrate-baseline")``) is not visible in this file —
    confirm the route registration.
    """
    try:
        audio_bytes = await audio.read()
        # Extract the baseline prosodic features.
        features = extract_features(audio_bytes)
        return BaselineCalibration(
            rms_baseline=features["rms"],
            pitch_baseline=features["pitch"],
            duration_ms=features["duration"] * 1000,
            success=True,
            message="Calibrazione baseline completata con successo"
        )
    except HTTPException:
        # Bug fix: let HTTPExceptions (e.g. extract_features' 500) propagate
        # unchanged instead of re-wrapping them in a new 500 whose detail is
        # the repr of the original exception.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def analyze_voice(
    audio: UploadFile = File(...),
    baseline_rms: Optional[float] = Form(None),
    baseline_pitch: Optional[float] = Form(None)
):
    """Endpoint 2: real-time voice analysis.

    Extracts prosodic features from the uploaded audio and classifies the
    dominant vocal element, optionally relative to a personal baseline.

    NOTE(review): the route decorator (likely
    ``@app.post("/api/analyze-voice")``) is not visible in this file —
    confirm the route registration.
    """
    try:
        audio_bytes = await audio.read()
        # Extract prosodic features.
        features = extract_features(audio_bytes)
        # Build the optional baseline dict. Bug fix: compare against None so
        # a legitimate 0.0 baseline value is not silently discarded (the old
        # truthiness test treated zero as "not provided").
        baseline = None
        if baseline_rms is not None and baseline_pitch is not None:
            baseline = {
                "rms": baseline_rms,
                "pitch": baseline_pitch
            }
        # Classify the element.
        classification = classify_voice_element(features, baseline)
        return VoiceAnalysisResult(
            element=classification["element"],
            confidence=classification["confidence"],
            features=features,
            prosody_score=classification["scores"],
            timestamp=datetime.now().isoformat()
        )
    except HTTPException:
        # Bug fix: propagate HTTPExceptions (e.g. extract_features' 500)
        # instead of re-wrapping them with a nested detail message.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def train_sample(
    audio: UploadFile = File(...),
    element: str = Form(...),
    user_id: Optional[str] = Form(None)
):
    """Endpoint 3: add a labeled sample to the training dataset.

    Validates the label, extracts features as a decodability check, and
    returns a confirmation (persisting to storage is a production TODO).

    NOTE(review): the route decorator (likely
    ``@app.post("/api/train-sample")``) is not visible in this file —
    confirm the route registration.
    """
    try:
        # Validate the label.
        valid_elements = ["aria", "acqua", "terra", "fuoco"]
        if element not in valid_elements:
            raise HTTPException(status_code=400, detail=f"Elemento non valido. Usa: {valid_elements}")
        audio_bytes = await audio.read()
        # Extract features to verify the audio is decodable.
        features = extract_features(audio_bytes)
        # In production: persist to storage (S3, HF Dataset, etc.).
        # For now, just confirm receipt.
        return {
            "success": True,
            "element": element,
            "features": features,
            "message": f"Sample {element} aggiunto al dataset",
            "user_id": user_id
        }
    except HTTPException:
        # Bug fix: the 400 validation error above was previously caught by
        # the generic handler below and re-raised as a 500 — let it (and any
        # HTTPException from extract_features) pass through unchanged.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
async def get_dataset_stats():
    """Endpoint 4: training-dataset statistics.

    Returns hard-coded placeholder figures; in production these would be
    queried from the database/storage backend.
    """
    samples_per_element = {
        "aria": 68,
        "acqua": 71,
        "terra": 69,
        "fuoco": 69,
    }
    return {
        "total_samples": 277,
        "by_element": samples_per_element,
        "last_updated": "2025-08-13T14:20:45Z",
        "model_accuracy": 0.875,
    }
async def retrain_model():
    """Endpoint 5: trigger retraining of the ML model (admin only).

    Currently a stub: in production this would launch a training job and
    the client would poll the returned status URL.
    """
    acknowledgement = {
        "success": True,
        "message": "Retraining iniziato",
        "estimated_time": "5 minuti",
        "status_url": "/api/training/status",
    }
    return acknowledgement
| # ============================================================================ | |
| # RUN | |
| # ============================================================================ | |
async def get_dataset_samples(element: str, limit: int = 10):
    """Fetch audio samples from the training dataset for one element.

    Rejects unknown elements with a 400; otherwise filters the loaded
    dataset's "train" split by (case-insensitive) label and returns up to
    ``limit`` sample descriptors.

    NOTE(review): ``load_audio_dataset`` is not defined anywhere in this
    file as seen — confirm it is defined/imported elsewhere, otherwise this
    handler always returns the 500 below with a NameError detail.
    """
    valid_elements = ["aria", "acqua", "terra", "fuoco"]
    if element.lower() not in valid_elements:
        raise HTTPException(status_code=400, detail=f"Elemento non valido. Usa: {valid_elements}")
    try:
        dataset = load_audio_dataset()
        # Case-insensitive label match against the requested element.
        wanted = element.lower()
        matching = [item for item in dataset['train'] if item['label'].lower() == wanted]
        selected = matching[:limit]
        return {
            "element": element,
            "total_available": len(matching),
            "samples_returned": len(selected),
            "samples": [
                {
                    "audio_path": item['audio']['path'],
                    "duration": item['audio'].get('duration', 0),
                    "sample_rate": item['audio'].get('sampling_rate', 0),
                }
                for item in selected
            ],
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Errore caricamento dataset: {str(e)}")
# Script entry point: serve on all interfaces, port 7860 (the default port
# expected by Hugging Face Spaces).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)