"""
Voice Emotion Recognition API

FastAPI application for analyzing voice emotions using Hugging Face transformers.
"""

import logging
import os
import tempfile
from typing import Any, Dict

import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI(
    title="Voice Emotion Recognition API",
    description="API for analyzing voice emotions using Hugging Face transformers",
    version="1.0.0"
)
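
# CORS is wide open (any origin, method, and header) so browser front-ends can
# call the API from anywhere; tighten allow_origins for production use.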
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
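
# The pipeline is created lazily on first use and cached for the process lifetime.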
_voice_emotion_pipeline = None


def get_voice_emotion_pipeline():
    """
    Get or initialize the voice emotion recognition pipeline.

    Returns:
        transformers.pipeline: Voice emotion recognition pipeline
    """
    global _voice_emotion_pipeline

    if _voice_emotion_pipeline is None:
        try:
            # Imported here so the heavy transformers dependency is only loaded on demand.
            from transformers import pipeline

            logger.info("Loading voice emotion recognition model...")
            _voice_emotion_pipeline = pipeline(
                "audio-classification",
                model="firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
            )
            logger.info("Voice emotion recognition model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load voice emotion model: {e}")
            raise

    return _voice_emotion_pipeline


def analyze_voice_emotion_from_file(audio_file: UploadFile) -> Dict[str, Any]:
    """
    Analyze voice emotion from an uploaded audio file.

    Args:
        audio_file: FastAPI UploadFile containing audio data

    Returns:
        dict: Analysis results with emotion, confidence, and all results
    """
    temp_file_path = None
    try:
        pipe = get_voice_emotion_pipeline()
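
        # The pipeline expects a file path, so persist the upload to a temporary
        # file, keeping the original extension as a format hint for decoding.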
        file_extension = os.path.splitext(audio_file.filename)[1] if audio_file.filename else '.webm'

        with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension, mode='wb') as temp_file:
            content = audio_file.file.read()
            temp_file.write(content)
            temp_file_path = temp_file.name

        logger.info(f"Wrote {len(content)} bytes to temp file: {temp_file_path}")
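
        # The inner try/finally guarantees the temp file is removed even if
        # inference raises.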
        try:
            logger.info(f"Analyzing voice emotion from file: {audio_file.filename}")
            results = pipe(temp_file_path)

            if not results:
                raise ValueError("No emotion analysis results returned")

            top_result = max(results, key=lambda x: x['score'])
            emotion_detected = top_result['label']
            confidence = top_result['score']

            logger.info(f"Voice emotion detected: {emotion_detected} (confidence: {confidence:.3f})")

            return {
                'emotion': emotion_detected,
                'confidence': confidence,
                'all_results': results
            }

        finally:
            if temp_file_path and os.path.exists(temp_file_path):
                try:
                    os.unlink(temp_file_path)
                except OSError as e:
                    logger.warning(f"Failed to delete temporary file {temp_file_path}: {e}")

    except Exception as e:
        logger.error(f"Voice emotion analysis failed: {e}")
        # Clean up in case the temp file was written but the inner block never ran.
        if temp_file_path and os.path.exists(temp_file_path):
            try:
                os.unlink(temp_file_path)
            except OSError:
                pass
        raise


@app.get("/")
async def greet_json():
    """Health check / greeting endpoint"""
    return {
        "message": "Voice Emotion Recognition API",
        "status": "running",
        "version": "1.0.0",
        "endpoints": {
            "/analyze": "POST - Analyze voice emotion from audio file",
            "/health": "GET - Health check",
            "/model-info": "GET - Model information",
            "/docs": "GET - API documentation"
        }
    }


@app.get("/health")
async def health_check():
    """Health check endpoint"""
    try:
        # Note: the first call can be slow because it triggers the model load.
        pipeline = get_voice_emotion_pipeline()
        model_loaded = pipeline is not None

        return {
            "status": "healthy",
            "model_loaded": model_loaded,
            "service": "voice-emotion-recognition"
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return JSONResponse(
            status_code=503,
            content={
                "status": "unhealthy",
                "error": str(e),
                "service": "voice-emotion-recognition"
            }
        )
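

# Static metadata about the model and accepted uploads; the 15 MB limit mirrors
# the size check enforced by /analyze.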
@app.get("/model-info")
async def model_info():
    """Get model information endpoint"""
    try:
        pipeline = get_voice_emotion_pipeline()
        model_loaded = pipeline is not None

        return {
            "model_name": "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3",
            "model_loaded": model_loaded,
            "supported_formats": ["wav", "mp3", "flac", "m4a", "webm", "ogg", "opus"],
            "max_duration_seconds": 30,
            "sample_rate": 16000,
            "channels": 1,
            "max_file_size_mb": 15
        }
    except Exception as e:
        logger.error(f"Failed to get model info: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get model info: {str(e)}")


# Declared as a plain def (not async) so FastAPI runs it in a worker thread;
# the blocking model inference would otherwise stall the event loop.
@app.post("/analyze")
def analyze_audio(audio: UploadFile = File(...)):
    """
    Analyze voice emotion from uploaded audio file.

    Args:
        audio: Audio file (wav, mp3, flac, m4a, webm, ogg, opus)

    Returns:
        JSON response with emotion, confidence, and all results
    """
    try:
        if not audio.filename:
            raise HTTPException(status_code=400, detail="No filename provided")
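
        # Determine the upload size by seeking the underlying spooled file to its
        # end, then rewind so the analysis step can read from the start.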
        audio.file.seek(0, os.SEEK_END)
        file_size = audio.file.tell()
        audio.file.seek(0)

        if file_size == 0:
            raise HTTPException(status_code=400, detail="Audio file is empty")

        if file_size > 15 * 1024 * 1024:
            raise HTTPException(status_code=400, detail="Audio file too large (max 15MB)")

        result = analyze_voice_emotion_from_file(audio)

        return {
            "ok": True,
            "emotion": result["emotion"],
            "confidence": result["confidence"],
            "all_results": result["all_results"]
        }
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Error analyzing audio: {e}")
        raise HTTPException(
            status_code=500,
            detail=f"Failed to analyze audio: {str(e)}"
        )
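

# Example request once the server is running (sample.wav stands in for any
# local audio file; the multipart field name must be "audio"):
#   curl -X POST http://localhost:7860/analyze -F "audio=@sample.wav"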
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)