# FastAPI audio-processing service (Hugging Face Space)
# Pipeline: upload -> Whisper transcription -> wav2vec2 emotion recognition -> GPT summary.
| from fastapi import FastAPI, File, UploadFile | |
| import uvicorn | |
| import openai | |
| import torch | |
| import torchaudio | |
| import torchaudio.transforms as T | |
| from transformers import Wav2Vec2FeatureExtractor, AutoModelForAudioClassification | |
| import whisper | |
| import os | |
app = FastAPI()

# Load Whisper model for transcription ("small" checkpoint; downloaded on first run).
whisper_model = whisper.load_model("small")

# Load speech emotion recognition model (wav2vec2 fine-tuned on SUPERB emotion task;
# expects 16 kHz mono audio).
ser_model_name = "superb/wav2vec2-base-superb-er"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ser_model_name)
ser_model = AutoModelForAudioClassification.from_pretrained(ser_model_name)

# OpenAI API Key — read from the environment; ensure you set this in the
# terminal before running (openai.api_key is None if the variable is unset).
openai.api_key = os.getenv("OPENAI_API_KEY")
@app.post("/process_audio/")  # was defined but never registered as a route — the app exposed no endpoints
async def process_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file, detect the speaker's emotion, and summarize it.

    Pipeline: save the upload to disk -> decode with torchaudio -> Whisper
    transcription -> wav2vec2 emotion classification -> OpenAI summary.

    Returns:
        dict with "transcription", "emotion", and "summary" on success, or a
        dict with a single "error" key naming the first stage that failed.
    """
    audio_path = "temp_audio.wav"
    try:
        print(f"β File received: {file.filename}")
        # Save the upload to a temp file so torchaudio/Whisper can read it from disk.
        with open(audio_path, "wb") as f:
            f.write(await file.read())
        print("β Audio saved successfully!")

        # Stage 1: verify the file decodes as audio.
        try:
            waveform, sample_rate = torchaudio.load(audio_path)
            print(f"β Audio loaded! Shape: {waveform.shape}, Sample Rate: {sample_rate}")
        except Exception as e:
            return {"error": f"β Audio loading failed: {e}"}

        # Stage 2: Whisper transcription (reads the file itself, handles resampling internally).
        try:
            transcription = whisper_model.transcribe(audio_path)["text"]
            print(f"β Whisper Transcription: {transcription}")
        except Exception as e:
            return {"error": f"β Whisper failed: {e}"}

        # Stage 3: speech emotion recognition.
        try:
            # Downmix multi-channel audio to mono and resample to the 16 kHz
            # rate the SER model was trained on.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)
            if sample_rate != 16000:
                waveform = T.Resample(sample_rate, 16000)(waveform)

            # Fix: the feature extractor expects raw 1-D audio, not a
            # (channels, time) tensor — squeeze the channel dim and convert to numpy.
            inputs = feature_extractor(
                waveform.squeeze(0).numpy(),
                sampling_rate=16000,
                return_tensors="pt",
                padding=True,
            )
            with torch.no_grad():
                logits = ser_model(**inputs).logits
            predicted_class = torch.argmax(logits, dim=-1).item()

            # Fix: use the model's own label map instead of a hard-coded list.
            # superb/wav2vec2-base-superb-er predicts 4 classes (neu/hap/ang/sad),
            # so the previous hand-written 7-emotion list mislabeled predictions.
            emotion_detected = ser_model.config.id2label.get(predicted_class, "unknown")
            print(f"β Emotion Detected: {emotion_detected}")
        except Exception as e:
            return {"error": f"β Emotion recognition failed: {e}"}

        # Stage 4: summarize the transcript via the OpenAI API (openai<1.0 interface).
        try:
            summary_response = openai.ChatCompletion.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "Summarize the following text."},
                    {"role": "user", "content": transcription},
                ],
            )
            summary = summary_response["choices"][0]["message"]["content"]
            print(f"β OpenAI Summary: {summary}")
        except Exception as e:
            return {"error": f"β OpenAI Summarization failed: {e}"}

        return {
            "transcription": transcription,
            "emotion": emotion_detected,
            "summary": summary,
        }
    except Exception as e:
        print(f"β Error in process_audio: {e}")
        return {"error": str(e)}
    finally:
        # Fix: the temp file was previously never deleted; remove it on every path.
        if os.path.exists(audio_path):
            os.remove(audio_path)
if __name__ == "__main__":
    # Run the API directly with uvicorn; bind to all interfaces on port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)