import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

from fastapi import APIRouter, FastAPI, File, Form, UploadFile
from fastapi.responses import HTMLResponse
from loguru import logger
from pydub import AudioSegment

from app.models.speech_analysis import SpeechAnalysisResponse
from app.utils.ai_speech import pronunciation_assessment
from app.utils.common import aggregate_scores, remove_files, split_audio
from app.utils.constants import UPLOAD_DIR
from app.utils.get_scores import get_content_score

speech_analysis_router = APIRouter()


def _convert_mp3_to_wav(mp3_path: str) -> str:
    """Convert *mp3_path* to a WAV file alongside it and return the WAV path.

    The original MP3 is deleted after a successful export so that the caller
    only has to track the WAV file for cleanup.

    NOTE(review): the original code called an undefined ``convert_mp3_to_wav``
    (a NameError on any mp3 upload) and built the target path with
    ``str.replace("mp3", "wav")``, which corrupts paths that contain "mp3"
    anywhere (e.g. a directory named ``mp3_uploads``). ``os.path.splitext``
    only touches the extension.
    """
    wav_path = os.path.splitext(mp3_path)[0] + ".wav"
    AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
    os.remove(mp3_path)  # the mp3 is no longer needed; avoid leaking it on disk
    return wav_path


@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    """Score an uploaded speech recording.

    Saves the upload under ``UPLOAD_DIR``, converts MP3 input to WAV, splits
    audio longer than 30 s into chunks, then runs the content scorer and the
    per-chunk pronunciation assessment in parallel. The aggregated pronunciation
    scores are merged with the content scores and returned.

    Args:
        audio_file: Uploaded audio (WAV, or MP3 which is converted to WAV).
        language: Language code forwarded to ``pronunciation_assessment``.

    Returns:
        dict matching ``SpeechAnalysisResponse``: aggregated pronunciation
        metrics plus grammar/intonation/comprehension scores and feedback.
    """
    # Make audio directory if not available
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    s_time = time.time()

    # basename() strips any client-supplied directory components, preventing
    # path traversal outside UPLOAD_DIR via a crafted filename.
    safe_name = os.path.basename(audio_file.filename or "upload.wav")
    audio_file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    # Every temp file created from here on is tracked so the finally-block
    # removes them even when scoring raises (the original leaked on error).
    cleanup_paths: list = []
    try:
        # Case-insensitive extension check; the bare endswith("mp3") also
        # matched names like "stereomp3" and missed ".MP3".
        if audio_file_path.lower().endswith(".mp3"):
            audio_file_path = _convert_mp3_to_wav(audio_file_path)
        cleanup_paths.append(audio_file_path)

        # Split audio if longer than 30 seconds
        audio_length_ms = AudioSegment.from_wav(audio_file_path).duration_seconds * 1000
        if audio_length_ms > 30000:
            chunk_paths = split_audio(audio_file_path)
            cleanup_paths.extend(chunk_paths)
        else:
            chunk_paths = [audio_file_path]  # No splitting needed

        # Run the content scorer and per-chunk pronunciation assessments
        # concurrently; all are blocking I/O-bound calls.
        with ThreadPoolExecutor() as executor:
            content_future = executor.submit(get_content_score, audio_file_path)
            pronunciation_futures = [
                executor.submit(pronunciation_assessment, path, language)
                for path in chunk_paths
            ]
            content_score = content_future.result()
            chunk_scores = [future.result() for future in pronunciation_futures]

        # Aggregate azure_speech_scores across chunks
        final_azure_speech_score = aggregate_scores(chunk_scores)

        # Combine final results
        final_azure_speech_score.update(
            {
                "grammar_score": content_score.get("grammar_score"),
                "intonation_score": content_score.get("intonation_score"),
                "comprehension_score": content_score.get("comprehension_score"),
                "grammar_errors": content_score.get("grammar_errors"),
                "pronunciation_feedback": "Demo Content",
                "fluency_feedback": "Demo Content",
                "accuracy_feedback": "Demo Content",
                "grammar_feedback": "Demo Content",
                "intonation_feedback": "Demo Content",
                "comprehension_feedback": "Demo Content",
            }
        )

        e_time = time.time()
        total_time = e_time - s_time
        logger.info(str(total_time))

        return final_azure_speech_score
    finally:
        # Remove the saved upload (and chunks) whether scoring succeeded or not.
        remove_files(cleanup_paths)