# speech-analysis — app/services/speech_analysis.py
# author: jaykishan-b — commit 79b7942 ("init")
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor
from fastapi import APIRouter, FastAPI, File, Form, UploadFile
from fastapi.responses import HTMLResponse
from pydub import AudioSegment
from app.utils.ai_speech import pronunciation_assessment
from app.utils.common import aggregate_scores, remove_files, split_audio
from app.utils.get_scores import get_content_score
from app.models.speech_analysis import SpeechAnalysisResponse
from app.utils.constants import UPLOAD_DIR
from loguru import logger
speech_analysis_router = APIRouter()


def _convert_mp3_to_wav(mp3_path: str) -> str:
    """Convert an MP3 file to WAV with pydub and return the new path.

    Only the extension is swapped (not every "mp3" substring in the name),
    and the original MP3 is removed after a successful conversion so no
    stray files accumulate in the upload directory.
    """
    wav_path = os.path.splitext(mp3_path)[0] + ".wav"
    AudioSegment.from_mp3(mp3_path).export(wav_path, format="wav")
    os.remove(mp3_path)
    return wav_path


@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    """Analyze an uploaded speech recording.

    Saves the upload, converts MP3 input to WAV, splits recordings longer
    than 30 seconds into chunks, then runs content scoring (whole file) and
    pronunciation assessment (per chunk) in parallel and aggregates the
    chunk-level pronunciation scores into one response.

    Args:
        audio_file: Uploaded audio file (WAV, or MP3 which is converted).
        language: Language code forwarded to the pronunciation assessor.

    Returns:
        A dict matching SpeechAnalysisResponse: aggregated pronunciation
        metrics merged with grammar/intonation/comprehension scores and
        (currently placeholder) feedback strings.
    """
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    s_time = time.time()

    # basename() guards against path traversal via a crafted upload
    # filename such as "../../etc/passwd".
    safe_name = os.path.basename(audio_file.filename)
    audio_file_path = os.path.join(UPLOAD_DIR, safe_name)
    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    # Compare the extension case-insensitively; a bare endswith("mp3")
    # would also match names like "foomp3" and miss ".MP3".
    if os.path.splitext(audio_file_path)[1].lower() == ".mp3":
        audio_file_path = _convert_mp3_to_wav(audio_file_path)

    chunk_paths = [audio_file_path]  # default: no splitting needed
    try:
        # Split recordings longer than 30 seconds into shorter chunks.
        audio_length_ms = AudioSegment.from_wav(audio_file_path).duration_seconds * 1000
        if audio_length_ms > 30000:
            chunk_paths = split_audio(audio_file_path)

        # Content scoring runs on the full file while pronunciation
        # assessment runs per chunk, all submitted to one thread pool.
        with ThreadPoolExecutor() as executor:
            content_future = executor.submit(get_content_score, audio_file_path)
            pronunciation_futures = [
                executor.submit(pronunciation_assessment, path, language)
                for path in chunk_paths
            ]
            content_score = content_future.result()
            chunk_scores = [future.result() for future in pronunciation_futures]

        # Aggregate pronunciation scores across chunks, then merge in the
        # content-based scores and placeholder feedback strings.
        final_azure_speech_score = aggregate_scores(chunk_scores)
        final_azure_speech_score.update(
            {
                "grammar_score": content_score.get("grammar_score"),
                "intonation_score": content_score.get("intonation_score"),
                "comprehension_score": content_score.get("comprehension_score"),
                "grammar_errors": content_score.get("grammar_errors"),
                "pronunciation_feedback": "Demo Content",
                "fluency_feedback": "Demo Content",
                "accuracy_feedback": "Demo Content",
                "grammar_feedback": "Demo Content",
                "intonation_feedback": "Demo Content",
                "comprehension_feedback": "Demo Content",
            }
        )
        total_time = time.time() - s_time
        logger.info(str(total_time))
        return final_azure_speech_score
    finally:
        # Clean up the upload and all chunks even when scoring fails.
        # dict.fromkeys de-duplicates while preserving order, so the
        # original path is not passed to remove_files twice when no
        # splitting occurred (chunk_paths already contains it).
        remove_files(list(dict.fromkeys(chunk_paths + [audio_file_path])))