import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor

from fastapi import APIRouter, File, Form, UploadFile
from pydub import AudioSegment

from app.utils.ai_speech import pronunciation_assessment
from app.utils.common import aggregate_scores, remove_files, split_audio
from app.utils.get_scores import get_content_score
from app.models.speech_analysis import SpeechAnalysisResponse
from app.utils.constants import UPLOAD_DIR
from loguru import logger

speech_analysis_router = APIRouter()


@speech_analysis_router.post("/analyze", response_model=SpeechAnalysisResponse)
async def analyze_speech(audio_file: UploadFile = File(...), language: str = Form(...)):
    # Make audio directory if not available
    os.makedirs(UPLOAD_DIR, exist_ok=True)

    # Save the uploaded file (use the basename only, so a crafted filename
    # cannot escape UPLOAD_DIR via path traversal)
    s_time = time.time()
    audio_file_path = os.path.join(UPLOAD_DIR, os.path.basename(audio_file.filename))

    with open(audio_file_path, "wb") as buffer:
        shutil.copyfileobj(audio_file.file, buffer)

    # Convert MP3 uploads to WAV, since the rest of the pipeline expects WAV input
    if audio_file_path.endswith(".mp3"):
        wav_path = os.path.splitext(audio_file_path)[0] + ".wav"
        AudioSegment.from_mp3(audio_file_path).export(wav_path, format="wav")
        os.remove(audio_file_path)  # the original MP3 is no longer needed
        audio_file_path = wav_path

    # Split audio if longer than 30 seconds (len() of an AudioSegment is its duration in ms)
    audio_length_ms = len(AudioSegment.from_wav(audio_file_path))
    if audio_length_ms > 30000:
        chunk_paths = split_audio(audio_file_path)
    else:
        chunk_paths = [audio_file_path]  # No splitting needed

    # Process each chunk for azure_speech_score in parallel
    with ThreadPoolExecutor() as executor:
        content_future = executor.submit(get_content_score, audio_file_path)
        pronunciation_futures = [executor.submit(pronunciation_assessment, path, language) for path in chunk_paths]
        content_score = content_future.result()
        chunk_scores = [future.result() for future in pronunciation_futures]

    # Aggregate azure_speech_scores across chunks
    final_azure_speech_score = aggregate_scores(chunk_scores)

    # Combine final results
    final_azure_speech_score.update(
        {
            "grammar_score": content_score.get("grammar_score"),
            "intonation_score": content_score.get("intonation_score"),
            "comprehension_score": content_score.get("comprehension_score"),
            "grammar_errors": content_score.get("grammar_errors"),
            "pronunciation_feedback": "Demo Content",
            "fluency_feedback": "Demo Content",
            "accuracy_feedback": "Demo Content",
            "grammar_feedback": "Demo Content",
            "intonation_feedback": "Demo Content",
            "comprehension_feedback": "Demo Content",
        }
    )

    e_time = time.time()
    total_time = e_time - s_time
    logger.info(f"Speech analysis completed in {total_time:.2f}s")

    # Clean up the chunks plus the (possibly converted) source file; skip the
    # append when the audio was short enough not to be split, so remove_files
    # is not asked to delete the same path twice
    if audio_file_path not in chunk_paths:
        chunk_paths.append(audio_file_path)
    remove_files(chunk_paths)

    return final_azure_speech_score