"""FastAPI service exposing Whisper-based speech endpoints.

Endpoints:
    GET  /         -- health check.
    POST /stt      -- transcribe an uploaded audio file.
    POST /fluency  -- score a reading of a reference text.
    POST /verify   -- check whether a target word or phrase was spoken.
"""
import os
import re
import tempfile
from typing import List

import jiwer
import whisper
from fastapi import FastAPI, File, Form, HTTPException, UploadFile
from fastapi.responses import JSONResponse

app = FastAPI()

# Load once at startup (not per request)
model = whisper.load_model("base")

# Anything that is not a word character, whitespace or an apostrophe is
# treated as punctuation and replaced by a space before comparing texts.
_PUNCTUATION_RE = re.compile(r"[^\w\s']+")


def _normalize_words(text: str) -> List[str]:
    """Return *text* as a list of lowercase words with punctuation removed.

    Used so that comparisons between a transcript and a reference are not
    thrown off by case or punctuation (e.g. "World." vs "world").
    Apostrophes inside a word are kept ("don't"); apostrophes at the edges
    of a word (used as quotes) are stripped.
    """
    lowered = text.lower().replace("\u2019", "'")  # curly -> straight apostrophe
    spaced = _PUNCTUATION_RE.sub(" ", lowered)
    stripped = (token.strip("'") for token in spaced.split())
    return [word for word in stripped if word]


def _contains_phrase(words: List[str], phrase: List[str]) -> bool:
    """Return True if *phrase* occurs as a contiguous run of whole words in *words*."""
    span = len(phrase)
    return any(
        words[start:start + span] == phrase
        for start in range(len(words) - span + 1)
    )


async def _transcribe_upload(file: UploadFile) -> str:
    """Write the upload to a temporary file, transcribe it and return the text.

    The temporary file is always removed, including when transcription fails.

    Raises:
        HTTPException: 400 if the uploaded file contains no data.
    """
    audio_bytes = await file.read()
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        # Close the handle before transcribing so the file is fully flushed.
        with tmp:
            tmp.write(audio_bytes)
        # NOTE: transcribe() runs synchronously inside the async handler.
        result = model.transcribe(tmp.name)
    finally:
        os.unlink(tmp.name)  # clean up even if writing or transcription raised
    return result["text"]


@app.get("/")
def root():
    """Health-check endpoint."""
    return {"status": "Speech API is running"}


# --- 1. Speech to Text ---
@app.post("/stt")
async def speech_to_text(file: UploadFile = File(...)):
    """Transcribe the uploaded audio file and return the text."""
    return {"transcription": await _transcribe_upload(file)}


# --- 2. Fluency Check ---
@app.post("/fluency")
async def fluency_check(
    file: UploadFile = File(...),
    reference_text: str = Form(...),
):
    """Score how closely the spoken audio matches *reference_text*.

    Both texts are lowercased and stripped of punctuation before the Word
    Error Rate (WER) is computed. The fluency score is ``(1 - WER) * 100``,
    floored at 0.

    Raises:
        HTTPException: 400 if *reference_text* contains no words or the
            uploaded file is empty.
    """
    reference_words = _normalize_words(reference_text)
    if not reference_words:
        # Checked before transcribing: WER is undefined for an empty reference.
        raise HTTPException(
            status_code=400,
            detail="reference_text must contain at least one word.",
        )

    transcription = await _transcribe_upload(file)
    spoken_words = _normalize_words(transcription)

    # Calculate Word Error Rate (lower = more fluent)
    if spoken_words:
        wer = jiwer.wer(" ".join(reference_words), " ".join(spoken_words))
    else:
        # Nothing recognised: every reference word counts as an error.
        wer = 1.0

    # Cap fluency score at 0% minimum (prevents negative scores)
    fluency_score = round(max(0, (1 - wer)) * 100, 2)

    # Count how many reference words appeared in transcription
    words_in_reference = len(reference_words)
    spoken_vocabulary = set(spoken_words)
    words_matched = sum(1 for word in reference_words if word in spoken_vocabulary)

    # Verdict with feedback
    if fluency_score >= 80:
        verdict = "Good"
        feedback = "Great job! You read the text accurately."
    elif fluency_score >= 50:
        verdict = "Average"
        feedback = f"You matched {words_matched} out of {words_in_reference} words. Keep practicing!"
    elif fluency_score > 0:
        verdict = "Needs Improvement"
        feedback = f"Only {words_matched} out of {words_in_reference} words matched. Try reading more slowly."
    else:
        verdict = "Wrong Content"
        feedback = "The audio does not match the reference text at all. Please read the given text."

    return {
        "transcription": transcription,
        "reference_text": reference_text,
        "fluency_score": f"{fluency_score}%",
        "word_error_rate": round(min(wer, 1.0), 3),  # Cap WER at 1.0
        "words_matched": f"{words_matched}/{words_in_reference}",
        "verdict": verdict,
        "feedback": feedback,
    }


# --- 3. Speech Verification ---
@app.post("/verify")
async def speech_verify(
    file: UploadFile = File(...),
    target_word: str = Form(...),
):
    """Check whether *target_word* was spoken in the uploaded audio.

    Matching is case-insensitive, ignores punctuation and works on whole
    whitespace-delimited words, so "art" does not match "start". A
    multi-word target must appear as a contiguous phrase.

    Raises:
        HTTPException: 400 if *target_word* contains no words or the
            uploaded file is empty.
    """
    target_words = _normalize_words(target_word)
    if not target_words:
        # An empty target would otherwise be "found" in any transcript.
        raise HTTPException(
            status_code=400,
            detail="target_word must contain at least one word.",
        )

    transcription = (await _transcribe_upload(file)).lower()
    found = _contains_phrase(_normalize_words(transcription), target_words)

    return {
        "transcription": transcription,
        "target_word": target_word,
        "verified": found,
        "confidence": "high" if found else "not detected",
        "feedback": f"'{target_word}' was detected in your speech."
        if found
        else f"'{target_word}' was NOT detected. Please try again.",
    }