File size: 3,311 Bytes
da6d3f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e558c6d
da6d3f0
 
e558c6d
da6d3f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e558c6d
da6d3f0
 
e558c6d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da6d3f0
 
 
 
 
e558c6d
 
 
 
da6d3f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e558c6d
 
da6d3f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import re
import tempfile

import jiwer
import whisper
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse

# Application object; the route decorators in this file register their
# handlers on it.
app = FastAPI()

# Load the Whisper "base" model once at import time (not per request);
# the request handlers in this module all reuse this single instance.
model = whisper.load_model("base")

@app.get("/")
def root():
    """Return a small status payload showing that the API is reachable."""
    status_payload = {"status": "Speech API is running"}
    return status_payload


# --- 1. Speech to Text ---
@app.post("/stt")
async def speech_to_text(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file with the module-level Whisper model.

    The upload is spooled to a temporary file because ``model.transcribe``
    is called with a filesystem path.

    Args:
        file: The uploaded audio file.

    Returns:
        A dict with a single ``"transcription"`` key holding the recognised
        text.
    """
    # Read the upload before creating the temp file so that a failed read
    # leaves nothing behind on disk.
    audio_bytes = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        # Close the handle before transcribing so the path can be reopened
        # by the transcription backend.
        with tmp:
            tmp.write(audio_bytes)
        result = model.transcribe(tmp.name)
    finally:
        # delete=False means nobody else removes the file; clean up even
        # when writing or transcription raises.
        os.unlink(tmp.name)

    return {"transcription": result["text"]}


# --- 2. Fluency Check ---
def _normalized_words(text):
    """Lower-case *text* and return its words with punctuation removed.

    Apostrophes inside a word ("don't") are kept; surrounding punctuation
    and quotes are dropped, so "Hello, world." yields ["hello", "world"].
    """
    return re.findall(r"\w+(?:'\w+)*", text.lower())


@app.post("/fluency")
async def fluency_check(
    file: UploadFile = File(...),
    reference_text: str = Form(...)
):
    """Score how closely the spoken audio matches ``reference_text``.

    The audio is transcribed with the module-level Whisper model and compared
    with the reference using word error rate (WER). Both texts are
    lower-cased and stripped of punctuation first, so punctuation inserted
    by the transcriber is not counted as a reading mistake.

    Args:
        file: The uploaded audio file.
        reference_text: The text the speaker was asked to read.

    Returns:
        A dict with the transcription, the reference text, a percentage
        fluency score, the WER (capped at 1.0), a matched-word count, a
        verdict and feedback. If ``reference_text`` contains no words, a
        ``JSONResponse`` with status 400 and an ``"error"`` message is
        returned instead.
    """
    reference_words = _normalized_words(reference_text)
    if not reference_words:
        # WER is undefined for an empty reference; reject the request up
        # front instead of failing inside the WER computation.
        return JSONResponse(
            status_code=400,
            content={"error": "reference_text must contain at least one word."},
        )

    # Read the upload before creating the temp file so that a failed read
    # leaves nothing behind on disk.
    audio_bytes = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with tmp:
            tmp.write(audio_bytes)
        result = model.transcribe(tmp.name)
    finally:
        # delete=False means nobody else removes the file; clean up even
        # when writing or transcription raises.
        os.unlink(tmp.name)

    transcription = result["text"]
    spoken_words = _normalized_words(transcription)

    # Calculate Word Error Rate (lower = more fluent)
    if spoken_words:
        wer = jiwer.wer(" ".join(reference_words), " ".join(spoken_words))
    else:
        # Nothing was recognised: every reference word is a deletion.
        wer = 1.0

    # Cap fluency score at 0% minimum (WER can exceed 1 with many insertions)
    fluency_score = round(max(0, (1 - wer)) * 100, 2)

    # Count how many reference words appeared in the transcription; the set
    # is built once instead of re-splitting the transcription per word.
    words_in_reference = len(reference_words)
    spoken_vocabulary = set(spoken_words)
    words_matched = sum(1 for w in reference_words if w in spoken_vocabulary)

    # Verdict with feedback
    if fluency_score >= 80:
        verdict = "Good"
        feedback = "Great job! You read the text accurately."
    elif fluency_score >= 50:
        verdict = "Average"
        feedback = f"You matched {words_matched} out of {words_in_reference} words. Keep practicing!"
    elif fluency_score > 0:
        verdict = "Needs Improvement"
        feedback = f"Only {words_matched} out of {words_in_reference} words matched. Try reading more slowly."
    else:
        verdict = "Wrong Content"
        feedback = "The audio does not match the reference text at all. Please read the given text."

    return {
        "transcription": transcription,
        "reference_text": reference_text,
        "fluency_score": f"{fluency_score}%",
        "word_error_rate": round(min(wer, 1.0), 3),  # Cap WER at 1.0
        "words_matched": f"{words_matched}/{words_in_reference}",
        "verdict": verdict,
        "feedback": feedback
    }


# --- 3. Speech Verification ---
@app.post("/verify")
async def speech_verify(
    file: UploadFile = File(...),
    target_word: str = Form(...)
):
    """Check whether ``target_word`` was spoken in the uploaded audio.

    The audio is transcribed with the module-level Whisper model and the
    lower-cased transcription is searched for the lower-cased target as a
    whole word (or whole phrase), so "cat" does not match "concatenate".

    Args:
        file: The uploaded audio file.
        target_word: The word (or phrase) that must appear in the speech.

    Returns:
        A dict with the lower-cased transcription, the target word as
        submitted, a ``"verified"`` flag, a confidence label and feedback.
        If ``target_word`` is empty or only whitespace, a ``JSONResponse``
        with status 400 and an ``"error"`` message is returned instead.
    """
    target = target_word.strip().lower()
    if not target:
        # An empty target would trivially "match" any transcription.
        return JSONResponse(
            status_code=400,
            content={"error": "target_word must not be empty."},
        )

    # Read the upload before creating the temp file so that a failed read
    # leaves nothing behind on disk.
    audio_bytes = await file.read()

    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with tmp:
            tmp.write(audio_bytes)
        result = model.transcribe(tmp.name)
    finally:
        # delete=False means nobody else removes the file; clean up even
        # when writing or transcription raises.
        os.unlink(tmp.name)

    transcription = result["text"].lower()

    # Lookarounds rather than \b so that targets beginning or ending with a
    # non-word character still match, while adjacent punctuation such as
    # "cat." or "cat," in the transcription is tolerated.
    whole_word = rf"(?<!\w){re.escape(target)}(?!\w)"
    found = re.search(whole_word, transcription) is not None

    return {
        "transcription": transcription,
        "target_word": target_word,
        "verified": found,
        "confidence": "high" if found else "not detected",
        "feedback": f"'{target_word}' was detected in your speech." if found else f"'{target_word}' was NOT detected. Please try again."
    }