Oviya committed on
Commit
80adf3e
·
1 Parent(s): 8eeff6c

update pronragg

Browse files
chroma_db/6bb1d18d-491e-4b83-bb53-aa5824da7394/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db8f9dd08c89ad45ed5b37e53fb7096c1f0be75e0c9377baede6add3ae3b97c6
3
+ size 167600
chroma_db/6bb1d18d-491e-4b83-bb53-aa5824da7394/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
+ size 100
chroma_db/6bb1d18d-491e-4b83-bb53-aa5824da7394/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27571dbe52639d675f7ce26f4bf06ca84d65a2c943ad57727b90871d758a0d4d
3
+ size 400
chroma_db/6bb1d18d-491e-4b83-bb53-aa5824da7394/link_lists.bin ADDED
File without changes
chroma_db/a7177db3-89c4-4f3b-b1c6-6ac2ec4b0384/data_level0.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f4f6e0dd5ee483e09b514559e6411fbc53b886ea77d8b25559576d80e4642179
3
+ size 167600
chroma_db/a7177db3-89c4-4f3b-b1c6-6ac2ec4b0384/header.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0e81c3b22454233bc12d0762f06dcca48261a75231cf87c79b75e69a6c00150
3
+ size 100
chroma_db/a7177db3-89c4-4f3b-b1c6-6ac2ec4b0384/length.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acc1ab79cf9173b7ffaa20ccc92926b3f13299fc8c1fcc191a99c6a56cb2cebd
3
+ size 400
chroma_db/a7177db3-89c4-4f3b-b1c6-6ac2ec4b0384/link_lists.bin ADDED
File without changes
pronragg.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import base64
4
+ import tempfile
5
+ import subprocess
6
+ import re
7
+ import random
8
+
9
+ from flask import Blueprint, request, jsonify
10
+ from flask_cors import CORS
11
+ from pydub import AudioSegment
12
+ from faster_whisper import WhisperModel
13
+ from rapidfuzz.distance import Levenshtein
14
+ import chromadb
15
+
16
pronragg_bp = Blueprint("pronragg", __name__)

# --------------------------------------------------
# CONFIG
# --------------------------------------------------
# All file paths are resolved relative to this module's directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

VIDEO_PATH = os.path.join(BASE_DIR, "feedback.mp4")
JSON_PATH = os.path.join(BASE_DIR, "teacher_feedback_sentences_category.json")
CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")

WHISPER_MODEL = "base"          # faster-whisper model size
SAFE_PADDING = 0.05             # seconds of slack around each video cut
PAUSE_SECONDS = 0.5             # freeze-frame pause inserted between clips
MAX_SEGMENTS_PER_CATEGORY = 3   # cap on feedback clips per response

# Issue priority (VERY IMPORTANT): analyze() reports only the first
# detected issue in this order.
ISSUE_PRIORITY = [
    "silence",
    "multipleword",
    "wrong_word",
    "consonant",
    "vowel",
    "ending",
    "syllable",
    "stress",
    "success",
]

# --------------------------------------------------
# INIT MODELS (CPU-friendly int8 inference)
# --------------------------------------------------
whisper = WhisperModel(WHISPER_MODEL, device="cpu", compute_type="int8")

# --------------------------------------------------
# CHROMA INIT — persistent store of feedback-sentence metadata
# --------------------------------------------------
client = chromadb.PersistentClient(path=CHROMA_DIR)
collection = client.get_or_create_collection("feedback")
60
+
61
def init_segments():
    """Seed the Chroma collection from the feedback JSON file (idempotent).

    Skips entirely when the collection already holds data.  Fix: items are
    now added in a single batched ``collection.add`` call instead of one
    network/storage round trip per item, and an empty JSON file no longer
    triggers an add() with empty lists.
    """
    if collection.count() > 0:
        return

    with open(JSON_PATH, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        return

    collection.add(
        ids=[item["id"] for item in data],
        documents=[item["text"] for item in data],
        metadatas=[
            {
                "category": item["category"],
                "start": item["start"],
                "end": item["end"],
            }
            for item in data
        ],
    )

init_segments()
80
+
81
+ # --------------------------------------------------
82
+ # HELPERS
83
+ # --------------------------------------------------
84
def normalize_text(text: str) -> str:
    """Lowercase *text* and strip everything that is not a-z."""
    lowered = text.strip().lower()
    return re.sub(r"[^a-z]", "", lowered)
86
+
87
def transcribe(wav_path: str) -> str:
    """Run faster-whisper on a wav file; return the lowercase transcript."""
    segments, _info = whisper.transcribe(
        wav_path,
        language="en",
        beam_size=5,
        vad_filter=True,
    )
    text = "".join(segment.text for segment in segments)
    return text.strip().lower()
95
+
96
+ # --------------------------------------------------
97
+ # PRONUNCIATION LOGIC (FIXED)
98
+ # --------------------------------------------------
99
def analyze(expected: str, heard_raw: str):
    """Compare the expected word with the ASR transcript.

    Returns ``([issue], score)`` where ``issue`` is exactly one category
    from ISSUE_PRIORITY and ``score`` is an int 0-100.

    Fix: an expected word that normalizes to "" (no a-z letters) previously
    crashed with IndexError on ``expected_n[0]`` / ``expected_n[-1]``; it is
    now treated as unmatchable (``wrong_word``).
    """
    expected_n = normalize_text(expected)
    heard_n = normalize_text(heard_raw)

    if not heard_n:
        return ["silence"], 0

    if len(heard_raw.strip().split()) > 1:
        return ["multipleword"], 20

    # Guard: nothing left to compare against after normalization.
    if not expected_n:
        return ["wrong_word"], 0

    similarity = Levenshtein.normalized_similarity(expected_n, heard_n)
    score = int(similarity * 100)

    if similarity < 0.30:
        return ["wrong_word"], score

    detected = []
    vowels = "aeiou"

    # First-letter mismatch: classify by the expected character's category.
    if expected_n[0] != heard_n[0]:
        detected.append("vowel" if expected_n[0] in vowels else "consonant")

    # Vowel sequence mismatch (only add if not already classified as vowel).
    expected_vowels = [c for c in expected_n if c in vowels]
    heard_vowels = [c for c in heard_n if c in vowels]
    if expected_vowels != heard_vowels and "vowel" not in detected:
        detected.append("vowel")

    # Final-letter mismatch -> ending issue.
    if expected_n[-1] != heard_n[-1]:
        detected.append("ending")

    # Large length difference -> syllable-count issue (coarse heuristic).
    if abs(len(expected_n) - len(heard_n)) >= 2:
        detected.append("syllable")

    # Close but imperfect with nothing else flagged -> stress issue.
    if similarity < 0.85 and not detected:
        detected.append("stress")

    if not detected:
        return ["success"], score

    # Report only the single highest-priority issue.
    for issue in ISSUE_PRIORITY:
        if issue in detected:
            return [issue], score

    return ["success"], score
155
+
156
+ # --------------------------------------------------
157
+ # FETCH SEGMENTS (STRICT)
158
+ # --------------------------------------------------
159
def fetch_segments(categories):
    """Return up to MAX_SEGMENTS_PER_CATEGORY shuffled segment metadatas.

    Only the first category in *categories* is used; an empty list is
    returned when no matching segments exist.
    """
    if not categories:
        return []

    wanted = categories[0]

    result = collection.get(where={"category": wanted})

    # Strict re-filter on category (important): trust only exact matches.
    matches = [
        meta for meta in result.get("metadatas", [])
        if meta.get("category") == wanted
    ]

    if not matches:
        return []

    random.shuffle(matches)
    return matches[:MAX_SEGMENTS_PER_CATEGORY]
176
+
177
+ # --------------------------------------------------
178
+ # BUILD VIDEO WITH FREEZE-HOLD PAUSE
179
+ # --------------------------------------------------
180
def build_video(segments):
    """Cut and join teacher-video segments into one base64-encoded mp4.

    Each clip (except the last) gets a freeze-frame + audio pad of
    PAUSE_SECONDS; the clips are then joined with ffmpeg's concat demuxer.
    Returns "" when *segments* is empty.

    Fix: every temp file (per-segment clips, the concat list, the final
    mp4) is now removed in a ``finally`` block — previously all of them
    leaked into the temp directory on each call.  ffmpeg failures are still
    silently ignored (output discarded), matching the original best-effort
    behavior.
    """
    if not segments:
        return ""

    segments = sorted(segments, key=lambda x: x["start"])
    temp_paths = []

    def _new_temp(suffix):
        # Create a *closed* temp file whose path we hand to ffmpeg.
        handle = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        handle.close()
        temp_paths.append(handle.name)
        return handle.name

    try:
        clips = []
        for i, seg in enumerate(segments):
            clip_path = _new_temp(".mp4")
            clips.append(clip_path)

            # Hold the last frame only *between* clips, not after the final one.
            pause = PAUSE_SECONDS if i < len(segments) - 1 else 0

            subprocess.run(
                [
                    "ffmpeg", "-y",
                    "-ss", str(max(0, seg["start"] - SAFE_PADDING)),
                    "-to", str(seg["end"] + SAFE_PADDING),
                    "-i", VIDEO_PATH,
                    "-vf", f"tpad=stop_mode=clone:stop_duration={pause}",
                    "-af", f"apad=pad_dur={pause}",
                    "-c:v", "libx264",
                    "-c:a", "aac",
                    "-movflags", "+faststart",
                    clip_path,
                ],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

        # concat demuxer input list: one "file '<path>'" line per clip.
        concat_path = _new_temp(".txt")
        with open(concat_path, "w") as f:
            for clip_path in clips:
                f.write(f"file '{clip_path}'\n")

        final_path = _new_temp(".mp4")
        subprocess.run(
            [
                "ffmpeg", "-y",
                "-f", "concat",
                "-safe", "0",
                "-i", concat_path,
                "-c:v", "libx264",
                "-c:a", "aac",
                final_path,
            ],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )

        with open(final_path, "rb") as f:
            return base64.b64encode(f.read()).decode("utf-8")
    finally:
        for path in temp_paths:
            try:
                os.remove(path)
            except OSError:
                pass
233
+
234
+ # --------------------------------------------------
235
+ # API
236
+ # --------------------------------------------------
237
@pronragg_bp.route("/score", methods=["POST"])
def score_pronunciation():
    """Score one recorded word and return issues, score, and feedback video.

    Expects multipart form data: ``word`` (target word) and ``audio``
    (recorded file).  Responds 400 when either is missing.

    Fix: the uploaded temp file and its wav conversion are now deleted in a
    ``finally`` block — previously both leaked on every request.
    """
    expected = request.form.get("word", "").strip()
    audio = request.files.get("audio")

    if not expected or not audio:
        return jsonify({"error": "Missing input"}), 400

    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".webm")
    temp.close()  # pydub/ffmpeg reopen the file by path
    wav = temp.name[: -len(".webm")] + ".wav"

    try:
        audio.save(temp.name)
        AudioSegment.from_file(temp.name).export(wav, format="wav")

        heard = transcribe(wav)
        issues, score = analyze(expected, heard)

        # Fall back to the generic "silence" clips when the detected
        # category has no stored segments.
        segments = fetch_segments(issues) or fetch_segments(["silence"])
        video_blob = build_video(segments)

        return jsonify({
            "expected": expected,
            "heard": heard,
            "issues": issues,
            "score": score,
            "videoBlobBase64": video_blob
        })
    finally:
        for path in (temp.name, wav):
            try:
                os.remove(path)
            except OSError:
                pass
pronvideo.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import tempfile
4
+ from flask import Flask, Blueprint, request, jsonify
5
+ from flask_cors import CORS
6
+ from pydub import AudioSegment
7
+ from rapidfuzz.distance import Levenshtein
8
+
9
+ # ASR - WhisperX (or Faster Whisper for Forced Alignment)
10
+ try:
11
+ from faster_whisper import WhisperModel
12
+ HAS_WHISPER = True
13
+ except Exception:
14
+ HAS_WHISPER = False
15
+
16
+ # Initialize the Flask app and Blueprint
17
+
18
+ pronvideo_bp = Blueprint("pronvideo", __name__)
19
+
20
+ # -----------------------------
21
+ # Load Whisper model (CPU friendly)
22
+ # -----------------------------
23
+ WHISPER_MODEL_SIZE = os.getenv("WHISPER_MODEL_SIZE", "base")
24
+ whisper_model = None
25
+ if HAS_WHISPER:
26
+ whisper_model = WhisperModel(
27
+ WHISPER_MODEL_SIZE,
28
+ device="cpu",
29
+ compute_type="int8"
30
+ )
31
+
32
+ # -----------------------------
33
+ # Helpers
34
+ # -----------------------------
35
def normalize(text: str) -> str:
    """Lowercase and strip *text*, keeping only letters and whitespace."""
    cleaned = text.lower().strip()
    kept = [ch for ch in cleaned if ch.isalpha() or ch.isspace()]
    return "".join(kept)
37
+
38
def phoneme_similarity_score(expected_ph: str, spoken_ph: str) -> int:
    """Return a 0-100 Levenshtein-based similarity; 0 if either is empty."""
    if not expected_ph or not spoken_ph:
        return 0
    longest = max(len(expected_ph), len(spoken_ph))
    similarity = 1 - Levenshtein.distance(expected_ph, spoken_ph) / longest
    raw = int(round(similarity * 100))
    return min(100, max(0, raw))
46
+
47
def convert_to_wav_temp(upload_file) -> str:
    """Decode an uploaded audio file to a mono 16 kHz wav temp file.

    Returns the temp file's path; the caller is responsible for deleting it.
    """
    upload_file.stream.seek(0)
    payload = io.BytesIO(upload_file.stream.read())
    # Hint the decoder with the filename extension when available.
    fmt = os.path.splitext(upload_file.filename)[1].replace(".", "").lower() or None

    try:
        segment = AudioSegment.from_file(payload, format=fmt if fmt else None)
    except Exception:
        # Fall back to format auto-detection.
        payload.seek(0)
        segment = AudioSegment.from_file(payload)

    segment = segment.set_channels(1).set_frame_rate(16000)
    out = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    segment.export(out.name, format="wav")
    return out.name
63
+
64
def transcribe_audio(audio_path: str) -> str:
    """Transcribe *audio_path* with faster-whisper.

    Raises RuntimeError when the ASR backend is unavailable.
    """
    if not HAS_WHISPER or whisper_model is None:
        raise RuntimeError("Whisper ASR is not installed/available.")
    segments, _info = whisper_model.transcribe(
        audio_path,
        language="en",
        vad_filter=True,
    )
    pieces = [seg.text.strip() for seg in segments if seg.text]
    return " ".join(pieces).strip()
77
+
78
+ # -----------------------------
79
+ # Video feedback helpers
80
+ # -----------------------------
81
def static_video_for(kind: str):
    """Map an issue *kind* to its static feedback video metadata.

    Returns a dict with keys videoId/videoUrl/hint; unknown kinds (including
    "mixed") get all-None values.
    """
    catalog = {
        "success": ("video-success", "/assets/pronvideo/feedback/success.mp4", "Great job! Keep going."),
        "silence": ("video-silence", "/assets/pronvideo/feedback/silence.mp4", "Speak clearly into the mic for at least a second."),
        "wrong_word": ("video-wrong-word", "/assets/pronvideo/feedback/wrongword.mp4", "Please say only the target word."),
        "vowel": ("video-vowel", "/assets/pronvideo/feedback/vowel.mp4", "Work on vowel shape and length."),
        "consonant": ("video-consonant", "/assets/pronvideo/feedback/consonant.mp4", "Focus on consonant articulation, especially start/end sounds."),
        "stress": ("video-stress", "/assets/pronvideo/feedback/stress.mp4", "Emphasize the primary stressed syllable."),
        "syllable": ("video-syllable", "/assets/pronvideo/feedback/syllable.mp4", "Match the number of syllables and rhythm."),
        "ending": ("video-ending", "/assets/pronvideo/feedback/ending.mp4", "Work on the final sound—try to finish the word cleanly."),
        "multipleword": ("video-multipleword", "/assets/pronvideo/feedback/multipleword.mp4", "Please say only the target word, not multiple words."),
    }
    entry = catalog.get(kind)
    if entry is None:
        return {"videoId": None, "videoUrl": None, "hint": None}
    video_id, video_url, hint = entry
    return {"videoId": video_id, "videoUrl": video_url, "hint": hint}
95
+
96
+ # -----------------------------
97
+ # Function to detect feedback based on pronunciation
98
+ # -----------------------------
99
def vowel_consonant_feedback(teacher_ph: str, student_ph: str):
    """Compare teacher vs student pronunciations and list targeted feedback.

    Returns a list of {"title", "message"} dicts covering vowel, consonant,
    and ending-sound differences (empty list when none differ).
    """
    notes = []

    teacher_tokens = split_ipa_tokens(teacher_ph)
    student_tokens = split_ipa_tokens(student_ph)

    # Vowel sequence check
    if extract_vowel_sequence(teacher_ph) != extract_vowel_sequence(student_ph):
        notes.append({
            "title": "Vowel Accuracy",
            "message": "Your vowel sound is different. Focus on long/short quality and mouth opening."
        })

    # Consonant sequence check
    if extract_consonant_tokens(teacher_tokens) != extract_consonant_tokens(student_tokens):
        notes.append({
            "title": "Consonant Accuracy",
            "message": "Some consonant sounds differ. Pay attention to start and end sounds."
        })

    # Ending sound check
    teacher_end = last_ending_token(teacher_tokens)
    student_end = last_ending_token(student_tokens)
    if teacher_end and student_end and teacher_end != student_end:
        notes.append({
            "title": "Ending Sound",
            "message": f"The final sound differs. Try to end with '{teacher_end}'."
        })

    return notes
134
+
135
+ # -----------------------------
136
+ # Syllable estimation logic
137
+ # -----------------------------
138
def syllable_estimate(ipa: str):
    """Estimate the syllable count of *ipa* as the number of vowel runs.

    Fix: the previous version reset ``in_vowel`` on the second consecutive
    vowel, so a run of three or more vowels was counted more than once
    (e.g. "aaa" counted 2 runs instead of 1).  State is now reset only on
    non-vowel characters.
    """
    count = 0
    in_vowel = False
    for ch in ipa:
        if ch in VOWELS:
            if not in_vowel:
                # First vowel of a new run starts a syllable.
                count += 1
                in_vowel = True
        else:
            in_vowel = False
    return max(1, count)  # at least 1 syllable
151
+
152
def select_video_for_vc(teacher_ph: str, student_ph: str) -> str:
    """Classify the dominant pronunciation issue between two phoneme strings.

    Returns one of "wrong_word", "success", a single issue name
    ("vowel"/"consonant"/"ending"/"stress"/"syllable"), or "mixed" when
    several issues are detected.
    """
    # Early check: very low overall similarity means a different word.
    # Threshold chosen empirically; adjust (0-100). <40 => likely a different word.
    if phoneme_similarity_score(teacher_ph, student_ph) < 40:
        return "wrong_word"

    teacher_tokens = split_ipa_tokens(teacher_ph)
    student_tokens = split_ipa_tokens(student_ph)

    issues = []

    if extract_vowel_sequence(teacher_ph) != extract_vowel_sequence(student_ph):
        issues.append("vowel")

    if extract_consonant_tokens(teacher_tokens) != extract_consonant_tokens(student_tokens):
        issues.append("consonant")

    teacher_end = last_ending_token(teacher_tokens)
    student_end = last_ending_token(student_tokens)
    if teacher_end and student_end and teacher_end != student_end:
        issues.append("ending")

    teacher_stress = primary_stress_position(teacher_tokens)
    student_stress = primary_stress_position(student_tokens)
    if teacher_stress is not None and student_stress is not None and teacher_stress != student_stress:
        issues.append("stress")

    if syllable_estimate(teacher_ph) != syllable_estimate(student_ph):
        issues.append("syllable")

    if not issues:
        return "success"      # Correct pronunciation
    if len(issues) == 1:
        return issues[0]      # Exactly one mismatch type
    return "mixed"            # Multiple issues found
194
+
195
+ # -----------------------------
196
+ # Route: Score pronunciation with targeted feedback
197
+ # -----------------------------
198
@pronvideo_bp.route("/score", methods=["POST"])
def score_pronunciation():
    """Score an uploaded single-word recording and pick a feedback video.

    Expects multipart form data with ``audio`` (file) and ``word`` (target).
    Always responds 200 with a score payload for valid input; 400 for
    missing input; 500 on unexpected server errors.
    """
    if "audio" not in request.files:
        return jsonify({"score": 0, "error": "audio_required"}), 400
    expected_word = request.form.get("word", "").strip().lower()
    if not expected_word:
        return jsonify({"score": 0, "error": "word_required"}), 400

    upload = request.files["audio"]

    wav_path = None
    try:
        wav_path = convert_to_wav_temp(upload)

        # Transcribe and normalize what the student actually said.
        heard = normalize(transcribe_audio(wav_path))

        # No speech detected at all.
        if not heard:
            vid = static_video_for("silence")
            return jsonify({
                "score": 0,
                "error": "no_asr_text",
                "message": "No speech detected.",
                "hint": vid["hint"],
                "videoId": vid["videoId"],
                "videoUrl": vid["videoUrl"],
                "expected": expected_word,
                "heard": ""
            }), 200

        # More than one word spoken.
        if len(heard.split()) > 1:
            vid = static_video_for("multipleword")
            return jsonify({
                "score": 0,
                "error": "multiple_words",
                "message": f"Detected multiple words: '{heard}'. Please say only '{expected_word}'.",
                "hint": vid["hint"],
                "videoId": vid["videoId"],
                "videoUrl": vid["videoUrl"],
                "expected": expected_word,
                "heard": heard
            }), 200

        # NOTE(review): the raw words stand in for phoneme strings here —
        # a true grapheme-to-phoneme step would improve accuracy.
        score = phoneme_similarity_score(expected_word, heard)

        # Success only on an exact match with a high score.
        if heard == expected_word and score >= 90:
            vid = static_video_for("success")
            return jsonify({
                "score": score,
                "message": f"Excellent. You pronounced '{expected_word}' correctly.",
                "hint": vid["hint"],
                "videoId": vid["videoId"],
                "videoUrl": vid["videoUrl"],
                "expected": expected_word,
                "heard": heard
            }), 200

        # Mismatch: pick the targeted feedback video (vowel/consonant/etc).
        vid = static_video_for(select_video_for_vc(expected_word, heard))
        return jsonify({
            "score": score,
            "message": "Good try. Some sounds need practice.",
            "hint": vid["hint"],
            "videoId": vid["videoId"],
            "videoUrl": vid["videoUrl"],
            "expected": expected_word,
            "heard": heard
        }), 200

    except Exception as e:
        return jsonify({"score": 0, "error": "server_exception", "message": str(e)}), 500
    finally:
        # Always remove the temporary wav conversion.
        if wav_path:
            try:
                os.remove(wav_path)
            except Exception:
                pass
283
+
284
+
285
+ # IPA helpers and constants (adds split_ipa_tokens and related helpers)
286
+ VOWELS = set("aeiouɪʊɛæɔɑəɜɒeɪoʊaɪɔɪ") # extend with additional IPA symbols as needed
287
+ PRIMARY_STRESS = "ˈ"
288
+ SECONDARY_STRESS = "ˌ"
289
+ IPA_DIGRAPHS = {"tʃ", "dʒ", "t͡ʃ", "d͡ʒ"} # common multi-char IPA consonants
290
+
291
+ def split_ipa_tokens(ipa: str):
292
+ """
293
+ Tokenize an IPA or simple-orthography string into a list of tokens.
294
+ - Preserves stress markers as separate tokens.
295
+ - Combines common digraphs (e.g. 'tʃ', 'dʒ').
296
+ - If input contains spaces, splits on words and tokenizes each chunk.
297
+ Works acceptably for plain words (will return characters) and basic IPA.
298
+ """
299
+ if not ipa:
300
+ return []
301
+ ipa = ipa.strip()
302
+ # If whitespace-separated, preserve word boundaries as contiguous tokens
303
+ if " " in ipa:
304
+ parts = []
305
+ for part in ipa.split():
306
+ parts.extend(_tokenize_chunk(part))
307
+ return parts
308
+ return _tokenize_chunk(ipa)
309
+
310
+ def _tokenize_chunk(chunk: str):
311
+ tokens = []
312
+ i = 0
313
+ while i < len(chunk):
314
+ ch = chunk[i]
315
+ # stress markers
316
+ if ch in (PRIMARY_STRESS, SECONDARY_STRESS):
317
+ tokens.append(ch)
318
+ i += 1
319
+ continue
320
+ # try two-character digraphs first
321
+ if i + 1 < len(chunk):
322
+ pair = chunk[i : i + 2]
323
+ if pair in IPA_DIGRAPHS:
324
+ tokens.append(pair)
325
+ i += 2
326
+ continue
327
+ # fallback single character token
328
+ tokens.append(ch)
329
+ i += 1
330
+ return tokens
331
+
332
def extract_vowel_sequence(ipa: str):
    """Return the vowel tokens of *ipa*, in order, joined into one string."""
    vowel_tokens = [token for token in split_ipa_tokens(ipa) if token in VOWELS]
    return "".join(vowel_tokens)
336
+
337
def extract_consonant_tokens(tokens):
    """Keep only consonant tokens: drop vowels, stress markers, and
    whitespace-only tokens, preserving order."""
    stress_marks = (PRIMARY_STRESS, SECONDARY_STRESS)
    result = []
    for token in tokens:
        if token in VOWELS or token in stress_marks:
            continue
        if not token.strip():
            continue
        result.append(token)
    return result
340
+
341
def last_ending_token(tokens):
    """Approximate the final sound: the last token that is non-empty and not
    a stress marker, or None when no such token exists."""
    stress_marks = (PRIMARY_STRESS, SECONDARY_STRESS)
    for token in reversed(tokens):
        if token and token not in stress_marks:
            return token
    return None
348
+
349
def primary_stress_position(tokens):
    """Index of the primary stress marker in *tokens*, or None when absent.

    A coarse approximation used to compare stress placement between the
    expected and spoken forms.
    """
    if PRIMARY_STRESS in tokens:
        return tokens.index(PRIMARY_STRESS)
    return None
358
+
359
+
teacher_feedback_sentences_category.json ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "id": "feedback.mp4_sent_0",
4
+ "video_file": "feedback.mp4",
5
+ "start": 0.167,
6
+ "end": 1.689,
7
+ "text": "I could not hear your voice.",
8
+ "category": "silence"
9
+ },
10
+ {
11
+ "id": "feedback.mp4_sent_1",
12
+ "video_file": "feedback.mp4",
13
+ "start": 2.771,
14
+ "end": 5.735,
15
+ "text": "Hold the record button and say the word.",
16
+ "category": "silence"
17
+ },
18
+ {
19
+ "id": "feedback.mp4_sent_2",
20
+ "video_file": "feedback.mp4",
21
+ "start": 6.936,
22
+ "end": 9.319,
23
+ "text": "Speak clearly and try again.",
24
+ "category": "silence"
25
+ },
26
+ {
27
+ "id": "feedback.mp4_sent_3",
28
+ "video_file": "feedback.mp4",
29
+ "start": 9.24,
30
+ "end": 12.304,
31
+ "text": "It sounds like a different word.",
32
+ "category": "wrong_word"
33
+ },
34
+ {
35
+ "id": "feedback.mp4_sent_4",
36
+ "video_file": "feedback.mp4",
37
+ "start": 13.305,
38
+ "end": 15.568,
39
+ "text": "Please say the correct word again.",
40
+ "category": "wrong_word"
41
+ },
42
+ {
43
+ "id": "feedback.mp4_sent_5",
44
+ "video_file": "feedback.mp4",
45
+ "start": 15.548,
46
+ "end": 19.153,
47
+ "text": "Listen to the example and repeat.",
48
+ "category": "wrong_word"
49
+ },
50
+ {
51
+ "id": "feedback.mp4_sent_6",
52
+ "video_file": "feedback.mp4",
53
+ "start": 20.255,
54
+ "end": 20.956,
55
+ "text": "Excellent!",
56
+ "category": "success"
57
+ },
58
+ {
59
+ "id": "feedback.mp4_sent_7",
60
+ "video_file": "feedback.mp4",
61
+ "start": 21.456,
62
+ "end": 23.078,
63
+ "text": "You said it correctly.",
64
+ "category": "success"
65
+ },
66
+ {
67
+ "id": "feedback.mp4_sent_8",
68
+ "video_file": "feedback.mp4",
69
+ "start": 24.22,
70
+ "end": 26.463,
71
+ "text": "Your pronunciation is clear.",
72
+ "category": "success"
73
+ },
74
+ {
75
+ "id": "feedback.mp4_sent_9",
76
+ "video_file": "feedback.mp4",
77
+ "start": 27.585,
78
+ "end": 28.586,
79
+ "text": "Great job!",
80
+ "category": "success"
81
+ },
82
+ {
83
+ "id": "feedback.mp4_sent_10",
84
+ "video_file": "feedback.mp4",
85
+ "start": 28.989,
86
+ "end": 30.631,
87
+ "text": "Move to the next word",
88
+ "category": "success"
89
+ },
90
+ {
91
+ "id": "feedback.mp4_sent_11",
92
+ "video_file": "feedback.mp4",
93
+ "start": 30.571,
94
+ "end": 33.514,
95
+ "text": "Focus on the vowel sound",
96
+ "category": "vowel"
97
+ },
98
+ {
99
+ "id": "feedback.mp4_sent_12",
100
+ "video_file": "feedback.mp4",
101
+ "start": 33.454,
102
+ "end": 36.717,
103
+ "text": "Open your mouth a little more",
104
+ "category": "vowel"
105
+ },
106
+ {
107
+ "id": "feedback.mp4_sent_13",
108
+ "video_file": "feedback.mp4",
109
+ "start": 37.998,
110
+ "end": 40.441,
111
+ "text": "Say the word slowly once",
112
+ "category": "vowel"
113
+ },
114
+ {
115
+ "id": "feedback.mp4_sent_14",
116
+ "video_file": "feedback.mp4",
117
+ "start": 30.571,
118
+ "end": 44.105,
119
+ "text": "Focus on the first sound",
120
+ "category": "consonant"
121
+ },
122
+ {
123
+ "id": "feedback.mp4_sent_15",
124
+ "video_file": "feedback.mp4",
125
+ "start": 44.045,
126
+ "end": 47.388,
127
+ "text": "Make the consonant clear",
128
+ "category": "consonant"
129
+ },
130
+ {
131
+ "id": "feedback.mp4_sent_16",
132
+ "video_file": "feedback.mp4",
133
+ "start": 47.328,
134
+ "end": 50.812,
135
+ "text": "Repeat the word slowly",
136
+ "category": "consonant"
137
+ },
138
+ {
139
+ "id": "feedback.mp4_sent_17",
140
+ "video_file": "feedback.mp4",
141
+ "start": 50.732,
142
+ "end": 54.075,
143
+ "text": "Do not stop early",
144
+ "category": "ending"
145
+ },
146
+ {
147
+ "id": "feedback.mp4_sent_18",
148
+ "video_file": "feedback.mp4",
149
+ "start": 37.998,
150
+ "end": 57.999,
151
+ "text": "Say the last sound clearly",
152
+ "category": "ending"
153
+ },
154
+ {
155
+ "id": "feedback.mp4_sent_19",
156
+ "video_file": "feedback.mp4",
157
+ "start": 59.145,
158
+ "end": 60.647,
159
+ "text": "Try the word again.",
160
+ "category": "ending"
161
+ },
162
+ {
163
+ "id": "feedback.mp4_sent_20",
164
+ "video_file": "feedback.mp4",
165
+ "start": 60.587,
166
+ "end": 64.873,
167
+ "text": "Say the strong part a little louder.",
168
+ "category": "stress"
169
+ },
170
+ {
171
+ "id": "feedback.mp4_sent_21",
172
+ "video_file": "feedback.mp4",
173
+ "start": 64.813,
174
+ "end": 69.018,
175
+ "text": "Keep the rest of the word smooth.",
176
+ "category": "stress"
177
+ },
178
+ {
179
+ "id": "feedback.mp4_sent_22",
180
+ "video_file": "feedback.mp4",
181
+ "start": 70.34,
182
+ "end": 72.863,
183
+ "text": "Try again with clear stress.",
184
+ "category": "stress"
185
+ },
186
+ {
187
+ "id": "feedback.mp4_sent_23",
188
+ "video_file": "feedback.mp4",
189
+ "start": 74.365,
190
+ "end": 76.328,
191
+ "text": "Break the word into parts.",
192
+ "category": "syllable"
193
+ },
194
+ {
195
+ "id": "feedback.mp4_sent_24",
196
+ "video_file": "feedback.mp4",
197
+ "start": 77.73,
198
+ "end": 80.013,
199
+ "text": "Say each part slowly.",
200
+ "category": "syllable"
201
+ },
202
+ {
203
+ "id": "feedback.mp4_sent_25",
204
+ "video_file": "feedback.mp4",
205
+ "start": 81.154,
206
+ "end": 83.177,
207
+ "text": "Then say the full word.",
208
+ "category": "syllable"
209
+ },
210
+ {
211
+ "id": "feedback.mp4_sent_26",
212
+ "video_file": "feedback.mp4",
213
+ "start": 84.525,
214
+ "end": 88.95,
215
+ "text": "Say only the target word without extra words.",
216
+ "category": "multipleword"
217
+ },
218
+ {
219
+ "id": "feedback.mp4_sent_27",
220
+ "video_file": "feedback.mp4",
221
+ "start": 89.311,
222
+ "end": 92.474,
223
+ "text": "Use a big a sound at the start.",
224
+ "category": "apple"
225
+ },
226
+ {
227
+ "id": "feedback.mp4_sent_28",
228
+ "video_file": "feedback.mp4",
229
+ "start": 93.275,
230
+ "end": 95.738,
231
+ "text": "Apple, not apple.",
232
+ "category": "apple"
233
+ },
234
+ {
235
+ "id": "feedback.mp4_sent_29",
236
+ "video_file": "feedback.mp4",
237
+ "start": 97.079,
238
+ "end": 103.226,
239
+ "text": "Open your mouth more for a, like apple, not apple.",
240
+ "category": "apple"
241
+ },
242
+ {
243
+ "id": "feedback.mp4_sent_30",
244
+ "video_file": "feedback.mp4",
245
+ "start": 104.548,
246
+ "end": 108.372,
247
+ "text": "Make a short o sound, not o.",
248
+ "category": "ball"
249
+ },
250
+ {
251
+ "id": "feedback.mp4_sent_31",
252
+ "video_file": "feedback.mp4",
253
+ "start": 109.173,
254
+ "end": 110.775,
255
+ "text": "Say ball.",
256
+ "category": "ball"
257
+ },
258
+ {
259
+ "id": "feedback.mp4_sent_32",
260
+ "video_file": "feedback.mp4",
261
+ "start": 110.695,
262
+ "end": 114.159,
263
+ "text": "Start with a strong b.",
264
+ "category": "ball"
265
+ },
266
+ {
267
+ "id": "feedback.mp4_sent_33",
268
+ "video_file": "feedback.mp4",
269
+ "start": 114.326,
270
+ "end": 114.987,
271
+ "text": "sound.",
272
+ "category": "ball"
273
+ },
274
+ {
275
+ "id": "feedback.mp4_sent_34",
276
+ "video_file": "feedback.mp4",
277
+ "start": 116.629,
278
+ "end": 118.572,
279
+ "text": "I could not hear your voice.",
280
+ "category": "silence"
281
+ },
282
+ {
283
+ "id": "feedback.mp4_sent_35",
284
+ "video_file": "feedback.mp4",
285
+ "start": 119.233,
286
+ "end": 121.837,
287
+ "text": "Please hold the record button and say the word.",
288
+ "category": "silence"
289
+ },
290
+ {
291
+ "id": "feedback.mp4_sent_36",
292
+ "video_file": "feedback.mp4",
293
+ "start": 121.777,
294
+ "end": 124.4,
295
+ "text": "It was very quiet.",
296
+ "category": "silence"
297
+ },
298
+ {
299
+ "id": "feedback.mp4_sent_37",
300
+ "video_file": "feedback.mp4",
301
+ "start": 125.081,
302
+ "end": 127.425,
303
+ "text": "Speak a little louder and try again.",
304
+ "category": "silence"
305
+ },
306
+ {
307
+ "id": "feedback.mp4_sent_38",
308
+ "video_file": "feedback.mp4",
309
+ "start": 127.345,
310
+ "end": 130.329,
311
+ "text": "No sound was recorded.",
312
+ "category": "silence"
313
+ },
314
+ {
315
+ "id": "feedback.mp4_sent_39",
316
+ "video_file": "feedback.mp4",
317
+ "start": 130.95,
318
+ "end": 133.594,
319
+ "text": "Check your microphone and say the word again.",
320
+ "category": "silence"
321
+ },
322
+ {
323
+ "id": "feedback.mp4_sent_40",
324
+ "video_file": "feedback.mp4",
325
+ "start": 133.514,
326
+ "end": 136.177,
327
+ "text": "I think you whispered.",
328
+ "category": "silence"
329
+ },
330
+ {
331
+ "id": "feedback.mp4_sent_41",
332
+ "video_file": "feedback.mp4",
333
+ "start": 136.958,
334
+ "end": 139.522,
335
+ "text": "Use your clear classroom voice.",
336
+ "category": "silence"
337
+ }
338
+ ]
verification.py CHANGED
@@ -496,6 +496,8 @@ from findingword import finding_bp
496
  from listen import listen_bp
497
  from ragg.app import rag_bp
498
  from pron import pron_bp
 
 
499
  from ragg.ingest_trigger import ingest_trigger_bp
500
  app.register_blueprint(movie_bp, url_prefix="/media")
501
  app.register_blueprint(questions_bp, url_prefix="/media")
@@ -507,6 +509,8 @@ app.register_blueprint(listen_bp, url_prefix="/media")
507
  app.register_blueprint(rag_bp, url_prefix="/rag")
508
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
509
  app.register_blueprint(pron_bp, url_prefix="/pron")
 
 
510
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
511
  # ------------------------------------------------------------------------------
512
  # Local run (Gunicorn will import `verification:app` on Spaces)
 
496
  from listen import listen_bp
497
  from ragg.app import rag_bp
498
  from pron import pron_bp
499
+ from pronvideo import pronvideo_bp
500
+ from pronragg import pronragg_bp
501
  from ragg.ingest_trigger import ingest_trigger_bp
502
  app.register_blueprint(movie_bp, url_prefix="/media")
503
  app.register_blueprint(questions_bp, url_prefix="/media")
 
509
  app.register_blueprint(rag_bp, url_prefix="/rag")
510
  app.register_blueprint(ingest_trigger_bp, url_prefix="/rag")
511
  app.register_blueprint(pron_bp, url_prefix="/pron")
512
+ app.register_blueprint(pronvideo_bp, url_prefix="/pronvideo")
513
+ app.register_blueprint(pronragg_bp, url_prefix="/pronragg")
514
  # app.register_blueprint(questions_bp, url_prefix="/media") # <-- add this
515
  # ------------------------------------------------------------------------------
516
  # Local run (Gunicorn will import `verification:app` on Spaces)