Spaces:
Runtime error
Runtime error
Upload 4 files
Browse files- asr_api.py +20 -0
- llm.py +64 -0
- phonetics.py +37 -0
- user_data.py +28 -0
asr_api.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import requests

# Read once at import time; an empty value makes transcribe_with_openrouter
# return a bracketed placeholder string instead of calling the API.
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY", "")
|
| 5 |
+
|
| 6 |
+
def transcribe_with_openrouter(audio_path):
    """Transcribe audio using OpenRouter API (speech-to-text).

    Parameters
    ----------
    audio_path : str
        Path to the audio file to upload.

    Returns
    -------
    str
        The transcribed text, or a bracketed diagnostic string when the API
        key is missing or the request fails (callers always get a string,
        never an exception).
    """
    if not OPENROUTER_API_KEY:
        return "[No OpenRouter API key set]"
    url = "https://openrouter.ai/api/v1/audio/transcriptions"
    headers = {"Authorization": f"Bearer {OPENROUTER_API_KEY}"}
    data = {"model": "whisper-large-v3"}
    try:
        # Open via a context manager so the handle is closed even when the
        # request raises; the original leaked the file object on every call.
        with open(audio_path, "rb") as audio_file:
            files = {"file": audio_file}
            response = requests.post(url, headers=headers, files=files, data=data, timeout=60)
        response.raise_for_status()
        result = response.json()
        return result.get("text", "")
    except Exception as e:
        # Broad catch is deliberate: this API returns a string in all cases.
        return f"[ASR API error: {e}]"
|
llm.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 2 |
+
from user_data import load_user_data, save_user_data
|
| 3 |
+
from phonetics import analyze_audio_phonetically, extract_phonemes
|
| 4 |
+
|
| 5 |
+
# Feedback model. NOTE(review): loading happens at import time, which blocks
# module import until the weights are downloaded — confirm this fits the
# Space's startup budget.
model_name = "BeastGokul/Nika-1.5B"
llm_tokenizer = AutoTokenizer.from_pretrained(model_name)
llm_model = AutoModelForCausalLM.from_pretrained(model_name)

# System prompt prepended to every request; constrains the assistant to
# short, learner-friendly pronunciation coaching.
SYSTEM_PROMPT = """You are a specialized pronunciation assistant for non-native English speakers.\nYour job is to provide targeted, actionable feedback based on the user's speech or description.\n\nWhen analyzing pronunciation:\n1. Identify at most 2 specific phonemes or pronunciation patterns that need improvement\n2. Explain how the sound is correctly formed (tongue position, lip movement, etc.)\n3. Suggest one simple, targeted exercise for practice\n4. Be encouraging and note any improvements from previous sessions\n5. Use simple language appropriate for language learners\n\nWhen provided with phonetic analysis data, incorporate this information into your feedback.\n"""
|
| 10 |
+
|
| 11 |
+
def get_llm_feedback(audio=None, text=None, reference_text=None, user_id="default", transcribe_func=None):
    """Generate pronunciation feedback from audio and/or text input.

    Parameters
    ----------
    audio : optional
        Raw audio input; when given it is saved, optionally transcribed, and
        analyzed phonetically.
    text : str, optional
        What the user said (skips transcription when provided).
    reference_text : str, optional
        What the user was trying to say.
    user_id : str
        Key used to load/store per-user progress.
    transcribe_func : callable, optional
        ``f(audio_path) -> str`` used to transcribe when ``text`` is absent.

    Returns
    -------
    tuple[str, str | None]
        ``(feedback_response, transcribed_or_given_text)``.
    """
    user_data = load_user_data(user_id)
    # Process audio if provided
    if audio:
        # Local import to avoid a circular dependency at module load time.
        from user_data import save_audio
        audio_path = save_audio(audio, user_id)
        # Transcribe if no text was provided
        if not text and transcribe_func:
            text = transcribe_func(audio_path)
        # Get phonetic analysis
        phonetic_analysis = analyze_audio_phonetically(audio_path, reference_text)
        phonetic_info = f"""
Phonetic analysis:\n- Detected phonemes: {phonetic_analysis['detected_phonemes']}\n"""
        if reference_text:
            phonetic_info += f"- Reference phonemes: {phonetic_analysis.get('reference_phonemes', 'N/A')}\n"
    else:
        audio_path = None
        phonetic_info = ""
    # Get user history context
    history_context = ""
    if user_data["practice_sessions"]:
        phoneme_counts = {p: data["practice_count"] for p, data in user_data["phoneme_progress"].items()}
        # Most-practiced phonemes are treated as the most challenging ones.
        challenging = sorted(phoneme_counts.items(), key=lambda x: x[1], reverse=True)[:3]
        history_context = f"""
User has practiced {len(user_data['practice_sessions'])} times before.\nCommon challenging phonemes: {', '.join([p for p, _ in challenging])}.\n"""
    # Build prompt for LLM
    if text:
        user_input = f"I said: '{text}'"
        if reference_text and reference_text != text:
            user_input += f". I was trying to say: '{reference_text}'"
    else:
        user_input = "Please analyze my pronunciation."
    full_prompt = f"""{SYSTEM_PROMPT}\n\nUser history:\n{history_context}\n\n{phonetic_info}\n\nUser: {user_input}\n"""
    # Get LLM response
    inputs = llm_tokenizer(full_prompt, return_tensors="pt").to(llm_model.device)
    import torch
    with torch.no_grad():
        outputs = llm_model.generate(
            **inputs,
            max_new_tokens=200,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )
    response = llm_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the last "Assistant: " marker, if any.
    # str.split never raises here, so the original bare ``except: pass``
    # (which would also have swallowed KeyboardInterrupt) was removed.
    response = response.split("Assistant: ")[-1].strip()
    # Track the session if audio was provided
    if audio_path:
        from user_data import track_practice_session
        track_practice_session(user_id, audio_path, text, reference_text, response)
    return response, text
|
phonetics.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import librosa
|
| 3 |
+
from phonemizer import phonemize
|
| 4 |
+
|
| 5 |
+
def extract_phonemes(text):
    """Return the espeak phoneme string for *text* (US English)."""
    phoneme_string = phonemize(text, language='en-us', backend='espeak', strip=True)
    return phoneme_string
|
| 8 |
+
|
| 9 |
+
def analyze_audio_phonetically(audio_path, reference_text=None, wav2vec_processor=None, wav2vec_model=None):
    """Phonetic analysis of an audio file, optionally compared to reference text.

    Requires a locally loaded wav2vec processor/model pair; when either is
    missing, a placeholder result is returned instead of raising.
    """
    if not (wav2vec_processor and wav2vec_model):
        return {"detected_phonemes": "[Phoneme analysis not available]"}
    import torch
    # Resample to 16 kHz, the rate the wav2vec models were trained on.
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    model_inputs = wav2vec_processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        logits = wav2vec_model(model_inputs.input_values).logits
    best_ids = torch.argmax(logits, dim=-1)
    decoded_phonemes = wav2vec_processor.batch_decode(best_ids)[0]
    result = {"detected_phonemes": decoded_phonemes}
    if reference_text:
        result["reference_phonemes"] = extract_phonemes(reference_text)
        result["analysis"] = "Phoneme comparison would be performed here"
    return result
|
| 26 |
+
|
| 27 |
+
def extract_pronunciation_embedding(audio_path, hubert_processor=None, hubert_model=None):
    """Mean-pooled HuBERT hidden-state embedding for an utterance.

    Returns ``None`` when no local HuBERT processor/model pair is supplied.
    """
    if not (hubert_model and hubert_processor):
        return None
    import torch
    waveform, sample_rate = librosa.load(audio_path, sr=16000)
    model_inputs = hubert_processor(waveform, sampling_rate=16000, return_tensors="pt")
    with torch.no_grad():
        model_outputs = hubert_model(**model_inputs)
    # Average over the time axis to get one fixed-size vector per clip.
    pooled = model_outputs.last_hidden_state.mean(dim=1)
    return pooled.cpu().numpy()
|
user_data.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from datetime import datetime
|
| 4 |
+
|
| 5 |
+
def get_user_data_path(user_id="default"):
    """Build the on-disk JSON path holding one user's saved data."""
    filename = user_id + "_data.json"
    return "user_data/" + filename
|
| 7 |
+
|
| 8 |
+
def load_user_data(user_id="default"):
    """Load a user's saved progress from disk, or return a fresh profile.

    Returns the parsed JSON dict when the user's data file exists; otherwise
    an empty default structure (this function never writes to disk).
    """
    file_path = get_user_data_path(user_id)
    if os.path.exists(file_path):
        with open(file_path, "r") as f:
            return json.load(f)
    # No saved file yet: hand back an empty profile skeleton.
    fresh_profile = {
        "native_language": "",
        "challenge_sounds": [],
        "practice_count": 0,
        "joined_date": datetime.now().strftime("%Y-%m-%d"),
    }
    return {
        "profile": fresh_profile,
        "practice_sessions": [],
        "phoneme_progress": {},
        "word_progress": {},
        "goals": [],
    }
|
| 25 |
+
|
| 26 |
+
def save_user_data(data, user_id="default"):
    """Persist *data* as pretty-printed JSON for the given user.

    Creates the ``user_data`` directory on first use; the original raised
    FileNotFoundError when the directory was missing, and nothing else in
    this module ever creates it.
    """
    file_path = get_user_data_path(user_id)
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    with open(file_path, "w") as f:
        json.dump(data, f, indent=2)
|