Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -14,13 +14,12 @@ from huggingface_hub import InferenceClient
|
|
| 14 |
# --- AUTHENTICATION ---
|
| 15 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 16 |
|
| 17 |
-
# --- CONFIGURATION
|
| 18 |
-
#
|
| 19 |
LLM_MODELS = {
|
| 20 |
-
"Llama 3.2 3B (
|
| 21 |
-
"Qwen 2.5 7B (
|
| 22 |
-
"Gemma 2 9B (
|
| 23 |
-
"Llama 3.3 70B (Powerhouse - Busy)": "meta-llama/Llama-3.3-70B-Instruct"
|
| 24 |
}
|
| 25 |
|
| 26 |
LANGUAGES = {
|
|
@@ -31,41 +30,38 @@ LANGUAGES = {
|
|
| 31 |
"Chinese (Mandarin)": {"code": "zh-CN", "ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"}
|
| 32 |
}
|
| 33 |
|
| 34 |
-
# Load ASR model (Whisper Tiny
|
| 35 |
print("Loading Whisper ASR...")
|
| 36 |
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
|
| 37 |
|
| 38 |
# --- FUNCTIONS ---
|
| 39 |
|
| 40 |
def get_llm_response(model_id, system_prompt, user_prompt):
|
|
|
|
| 41 |
client = InferenceClient(model=model_id, token=HF_TOKEN)
|
| 42 |
try:
|
| 43 |
-
response = ""
|
| 44 |
messages = [
|
| 45 |
{"role": "system", "content": system_prompt},
|
| 46 |
{"role": "user", "content": user_prompt}
|
| 47 |
]
|
| 48 |
|
| 49 |
-
# We allow the router to find the best provider automatically for better stability
|
| 50 |
output = client.chat_completion(
|
| 51 |
messages,
|
| 52 |
max_tokens=500,
|
| 53 |
-
stream=False
|
| 54 |
)
|
| 55 |
return output.choices[0].message.content
|
| 56 |
|
| 57 |
except Exception as e:
|
| 58 |
-
|
| 59 |
-
if "
|
| 60 |
-
return "
|
| 61 |
-
|
| 62 |
-
return "β³ The model is currently 'waking up' or busy. Please wait 30 seconds and try again."
|
| 63 |
-
return f"System Note: {error_str}"
|
| 64 |
|
| 65 |
def generate_curriculum(model_name, language, topic):
|
| 66 |
model_id = LLM_MODELS[model_name]
|
| 67 |
-
system_prompt = f"You are PANINI LLM, a
|
| 68 |
-
user_prompt = f"Topic: {topic}. Provide 5 words/phrases with English translations
|
| 69 |
return get_llm_response(model_id, system_prompt, user_prompt)
|
| 70 |
|
| 71 |
async def play_target_audio(text, lang_name):
|
|
@@ -78,66 +74,69 @@ async def play_target_audio(text, lang_name):
|
|
| 78 |
|
| 79 |
def analyze_speech(model_name, lang_name, target_text, audio_path):
|
| 80 |
if not audio_path or not target_text:
|
| 81 |
-
return "Incomplete data.", "", "
|
| 82 |
|
| 83 |
-
# 1. Transcription
|
| 84 |
asr_res = asr_pipe(audio_path)["text"].strip()
|
| 85 |
|
| 86 |
-
# 2.
|
| 87 |
ipa_code = LANGUAGES[lang_name]["ipa"]
|
| 88 |
try:
|
|
|
|
| 89 |
target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
|
| 90 |
user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
|
| 91 |
except:
|
| 92 |
target_ipa = "IPA Unavailable"
|
| 93 |
user_ipa = "IPA Unavailable"
|
| 94 |
|
| 95 |
-
# 3. LLM
|
| 96 |
model_id = LLM_MODELS[model_name]
|
| 97 |
-
system_prompt = "You are
|
| 98 |
user_prompt = (
|
| 99 |
f"Target: '{target_text}' (IPA: /{target_ipa}/). "
|
| 100 |
f"Student: '{asr_res}' (IPA: /{user_ipa}/). "
|
| 101 |
-
f"Identify the primary error and give
|
| 102 |
)
|
| 103 |
|
| 104 |
feedback = get_llm_response(model_id, system_prompt, user_prompt)
|
| 105 |
return asr_res, f"/{user_ipa}/", feedback
|
| 106 |
|
| 107 |
-
# --- UI ---
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
gr.HTML("<
|
|
|
|
| 111 |
|
| 112 |
-
with gr.Tab("Step 1: Curriculum"):
|
| 113 |
with gr.Row():
|
| 114 |
-
llm_choice = gr.Dropdown(list(LLM_MODELS.keys()), label="Select AI Teacher", value="Qwen 2.5 7B (
|
| 115 |
lang_choice = gr.Dropdown(list(LANGUAGES.keys()), label="Language", value="English (US)")
|
| 116 |
|
| 117 |
-
topic_input = gr.Textbox(label="
|
| 118 |
-
btn_gen = gr.Button("π
|
| 119 |
curr_output = gr.Markdown("---")
|
| 120 |
|
| 121 |
-
with gr.Tab("Step 2: Pronunciation"):
|
| 122 |
with gr.Row():
|
| 123 |
-
target_word = gr.Textbox(label="
|
| 124 |
-
btn_tts = gr.Button("π
|
| 125 |
|
| 126 |
-
audio_ref = gr.Audio(label="
|
| 127 |
|
| 128 |
with gr.Row():
|
| 129 |
-
audio_user = gr.Audio(label="
|
| 130 |
-
btn_analyze = gr.Button("π Analyze Accent", variant="primary")
|
| 131 |
|
| 132 |
with gr.Row():
|
| 133 |
-
out_transcript = gr.Textbox(label="
|
| 134 |
-
out_ipa = gr.Textbox(label="Your
|
| 135 |
|
| 136 |
-
out_feedback = gr.Markdown("
|
| 137 |
|
| 138 |
-
# Event
|
| 139 |
btn_gen.click(generate_curriculum, inputs=[llm_choice, lang_choice, topic_input], outputs=curr_output)
|
| 140 |
btn_tts.click(fn=lambda t, l: asyncio.run(play_target_audio(t, l)), inputs=[target_word, lang_choice], outputs=audio_ref)
|
| 141 |
btn_analyze.click(analyze_speech, inputs=[llm_choice, lang_choice, target_word, audio_user], outputs=[out_transcript, out_ipa, out_feedback])
|
| 142 |
|
|
|
|
| 143 |
demo.launch()
|
|
|
|
| 14 |
# --- AUTHENTICATION ---
# Access token for the Hugging Face Inference API, taken from the environment.
# Evaluates to None when the variable is not set.
HF_TOKEN = os.environ.get("HF_TOKEN")

# --- CONFIGURATION ---
# Maps the label shown in the model dropdown to the Hugging Face repo id.
# Only 3B-9B models are listed because they are the most stable on the free
# Inference API.
LLM_MODELS = {
    "Llama 3.2 3B (Fastest)": "meta-llama/Llama-3.2-3B-Instruct",
    "Qwen 2.5 7B (Most Accurate)": "Qwen/Qwen2.5-7B-Instruct",
    "Gemma 2 9B (Excellent English)": "google/gemma-2-9b-it",
}
|
| 24 |
|
| 25 |
LANGUAGES = {
|
|
|
|
| 30 |
"Chinese (Mandarin)": {"code": "zh-CN", "ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"}
|
| 31 |
}
|
| 32 |
|
| 33 |
+
# Build the speech-recognition pipeline once, at import time, so every request
# reuses it. Whisper Tiny is chosen for CPU efficiency (device=-1).
print("Loading Whisper ASR...")
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
    device=-1,
)
|
| 36 |
|
| 37 |
# --- FUNCTIONS ---
|
| 38 |
|
| 39 |
def get_llm_response(model_id, system_prompt, user_prompt):
    """Send one system + user exchange to a hosted chat model and return the reply.

    Args:
        model_id: Hugging Face model repo id handed to ``InferenceClient``.
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.

    Returns:
        The assistant's reply as a string (empty if the model returned no
        text). This function never raises: any failure is converted into a
        human-readable message, because callers hand the result straight to
        UI components.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    try:
        # The client is created inside the try so that a setup failure is
        # reported the same way as a failed request instead of escaping into
        # the UI callback.
        # `provider` is deliberately not passed: an earlier revision hit a
        # TypeError with it.
        client = InferenceClient(model=model_id, token=HF_TOKEN)
        output = client.chat_completion(
            messages,
            max_tokens=500,
            stream=False,
        )
        # `content` is optional in the response schema; callers expect a str.
        return output.choices[0].message.content or ""

    except Exception as e:
        err = str(e)
        # A 503 from the Inference API means the model is still being loaded.
        if "503" in err:
            return "⏳ The model is currently loading on Hugging Face servers. Please wait 30 seconds and try again."
        return f"PANINI LLM Note: {err}"
|
|
|
|
|
|
|
| 60 |
|
| 61 |
def generate_curriculum(model_name, language, topic):
    """Ask the selected model for a short lesson plan on ``topic``.

    Args:
        model_name: Key into ``LLM_MODELS`` (the label chosen in the dropdown).
        language: Language name interpolated into both prompts.
        topic: Lesson topic typed by the user.

    Returns:
        The string produced by ``get_llm_response`` (model reply or error
        note). When ``topic`` is empty or only whitespace, a short reminder is
        returned and no model call is made.

    Raises:
        KeyError: If ``model_name`` is not a key of ``LLM_MODELS``.
    """
    topic = (topic or "").strip()
    if not topic:
        # Same idea as the input check in analyze_speech: don't spend an API
        # call on a request that has nothing to work with.
        return "Please enter a lesson topic first."

    model_id = LLM_MODELS[model_name]
    system_prompt = f"You are PANINI LLM, a world-class {language} teacher. Create a focused lesson plan."
    user_prompt = f"Topic: {topic}. Provide 5 useful words/phrases in {language} with English translations, then give one expert learning tip."
    return get_llm_response(model_id, system_prompt, user_prompt)
|
| 66 |
|
| 67 |
async def play_target_audio(text, lang_name):
|
|
|
|
| 74 |
|
| 75 |
def analyze_speech(model_name, lang_name, target_text, audio_path):
    """Transcribe a recording, phonemize it, and ask the LLM for pronunciation feedback.

    Args:
        model_name: Key into ``LLM_MODELS``.
        lang_name: Key into ``LANGUAGES``; its ``"ipa"`` entry selects the
            phonemizer language.
        target_text: The phrase the student was asked to say.
        audio_path: Recording handed to the ASR pipeline.

    Returns:
        A 3-tuple of strings: the ASR transcript, the student's IPA wrapped in
        slashes, and the LLM feedback. When the recording or the target text is
        missing (or the text is only whitespace), a fixed "incomplete" tuple is
        returned and no model is called.

    Raises:
        KeyError: If ``lang_name`` or ``model_name`` is not a known key.
    """
    if not audio_path or not (target_text or "").strip():
        return "Incomplete data.", "", "Please provide both text and recording."

    # 1. ASR Transcription
    asr_res = asr_pipe(audio_path)["text"].strip()

    # 2. Linguistic IPA Layer
    ipa_code = LANGUAGES[lang_name]["ipa"]
    try:
        # Requires espeak-ng installed via packages.txt
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        # Best-effort: feedback is still generated without IPA. `Exception`
        # rather than a bare except, so KeyboardInterrupt/SystemExit propagate.
        target_ipa = "IPA Unavailable"
        user_ipa = "IPA Unavailable"

    # 3. LLM Anatomical Feedback
    model_id = LLM_MODELS[model_name]
    system_prompt = "You are a professional Speech-Language Pathologist. Compare the student's pronunciation to the target using IPA."
    user_prompt = (
        f"Target: '{target_text}' (IPA: /{target_ipa}/). "
        f"Student: '{asr_res}' (IPA: /{user_ipa}/). "
        "Identify the primary phonetic error and give 1 specific anatomical tip (tongue/lip placement) in English."
    )

    feedback = get_llm_response(model_id, system_prompt, user_prompt)
    return asr_res, f"/{user_ipa}/", feedback
|
| 103 |
|
| 104 |
+
# --- UI DESIGN ---
|
| 105 |
+
|
| 106 |
+
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a flattened view of the file (blank-line grouping) — confirm against
# the real app.py.
# NOTE(review): the leading characters in the heading and button labels look
# like mis-decoded emoji; they are reproduced exactly as found.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate"),
    css=".gradio-container {max-width: 950px !important}",
) as demo:
    # Page header.
    gr.HTML(value="<h1 style='text-align: center; color: #1e40af;'>ποΈ PANINI LLM</h1>")
    gr.HTML(value="<p style='text-align: center; margin-top: -10px;'>Intelligent Multi-Model Language Tutoring</p>")

    # Tab 1: pick a model and language, then generate a lesson for a topic.
    with gr.Tab(label="Step 1: Curriculum Creation"):
        with gr.Row():
            llm_choice = gr.Dropdown(
                choices=list(LLM_MODELS),
                label="Select AI Teacher (LLM)",
                value="Qwen 2.5 7B (Most Accurate)",
            )
            lang_choice = gr.Dropdown(
                choices=list(LANGUAGES),
                label="Language",
                value="English (US)",
            )

        topic_input = gr.Textbox(
            label="Lesson Topic",
            placeholder="e.g., Ordering Food, Job Interview, Airport Travel",
        )
        btn_gen = gr.Button(value="π Build My Lesson", variant="primary")
        curr_output = gr.Markdown(value="---")

    # Tab 2: listen to a reference, record an attempt, and get feedback.
    with gr.Tab(label="Step 2: Pronunciation Practice"):
        with gr.Row():
            target_word = gr.Textbox(
                label="Word/Phrase to Practice",
                placeholder="Copy a phrase from Step 1 here",
            )
            btn_tts = gr.Button(value="π Play Native AI", scale=0)

        audio_ref = gr.Audio(label="Teacher Reference", type="filepath")

        with gr.Row():
            audio_user = gr.Audio(
                label="Your Voice Recording",
                sources=["microphone"],
                type="filepath",
            )
            btn_analyze = gr.Button(value="π Analyze My Accent", variant="primary")

        with gr.Row():
            out_transcript = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your Phonetics (IPA)")

        out_feedback = gr.Markdown(value="### Feedback from the AI Coach")

    # Event wiring. Both tabs share the model and language dropdowns from tab 1.
    btn_gen.click(
        fn=generate_curriculum,
        inputs=[llm_choice, lang_choice, topic_input],
        outputs=curr_output,
    )
    # play_target_audio is a coroutine function; it is driven to completion
    # with asyncio.run inside a plain synchronous callback.
    btn_tts.click(
        fn=lambda text, lang: asyncio.run(play_target_audio(text, lang)),
        inputs=[target_word, lang_choice],
        outputs=audio_ref,
    )
    btn_analyze.click(
        fn=analyze_speech,
        inputs=[llm_choice, lang_choice, target_word, audio_user],
        outputs=[out_transcript, out_ipa, out_feedback],
    )

# Start the app.
demo.launch()
|