"""PANINI LLM — multi-model language tutoring app (Gradio UI).

Pipeline: an LLM (via the Hugging Face Inference API) builds a lesson,
edge-tts produces a native-speaker reference, Whisper transcribes the
learner's recording, and phonemizer/espeak renders both target and
attempt as IPA so the LLM can give anatomical pronunciation feedback.
"""

import os
import asyncio
import edge_tts
import librosa   # NOTE(review): librosa/torch/numpy/pandas/re appear unused here — confirm before removing
import torch
import numpy as np
import pandas as pd
import re
import gradio as gr
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient

# --- AUTHENTICATION ---
HF_TOKEN = os.getenv("HF_TOKEN")

# --- CONFIGURATION ---
# We use 3B to 9B models because they are the most stable on the free Inference API.
LLM_MODELS = {
    "Llama 3.2 3B (Fastest)": "meta-llama/Llama-3.2-3B-Instruct",
    "Qwen 2.5 7B (Most Accurate)": "Qwen/Qwen2.5-7B-Instruct",
    "Gemma 2 9B (Excellent English)": "google/gemma-2-9b-it",
}

# Per-language settings: BCP-47 code, espeak-ng phonemizer language id, edge-tts voice.
LANGUAGES = {
    "English (US)": {"code": "en-US", "ipa": "en-us", "voice": "en-US-ChristopherNeural"},
    "German": {"code": "de-DE", "ipa": "de", "voice": "de-DE-ConradNeural"},
    "French": {"code": "fr-FR", "ipa": "fr-fr", "voice": "fr-FR-HenriNeural"},
    "Spanish": {"code": "es-ES", "ipa": "es", "voice": "es-ES-AlvaroNeural"},
    "Chinese (Mandarin)": {"code": "zh-CN", "ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
}

# Load ASR model (Whisper Tiny for CPU efficiency); device=-1 pins the pipeline to CPU.
print("Loading Whisper ASR...")
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)


# --- FUNCTIONS ---
def get_llm_response(model_id, system_prompt, user_prompt):
    """Run one chat completion against the HF Inference API.

    Args:
        model_id: Hub repo id of the instruct model (a value of LLM_MODELS).
        system_prompt: system-role instruction for the model.
        user_prompt: user-role message.

    Returns:
        The assistant's reply text, or a human-readable error string
        (this function never raises — the UI renders whatever it returns).
    """
    # Fixed: Removed the 'provider' argument to prevent TypeError
    client = InferenceClient(model=model_id, token=HF_TOKEN)
    try:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        output = client.chat_completion(
            messages,
            max_tokens=500,
            stream=False,
        )
        return output.choices[0].message.content
    except Exception as e:  # boundary: surface any API failure as UI text
        err = str(e)
        if "503" in err:
            # 503 means the model container is cold-starting on HF's side.
            return "⏳ The model is currently loading on Hugging Face servers. Please wait 30 seconds and try again."
        return f"PANINI LLM Note: {err}"


def generate_curriculum(model_name, language, topic):
    """Build a short lesson plan for `topic` in `language` with the chosen LLM."""
    model_id = LLM_MODELS[model_name]
    # Reconstructed string literal (was split across lines in the mangled source).
    system_prompt = f"You are PANINI LLM, a world-class {language} teacher. Create a focused lesson plan."
    user_prompt = f"Topic: {topic}. Provide 5 useful words/phrases in {language} with English translations, then give one expert learning tip."
    return get_llm_response(model_id, system_prompt, user_prompt)


async def play_target_audio(text, lang_name):
    """Synthesize `text` with the language's edge-tts voice; return the mp3 path.

    Returns None when there is nothing to speak.
    """
    if not text:
        return None
    voice = LANGUAGES[lang_name]["voice"]
    output_path = "target.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(output_path)
    return output_path


def analyze_speech(model_name, lang_name, target_text, audio_path):
    """Compare the learner's recording against the target phrase.

    Returns a 3-tuple matching the UI outputs:
        (ASR transcript, user IPA string, LLM coaching feedback).
    """
    if not audio_path or not target_text:
        return "Incomplete data.", "", "Please provide both text and recording."

    # 1. ASR Transcription
    asr_res = asr_pipe(audio_path)["text"].strip()

    # 2. Linguistic IPA Layer
    ipa_code = LANGUAGES[lang_name]["ipa"]
    try:
        # Requires espeak-ng installed via packages.txt
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:  # narrowed from bare except: best-effort, IPA is optional
        target_ipa = "IPA Unavailable"
        user_ipa = "IPA Unavailable"

    # 3. LLM Anatomical Feedback
    model_id = LLM_MODELS[model_name]
    system_prompt = "You are a professional Speech-Language Pathologist. Compare the student's pronunciation to the target using IPA."
    user_prompt = (
        f"Target: '{target_text}' (IPA: /{target_ipa}/). "
        f"Student: '{asr_res}' (IPA: /{user_ipa}/). "
        f"Identify the primary phonetic error and give 1 specific anatomical tip (tongue/lip placement) in English."
    )
    feedback = get_llm_response(model_id, system_prompt, user_prompt)

    return asr_res, f"/{user_ipa}/", feedback


# --- UI DESIGN ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="slate"),
               css=".gradio-container {max-width: 950px !important}") as demo:
    # Reconstructed header (the original HTML string was split across lines;
    # its markup tags were lost in the mangled source — TODO confirm styling).
    gr.HTML(
        "<h1 style='text-align:center'>PANINI LLM</h1>"
        "<p style='text-align:center'>Intelligent Multi-Model Language Tutoring</p>"
    )

    with gr.Tab("Step 1: Curriculum Creation"):
        with gr.Row():
            llm_choice = gr.Dropdown(list(LLM_MODELS.keys()), label="Select AI Teacher (LLM)", value="Qwen 2.5 7B (Most Accurate)")
            lang_choice = gr.Dropdown(list(LANGUAGES.keys()), label="Language", value="English (US)")
        topic_input = gr.Textbox(label="Lesson Topic", placeholder="e.g., Ordering Food, Job Interview, Airport Travel")
        btn_gen = gr.Button("📚 Build My Lesson", variant="primary")
        curr_output = gr.Markdown("---")

    with gr.Tab("Step 2: Pronunciation Practice"):
        with gr.Row():
            target_word = gr.Textbox(label="Word/Phrase to Practice", placeholder="Copy a phrase from Step 1 here")
            btn_tts = gr.Button("🔊 Play Native AI", scale=0)
        audio_ref = gr.Audio(label="Teacher Reference", type="filepath")
        with gr.Row():
            audio_user = gr.Audio(label="Your Voice Recording", sources=["microphone"], type="filepath")
        btn_analyze = gr.Button("🚀 Analyze My Accent", variant="primary")
        with gr.Row():
            out_transcript = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your Phonetics (IPA)")
        out_feedback = gr.Markdown("### Feedback from the AI Coach")

    # Event Wireup
    btn_gen.click(generate_curriculum, inputs=[llm_choice, lang_choice, topic_input], outputs=curr_output)
    # play_target_audio is a coroutine; bridge it into Gradio's sync callback via asyncio.run.
    btn_tts.click(fn=lambda t, l: asyncio.run(play_target_audio(t, l)),
                  inputs=[target_word, lang_choice], outputs=audio_ref)
    btn_analyze.click(analyze_speech,
                      inputs=[llm_choice, lang_choice, target_word, audio_user],
                      outputs=[out_transcript, out_ipa, out_feedback])

# Run app (guarded so importing this module does not start the server)
if __name__ == "__main__":
    demo.launch()