import os import asyncio import edge_tts import librosa import torch import numpy as np import pandas as pd import gradio as gr import re import time from PIL import Image from ultralytics import YOLOWorld from phonemizer import phonemize from transformers import pipeline from huggingface_hub import InferenceClient # --- INITIALIZATION --- HF_TOKEN = os.getenv("HF_TOKEN") # Load YOLO World (Small) model_vision = YOLOWorld('yolov8s-world.pt') # Whisper for ASR (Using tiny for speed) asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1) LANG_CONFIG = { "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"}, "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"}, "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"}, "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"}, "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"} } # --- VISION LOGIC --- def detect_objects(img, target_queries): if img is None: return None, "Please upload an image first." # 1. Reset/Set Vocabulary if target_queries and len(target_queries.strip()) > 0: classes = [x.strip() for x in target_queries.split(",")] else: # Balanced default list to prevent "bottle" bias classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", "sandwich", "orange", "broccoli", "carrot", "pizza", "donut", "cake", "chair", "couch", "potted plant", "bed", "dining table", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"] # Force YOLO to update its internal class list model_vision.set_classes(classes) # 2. Prediction (Slightly higher confidence to reduce noise) results = model_vision.predict(img, conf=0.4) # 3. Process Image annotated_img = results[0].plot() # Flip BGR to RGB annotated_img = annotated_img[..., ::-1] # 4. Extract Labels found_labels = [] for c in results[0].boxes.cls: found_labels.append(model_vision.names[int(c)]) label_list = ", ".join(list(set(found_labels))) if found_labels else "No objects found. Try adjusting 'Custom Tags'." return annotated_img, label_list # --- TRANSLATION --- def translate_labels(lang_name, labels_str): if not labels_str or "No objects" in labels_str: return "No objects detected to translate." client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN) prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return ONLY the translated words as a comma-separated list." try: output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200) return output.choices[0].message.content except Exception as e: return f"Translation Error: {str(e)}" # --- SPEECH LOGIC (FIXED) --- async def tts_core(text, lang_name): voice = LANG_CONFIG[lang_name]["voice"] # Use timestamp to prevent browser audio caching issues filename = f"ref_{int(time.time())}.mp3" communicate = edge_tts.Communicate(text, voice) await communicate.save(filename) return filename def handle_tts(text, lang_name): if not text: return None return asyncio.run(tts_core(text, lang_name)) def analyze_speech(lang_name, target_text, audio_path): if not audio_path or not target_text: return "Missing recording or target word.", "", "Please provide both." # ASR asr_res = asr_pipe(audio_path)["text"].strip() # IPA ipa_code = LANG_CONFIG[lang_name]["ipa"] try: target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True) user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True) except: target_ipa = "Error" user_ipa = "Error" # LLM Feedback client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN) prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. " "Identify the error and give 1 specific anatomical tip for tongue/lips in English.") try: fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150) feedback = fb.choices[0].message.content except: feedback = "Speech analysis busy. Try again." return asr_res, f"/{user_ipa}/", feedback # --- UI --- with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo: gr.HTML("