import os
import asyncio
import edge_tts
import librosa
import torch
import time
import random
import numpy as np
import gradio as gr
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient
from datasets import load_dataset

# --- CONFIG & MODELS ---
HF_TOKEN = os.getenv("HF_TOKEN")
model_vision = YOLOWorld('yolov8s-world.pt')
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

# Initialise COCO dataset streaming
print("Initialising COCO Dataset streaming...")
try:
    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
    ds_iter = iter(ds)
except Exception as e:
    print(f"Dataset init failed: {e}")
    ds_iter = None

LANG_CONFIG = {
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
}

# --- FUNCTIONS ---
def get_random_coco_image():
    global ds_iter
    try:
        if ds_iter is None:
            raise ValueError("Dataset not ready")
        # Skip ahead 1-3 samples so repeated clicks don't return the same image
        for _ in range(random.randint(1, 3)):
            sample = next(ds_iter)
        return sample['image']
    except Exception:
        # Fall back to a fixed COCO validation image if streaming fails
        return "http://images.cocodataset.org/val2017/000000000632.jpg"


def scan_scene(img, lang_name, custom_tags):
    if img is None:
        return None, "Please get a scene first.", []

    # 1. SET VOCABULARY (open-vocabulary feature)
    if custom_tags and custom_tags.strip():
        # User-defined search
        classes = [x.strip() for x in custom_tags.split(",")]
    else:
        # General discovery mode
        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase",
                   "sink", "refrigerator", "oven", "car", "person", "tree", "backpack",
                   "clock", "dog", "cat"]
    model_vision.set_classes(classes)

    # 2. PREDICT
    results = model_vision.predict(img, conf=0.25)
    annotated_img = results[0].plot()[..., ::-1]  # BGR to RGB

    # 3. EXTRACT AND TRANSLATE
    eng_labels = list(set(model_vision.names[int(box.cls)] for box in results[0].boxes))
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    trans_map = {}
    if eng_labels:
        # Prompt the LLM to produce a translation dictionary
        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
        try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
            # Parse pairs like 'table:der Tisch'; split on the first ':' only,
            # in case a translation itself contains a colon
            for pair in res_text.split(","):
                if ":" in pair:
                    eng, trans = pair.split(":", 1)
                    trans_map[eng.strip().lower()] = trans.strip()
        except Exception as e:
            print(f"Translation Error: {e}")
            trans_map = {lbl.lower(): lbl for lbl in eng_labels}
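    # Illustrative note: the exact reply format depends on the model, but a
    # typical German response might look something like
    #   "cup:die Tasse, chair:der Stuhl, laptop:der Laptop"
    # which the loop above turns into
    #   {"cup": "die Tasse", "chair": "der Stuhl", "laptop": "der Laptop"}.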
    # 4. MAP DETECTIONS (link each box to its translated word)
    detections = []
    for box in results[0].boxes:
        eng_label = model_vision.names[int(box.cls)].lower()
        translated_label = trans_map.get(eng_label, eng_label)
        coords = box.xyxy[0].tolist()
        detections.append({"translated": translated_label, "english": eng_label, "box": coords})

    vocab_display = ", ".join(trans_map.values())
    return annotated_img, vocab_display, detections


def on_image_click(evt: gr.SelectData, detections):
    """Triggered when the user clicks an object in the annotated image."""
    if not detections:
        return "Scan the image first!", ""
    click_x, click_y = evt.index
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        # Check if the click point falls inside this detection box
        if x1 <= click_x <= x2 and y1 <= click_y <= y2:
            translated_word = det['translated']
            return f"🎯 Selected: **{translated_word}** ({det['english']})", translated_word
    return "💡 Click directly inside a colored box!", ""


async def tts_task(text, lang_name):
    if not text:
        return None
    voice = LANG_CONFIG[lang_name]["voice"]
    path = f"speech_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path


def run_feedback(target, lang_name, audio_path):
    if not audio_path or not target:
        return "Select a word and record audio.", "", ""
    asr_res = asr_pipe(audio_path)["text"].strip()
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        t_ipa, u_ipa = "N/A", "N/A"
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
        return asr_res, f"/{u_ipa}/", fb
    except Exception:
        return asr_res, f"/{u_ipa}/", "Coach is busy."


# --- UI ---
CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"

with gr.Blocks(css=CSS) as demo:
    gr.HTML("