import os
import asyncio
import time

import edge_tts
import gradio as gr
from phonemizer import phonemize
from transformers import pipeline
from ultralytics import YOLOWorld
from huggingface_hub import InferenceClient
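# Runtime prerequisites beyond pip packages: phonemizer's espeak backend needs the
# espeak-ng system binary (packages.txt on Spaces), and edge-tts needs outbound
# network access to Microsoft's TTS endpoint.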
# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")
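# HF_TOKEN is read from the environment (a Space secret on Hugging Face);
# the InferenceClient calls below need it for reliable access to the hosted LLM.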
# Load YOLO World (Small)
model_vision = YOLOWorld("yolov8s-world.pt")

# Whisper for ASR (using tiny for speed)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
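# device=-1 pins the pipeline to CPU (the default free-Space hardware).
# whisper-tiny is multilingual, but accuracy on short single-word clips is limited.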
LANG_CONFIG = {
    "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
}
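# "ipa" is the espeak-ng language code consumed by phonemizer;
# "voice" is the edge-tts neural voice used for the reference audio.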
# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    if img is None:
        return None, "Please upload an image first."

    # 1. Reset/set the open vocabulary
    if target_queries and target_queries.strip():
        classes = [x.strip() for x in target_queries.split(",") if x.strip()]
    else:
        # Balanced default list to prevent "bottle" bias
        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase",
                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut",
                   "cake", "chair", "couch", "potted plant", "bed", "dining table",
                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]
    # Force YOLO to update its internal class list
    model_vision.set_classes(classes)

    # 2. Prediction (slightly higher confidence threshold to reduce noise)
    results = model_vision.predict(img, conf=0.4)

    # 3. Annotate: plot() returns a BGR array, so flip to RGB for display.
    #    copy() makes the flipped view contiguous, which PIL/Gradio expect.
    annotated_img = results[0].plot()[..., ::-1].copy()

    # 4. Extract the detected class names (deduplicated, stable order)
    found_labels = [model_vision.names[int(c)] for c in results[0].boxes.cls]
    if found_labels:
        label_list = ", ".join(sorted(set(found_labels)))
    else:
        label_list = "No objects found. Try adjusting 'Custom Tags'."
    return annotated_img, label_list
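# Example call (illustrative values; pil_image is any PIL.Image):
#   annotated, labels = detect_objects(pil_image, "laptop, mug")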
# --- TRANSLATION ---
def translate_labels(lang_name, labels_str):
    if not labels_str or "No objects" in labels_str:
        return "No objects detected to translate."
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"Translate these English object labels into {lang_name}: {labels_str}. "
              "Return ONLY the translated words as a comma-separated list.")
    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception as e:
        return f"Translation Error: {e}"
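# Example (LLM output is illustrative and may vary):
#   translate_labels("Spanish", "laptop, cup")  ->  "portátil, taza"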
# --- SPEECH LOGIC ---
async def tts_core(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    # Timestamped filename prevents the browser from replaying cached audio
    filename = f"ref_{int(time.time())}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)
    return filename
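# Synchronous wrapper so the Gradio click handler can drive the async TTS call.
# asyncio.run() is safe here because Gradio executes handlers in worker threads
# that have no running event loop. Note: the ref_*.mp3 files accumulate in the
# working directory; a periodic cleanup would be a reasonable addition.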
def handle_tts(text, lang_name):
    if not text:
        return None
    return asyncio.run(tts_core(text, lang_name))
def analyze_speech(lang_name, target_text, audio_path):
    if not audio_path or not target_text:
        return "Missing recording or target word.", "", "Please provide both."

    # ASR: hint Whisper with the target language (the first word of lang_name,
    # e.g. "Spanish" -> "spanish", is a valid Whisper language name for every
    # entry in LANG_CONFIG); autodetection is unreliable on single-word clips.
    whisper_lang = lang_name.split(" ")[0].lower()
    asr_res = asr_pipe(
        audio_path,
        generate_kwargs={"language": whisper_lang, "task": "transcribe"},
    )["text"].strip()

    # IPA via espeak-ng
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        target_ipa = phonemize(target_text, language=ipa_code, backend="espeak", strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend="espeak", strip=True)
    except Exception:
        target_ipa = "Error"
        user_ipa = "Error"

    # LLM feedback
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' "
              f"with IPA /{user_ipa}/. "
              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        feedback = fb.choices[0].message.content
    except Exception:
        feedback = "Speech analysis busy. Try again."
    return asr_res, f"/{user_ipa}/", feedback
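# Example flow (paths illustrative): analyze_speech("Spanish", "manzana", "/tmp/rec.wav")
# returns (what Whisper heard, its IPA rendering, the LLM coaching tip).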
# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center;'>PANINI Vision</h1>")

    with gr.Accordion("How to use (Instructions)", open=False):
        gr.Markdown("""
        ### 1. Vision Step
        * **Upload a photo** of your room or desk.
        * **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender` and the AI will look *specifically* for those items. If you leave it blank, a general list is used.
        * Click **Scan Environment**.

        ### 2. Translation & Speech Step
        * Select your **Target Language**.
        * Click **Translate Labels** to turn the English names into your learning language.
        * **Copy** one of those words into the 'Word to Practice' box.
        * **Listen** to the AI, then **Record** yourself to get feedback!
        """)

    with gr.Tab("1. Discover Objects"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Capture your world")
                target_tags = gr.Textbox(label="Target specific things? (Comma separated)",
                                         placeholder="e.g. guitar, plant, blue book")
                btn_scan = gr.Button("Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="AI Detection")
                detected_list = gr.Textbox(label="Detected Objects (English)")

    with gr.Tab("2. Language Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
            btn_trans = gr.Button("Translate Labels")
        vocab_output = gr.Textbox(label="Translated Vocabulary")
        with gr.Row():
            practice_word = gr.Textbox(label="Word to Practice")
            btn_listen = gr.Button("Listen", scale=0)
        audio_ref = gr.Audio(label="Native Reference", type="filepath")
        with gr.Row():
            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("Analyze Pronunciation", variant="primary")
        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your IPA")
        out_feedback = gr.Markdown()

    # --- BUTTON LOGIC ---
    btn_scan.click(detect_objects, inputs=[input_img, target_tags],
                   outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)
    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user],
                      outputs=[out_heard, out_ipa, out_feedback])

demo.launch()