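"""PANINI Vision: a Gradio demo that scans a photo for objects with YOLO-World,
translates the detected labels into a target language, and coaches pronunciation
with edge-tts reference audio, Whisper ASR, espeak IPA transcription, and an LLM.

Assumed runtime dependencies: gradio, edge-tts, ultralytics, phonemizer,
transformers, huggingface_hub, plus the espeak-ng system package (required by
phonemizer) and ffmpeg (used by the transformers ASR pipeline to decode audio).
"""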
import os
import asyncio
import time

import edge_tts
import gradio as gr
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient

# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")

# Load YOLO-World (small checkpoint; ultralytics downloads the weights on first run)
model_vision = YOLOWorld('yolov8s-world.pt')

# Whisper ASR (multilingual "tiny" checkpoint: fastest, least accurate; device=-1 = CPU)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

LANG_CONFIG = {
    "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"}
}
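
# The voice names above are standard edge-tts neural voices (others can be
# listed with `edge-tts --list-voices`); the "ipa" values are espeak-ng
# language codes consumed by phonemize() below.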

# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    if img is None:
        return None, "Please upload an image first."
        
    # 1. Build the detection vocabulary: custom tags if given, else a default list
    if target_queries and target_queries.strip():
        classes = [x.strip() for x in target_queries.split(",") if x.strip()]
    else:
        # Broad COCO-style default list; "bottle" is deliberately omitted because
        # the model tends to over-predict it
        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase", 
                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple", 
                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut", 
                   "cake", "chair", "couch", "potted plant", "bed", "dining table", 
                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone", 
                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]
    
    # Force YOLO to update its internal class list
    model_vision.set_classes(classes)

    # 2. Predict (conf=0.4, above the 0.25 default, to cut noisy detections)
    results = model_vision.predict(img, conf=0.4)
    
    # 3. Render the annotated image (Ultralytics plots in BGR; flip to RGB and
    # copy so downstream consumers get a contiguous array)
    annotated_img = results[0].plot()[..., ::-1].copy()
    
    # 4. Extract detected class names, de-duplicated in detection order
    found_labels = [model_vision.names[int(c)] for c in results[0].boxes.cls]
    unique_labels = list(dict.fromkeys(found_labels))

    label_list = ", ".join(unique_labels) if unique_labels else "No objects found. Try adjusting 'Custom Tags'."
    
    return annotated_img, label_list
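
# Minimal local smoke test for detect_objects (illustrative only; "desk.jpg" is
# a placeholder path, not shipped with the app):
#   from PIL import Image
#   annotated, labels = detect_objects(Image.open("desk.jpg"), "laptop, mug, plant")
#   print(labels)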

# --- TRANSLATION ---
def translate_labels(lang_name, labels_str):
    if not labels_str or "No objects" in labels_str:
        return "No objects detected to translate."
    
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return ONLY the translated words as a comma-separated list."
    
    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception as e:
        return f"Translation Error: {str(e)}"

# --- SPEECH LOGIC ---
async def tts_core(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    # Use timestamp to prevent browser audio caching issues
    filename = f"ref_{int(time.time())}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)
    return filename
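
# NOTE: each call writes a new ref_<timestamp>.mp3 into the working directory.
# A longer-lived deployment would likely use tempfile paths and clean up old
# clips; that housekeeping is omitted here.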

def handle_tts(text, lang_name):
    if not text:
        return None
    # Gradio runs sync handlers in worker threads, so asyncio.run is safe here
    return asyncio.run(tts_core(text, lang_name))

def analyze_speech(lang_name, target_text, audio_path):
    if not audio_path or not target_text:
        return "Missing recording or target word.", "", "Please provide both."
    
    # ASR
    asr_res = asr_pipe(audio_path)["text"].strip()
    
    # IPA via phonemizer (the espeak backend requires the espeak-ng system package)
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        target_ipa = "Error"
        user_ipa = "Error"
    
    # LLM Feedback
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. "
              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")
    
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        feedback = fb.choices[0].message.content
    except Exception:
        feedback = "Speech analysis busy. Try again."
        
    return asr_res, f"/{user_ipa}/", feedback
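
# Illustrative return value (hypothetical transcription, IPA, and feedback):
#   analyze_speech("Spanish", "naranja", "clip.wav")
#   -> ("naranja", "/naɾˈanxa/", "Good trill; soften the /x/ fricative ...")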

# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center;'>🎙️ PANINI Vision</h1>")
    
    with gr.Accordion("📖 How to use (Instructions)", open=False):
        gr.Markdown("""
        ### 1. Vision Step
        * **Upload a photo** of your room or desk.
        * **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender`. The AI will look *specifically* for those items. If you leave it blank, it uses a general list.
        * Click **Scan Environment**.
        
        ### 2. Translation & Speech Step
        * Select your **Target Language**.
        * Click **Translate Labels** to turn the English names into your learning language.
        * **Copy** one of those words into the 'Word to Practice' box.
        * **Listen** to the AI, then **Record** yourself to get feedback!
        """)

    with gr.Tab("1. Discover Objects"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Capture your world")
                target_tags = gr.Textbox(label="Target specific things? (Comma separated)", placeholder="e.g. guitar, plant, blue book")
                btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="AI Detection")
                detected_list = gr.Textbox(label="Detected Objects (English)")
                
    with gr.Tab("2. Language Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
            btn_trans = gr.Button("🌐 Translate Labels")
        
        vocab_output = gr.Textbox(label="Translated Vocabulary")
        
        with gr.Row():
            practice_word = gr.Textbox(label="Word to Practice")
            btn_listen = gr.Button("🔊 Listen", scale=0)
            audio_ref = gr.Audio(label="Native Reference", type="filepath")
            
        with gr.Row():
            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("🚀 Analyze Pronunciation", variant="primary")
            
        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your IPA")
            out_feedback = gr.Markdown()

    # --- BUTTON LOGIC ---
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)
    
    # Speech logic
    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])

if __name__ == "__main__":
    demo.launch()