import os
import asyncio
import random
import time

import edge_tts
import gradio as gr
from datasets import load_dataset
from huggingface_hub import InferenceClient
from phonemizer import phonemize
from transformers import pipeline
from ultralytics import YOLOWorld

# --- CONFIG & MODELS ---
HF_TOKEN = os.getenv("HF_TOKEN")
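# Token for the hosted Qwen inference calls below; anonymous requests are rate-limited.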
model_vision = YOLOWorld('yolov8s-world.pt') 
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
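# device=-1 keeps Whisper on CPU; whisper-tiny is small, so transcripts of
# isolated foreign words can be rough.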

# Initialize COCO Dataset Streaming
print("Initialising COCO Dataset streaming...")
try:
    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
    ds_iter = iter(ds)
except Exception as e:
    print(f"Dataset init failed: {e}")
    ds_iter = None
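
# streaming=True yields samples lazily, so the full COCO split is never downloaded up front.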

LANG_CONFIG = {
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"}
}
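# "ipa" values are espeak-ng language codes for phonemize(); "voice" values are edge-tts neural voices.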

# --- FUNCTIONS ---

def get_random_coco_image():
    global ds_iter
    try:
        if ds_iter is None: raise ValueError("Dataset not ready")
        for _ in range(random.randint(1, 3)):
            sample = next(ds_iter)
        return sample['image']
    except Exception:
        # Streaming failed or iterator exhausted; fall back to a fixed COCO image URL.
        return "http://images.cocodataset.org/val2017/000000000632.jpg"

def scan_scene(img, lang_name, custom_tags):
    if img is None: 
        return None, "Please get a scene first.", []
    
    # 1. SET VOCABULARY (Open Vocabulary Feature)
    if custom_tags and custom_tags.strip():
        # User-defined vocabulary to search for
        classes = [x.strip() for x in custom_tags.split(",") if x.strip()]
    else:
        # General discovery mode
        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink", 
                   "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]
    
    model_vision.set_classes(classes)
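    # YOLO-World matches regions against CLIP text embeddings of these names,
    # so arbitrary vocabularies work without retraining.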
    
    # 2. PREDICT
    results = model_vision.predict(img, conf=0.25)
    annotated_img = results[0].plot()[..., ::-1] # BGR to RGB
    
    # 3. EXTRACT AND TRANSLATE
    eng_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
    
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    trans_map = {}
    
    if eng_labels:
        # Prompt LLM to create a translation dictionary
        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
        try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
            # Parse pairs like 'table:der Tisch'
            for pair in res_text.split(","):
                if ":" in pair:
                    # Split once so translations containing ':' survive intact
                    eng, trans = pair.split(":", 1)
                    trans_map[eng.strip().lower()] = trans.strip()
        except Exception as e:
            print(f"Translation Error: {e}")
            trans_map = {lbl.lower(): lbl for lbl in eng_labels}
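    # Illustrative shape of trans_map after a German scan (hypothetical values):
    #   {"table": "der Tisch", "cup": "die Tasse"}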
    
    # 4. MAP DETECTIONS (Link box to translated word)
    detections = []
    for box in results[0].boxes:
        eng_label = model_vision.names[int(box.cls)].lower()
        translated_label = trans_map.get(eng_label, eng_label)
        coords = box.xyxy[0].tolist() 
        detections.append({"translated": translated_label, "english": eng_label, "box": coords})
    
    vocab_display = ", ".join(trans_map.values())
    return annotated_img, vocab_display, detections

def on_image_click(evt: gr.SelectData, detections):
    """Triggered when user clicks an object in the annotated image"""
    if not detections:
        return "Scan the image first!", ""
        
    click_x, click_y = evt.index
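    # evt.index is (x, y) in the image's own pixel space, so it can be compared
    # directly against the box.xyxy coordinates stored by scan_scene.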
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        # Check if click point is inside the detection box
        if x1 <= click_x <= x2 and y1 <= click_y <= y2:
            translated_word = det['translated']
            return f"🎯 Selected: **{translated_word}** ({det['english']})", translated_word
            
    return "πŸ’‘ Click directly inside a colored box!", ""

async def tts_task(text, lang_name):
    if not text: return None
    voice = LANG_CONFIG[lang_name]["voice"]
    path = f"speech_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path
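
# edge_tts is async-only; the btn_play handler below bridges it into Gradio's
# synchronous callback model with asyncio.run().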

def run_feedback(target, lang_name, audio_path):
    if not audio_path or not target: 
        return "Select a word and record audio.", "", ""
    
    asr_res = asr_pipe(audio_path)["text"].strip()
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    
    try:
        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        t_ipa, u_ipa = "N/A", "N/A"
    
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
        return asr_res, f"/{u_ipa}/", fb
    except Exception:
        return asr_res, f"/{u_ipa}/", "Coach is busy."

# --- UI ---
CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"

with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>πŸŽ™οΈ PANINI Flashcards</h1>")
    gr.Markdown("1. Select language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan and Click boxes.")
    
    current_dets = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
            btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)
            
            custom_tags = gr.Textbox(label="🔍 What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
            btn_scan = gr.Button("🔍 Scan Vocabulary", variant="primary")
            
        with gr.Column(scale=2):
            gr.Markdown("### Interactive Discovery")
            display_img = gr.Image(label="Touch a box to practice that word", interactive=False)
            status_lab = gr.Markdown("Status: Ready.")
            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎀 Practice Area")
            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
            btn_play = gr.Button("🔊 Listen to Native", scale=0)
            audio_out = gr.Audio(label="Native Reference", type="filepath")
        
        with gr.Column():
            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
            btn_eval = gr.Button("🚀 Analyze Accent", variant="primary")
            res_heard = gr.Textbox(label="What AI heard")
            res_ipa = gr.Textbox(label="Your pronunciation (IPA)")
            res_fb = gr.Markdown(elem_classes=["feedback-box"])

    # --- EVENTS ---
    btn_random.click(get_random_coco_image, outputs=input_img)
    
    btn_scan.click(
        scan_scene, 
        inputs=[input_img, lang_drop, custom_tags], 
        outputs=[display_img, vocab_list, current_dets]
    )
    
    display_img.select(
        on_image_click, 
        inputs=[current_dets], 
        outputs=[status_lab, practice_word]
    )
    
    btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)
    
    btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_ipa, res_fb])

# Launch
demo.launch(ssr_mode=False)