Spaces:
Sleeping
Sleeping
File size: 8,110 Bytes
25c274d f20937d 25c274d f20937d 25c274d b4fe004 1677275 25c274d f20937d 1677275 f20937d 1677275 f20937d 1677275 25c274d 1677275 b4fe004 f20937d 1677275 f20937d 25c274d 1677275 b3d5547 1677275 25c274d 1677275 b3d5547 1677275 b3d5547 1677275 b3d5547 1677275 b3d5547 1677275 25c274d 1677275 f20937d 1677275 25c274d 1677275 25c274d f20937d 1677275 b4fe004 1677275 b4fe004 25c274d 1677275 25c274d 1677275 b4fe004 1677275 25c274d f20937d b4fe004 25c274d f20937d b4fe004 1677275 b4fe004 25c274d b3d5547 25c274d 1677275 b3d5547 f20937d b4fe004 b3d5547 1677275 25c274d 1677275 25c274d f20937d 25c274d 1677275 f20937d 1677275 f20937d 25c274d f20937d 1677275 25c274d 1677275 b4fe004 1677275 25c274d 1677275 25c274d 1677275 f20937d 1677275 f20937d 1677275 f20937d 1677275 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
import os
import asyncio
import edge_tts
import librosa
import torch
import time
import random
import numpy as np
import gradio as gr
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient
from datasets import load_dataset
# --- CONFIG & MODELS ---
# Hugging Face API token read from the environment; None when the variable is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Open-vocabulary object detector loaded from the 'yolov8s-world.pt' weights.
model_vision = YOLOWorld('yolov8s-world.pt')

# Speech-to-text pipeline backed by the whisper-tiny checkpoint.
asr_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-tiny",
    device=-1,
)

# Initialize COCO Dataset Streaming
print("Initialising COCO Dataset streaming...")
try:
    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
    ds_iter = iter(ds)
except Exception as e:
    # Best effort: the app still starts without the dataset, and ds_iter is
    # left as None so later code can detect that no stream is available.
    print(f"Dataset init failed: {e}")
    ds_iter = None
# Per-language settings keyed by the display name of the language.
# "ipa" holds a language code and "voice" a voice name for that language.
LANG_CONFIG = {
    language: {"ipa": ipa_code, "voice": voice_name}
    for language, ipa_code, voice_name in (
        ("Spanish", "es", "es-ES-ElviraNeural"),
        ("German", "de", "de-DE-KatjaNeural"),
        ("French", "fr-fr", "fr-FR-DeniseNeural"),
        ("Chinese", "cmn", "zh-CN-XiaoxiaoNeural"),
    )
}
# --- FUNCTIONS ---
def get_random_coco_image():
    """Return an image from the COCO stream, or a fallback image URL.

    Advances the module-level iterator ``ds_iter`` by one to three samples and
    returns the ``'image'`` field of the last sample read. When the iterator
    is ``None``, is exhausted, or raises for any other reason, the cause is
    printed and a fixed COCO validation image URL is returned instead, so the
    caller always receives something displayable.
    """
    fallback_url = "http://images.cocodataset.org/val2017/000000000632.jpg"

    if ds_iter is None:
        print("COCO stream not ready; using fallback image.")
        return fallback_url

    try:
        # Skip ahead by a random amount so consecutive requests differ.
        for _ in range(random.randint(1, 3)):
            sample = next(ds_iter)
        return sample['image']
    except Exception as e:
        # Covers StopIteration (stream exhausted) as well as streaming errors.
        print(f"COCO sample fetch failed: {e!r}")
        return fallback_url
# Vocabulary used in general discovery mode (no custom search terms given).
_DEFAULT_CLASSES = [
    "bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
    "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat",
]


def _parse_custom_tags(custom_tags):
    """Split a comma-separated tag string into stripped, non-empty class names."""
    return [tag.strip() for tag in (custom_tags or "").split(",") if tag.strip()]


def _parse_translation_pairs(res_text):
    """Parse 'word:translation' pairs from the LLM reply into a lowercase-keyed dict."""
    pairs = {}
    # Accept one pair per line as well as the requested comma-separated form.
    for chunk in res_text.replace("\n", ",").split(","):
        # partition() splits on the first colon only, so a translation that
        # itself contains ':' no longer raises and discards every pair.
        eng, sep, trans = chunk.partition(":")
        # The prompt's format example is quoted, so replies may carry quotes.
        eng = eng.strip().strip("'\"").strip().lower()
        trans = trans.strip().strip("'\"").strip()
        if sep and eng and trans:
            pairs[eng] = trans
    return pairs


def scan_scene(img, lang_name, custom_tags):
    """Detect objects in a scene and translate their labels.

    Args:
        img: Image input accepted by ``model_vision.predict``; ``None`` means
            no scene has been loaded yet.
        lang_name: Name of the target language, inserted into the LLM prompt.
        custom_tags: Optional comma-separated list of things to look for.
            Blank entries are ignored; when nothing usable remains the
            default discovery vocabulary is used.

    Returns:
        A 3-tuple ``(annotated_img, vocab_display, detections)``:
        the plotted result with its channel order reversed (BGR to RGB), a
        comma-separated string with one translated word per distinct detected
        label, and a list of dicts with keys ``"translated"``, ``"english"``
        and ``"box"`` (``xyxy`` coordinates) for every detected box.
        When ``img`` is ``None`` the tuple is
        ``(None, "Please get a scene first.", [])``.
    """
    if img is None:
        return None, "Please get a scene first.", []

    # 1. SET VOCABULARY (Open Vocabulary Feature)
    classes = _parse_custom_tags(custom_tags) or list(_DEFAULT_CLASSES)
    model_vision.set_classes(classes)

    # 2. PREDICT
    result = model_vision.predict(img, conf=0.25)[0]
    annotated_img = result.plot()[..., ::-1]  # BGR to RGB

    # 3. EXTRACT AND TRANSLATE
    box_labels = [model_vision.names[int(box.cls)].lower() for box in result.boxes]
    # dict.fromkeys de-duplicates while keeping detection order deterministic.
    eng_labels = list(dict.fromkeys(box_labels))

    trans_map = {}
    if eng_labels:
        # The client is only needed when there is something to translate.
        client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
        # Prompt LLM to create a translation dictionary
        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
        try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
            trans_map = _parse_translation_pairs(res_text)
        except Exception as e:
            # Best effort: fall back to showing the English labels.
            print(f"Translation Error: {e}")
            trans_map = {lbl: lbl for lbl in eng_labels}

    # 4. MAP DETECTIONS (Link box to translated word)
    detections = [
        {
            "translated": trans_map.get(eng_label, eng_label),
            "english": eng_label,
            "box": box.xyxy[0].tolist(),
        }
        for box, eng_label in zip(result.boxes, box_labels)
    ]

    # List one entry per detected label, so labels the LLM skipped still
    # appear (untranslated) and words it invented do not.
    vocab_display = ", ".join(trans_map.get(lbl, lbl) for lbl in eng_labels)
    return annotated_img, vocab_display, detections
def on_image_click(evt: gr.SelectData, detections):
    """Triggered when user clicks an object in the annotated image.

    Args:
        evt: Selection event; ``evt.index`` is unpacked as the ``(x, y)``
            coordinates of the click.
        detections: List of dicts with keys ``"translated"``, ``"english"``
            and ``"box"`` (``[x1, y1, x2, y2]``).

    Returns:
        A ``(status_message, word)`` pair. ``word`` is the translated label of
        the clicked detection, or ``""`` when there are no detections or the
        click falls outside every box.
    """
    # NOTE(review): the leading characters of the two status strings below look
    # like mis-decoded emoji; they are kept byte-for-byte here - confirm
    # against the original source encoding.
    if not detections:
        return "Scan the image first!", ""

    click_x, click_y = evt.index

    def _contains_click(det):
        x1, y1, x2, y2 = det["box"]
        return x1 <= click_x <= x2 and y1 <= click_y <= y2

    def _box_area(det):
        x1, y1, x2, y2 = det["box"]
        return (x2 - x1) * (y2 - y1)

    hits = [det for det in detections if _contains_click(det)]
    if not hits:
        return "π‘ Click directly inside a colored box!", ""

    # When boxes overlap, prefer the smallest one: otherwise an object lying
    # inside a larger box listed earlier could never be selected.
    det = min(hits, key=_box_area)
    translated_word = det['translated']
    return f"π― Selected: **{translated_word}** ({det['english']})", translated_word
async def tts_task(text, lang_name):
    """Synthesize ``text`` with the voice configured for ``lang_name``.

    Args:
        text: Text to speak. Empty, ``None`` or whitespace-only text yields
            ``None`` without any synthesis.
        lang_name: Key into ``LANG_CONFIG``; its ``"voice"`` entry selects the
            edge-tts voice.

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` when there is no text.

    Raises:
        KeyError: If ``lang_name`` is not a key of ``LANG_CONFIG``.
    """
    # Local import: tempfile is not part of the file's top-level imports.
    import tempfile

    if not text or not text.strip():
        return None

    voice = LANG_CONFIG[lang_name]["voice"]
    # A unique temp file avoids the collisions of a second-resolution
    # timestamp name when several requests arrive together, and keeps
    # generated audio out of the working directory.
    with tempfile.NamedTemporaryFile(prefix="speech_", suffix=".mp3", delete=False) as tmp:
        path = tmp.name
    await edge_tts.Communicate(text, voice).save(path)
    return path
def run_feedback(target, lang_name, audio_path):
    """Transcribe a recording and ask the LLM for a pronunciation tip.

    Args:
        target: The word the student was trying to say.
        lang_name: Key into ``LANG_CONFIG``; its ``"ipa"`` entry is passed to
            ``phonemize`` as the language.
        audio_path: Path of the recorded audio handed to ``asr_pipe``.

    Returns:
        A 3-tuple ``(heard_text, student_ipa, feedback)``. When ``target`` or
        ``audio_path`` is missing the tuple is
        ``("Select a word and record audio.", "", "")``. If phonemization
        fails both IPA strings become ``"N/A"``; if the LLM call fails the
        feedback is ``"Coach is busy."``.
    """
    if not audio_path or not target:
        return "Select a word and record audio.", "", ""

    asr_res = asr_pipe(audio_path)["text"].strip()
    ipa_code = LANG_CONFIG[lang_name]["ipa"]

    try:
        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception as e:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit still
        # propagate; the failure is logged instead of vanishing.
        print(f"Phonemize Error: {e}")
        t_ipa, u_ipa = "N/A", "N/A"

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
    except Exception as e:
        print(f"Feedback Error: {e}")
        fb = "Coach is busy."
    return asr_res, f"/{u_ipa}/", fb
# --- UI ---
# NOTE(review): several labels below start with characters that look like
# mis-decoded emoji; the strings are kept byte-for-byte - restore them from
# the original source encoding.
CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"

with gr.Blocks(css=CSS) as demo:
    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>ποΈ PANINI Flashcards</h1>")
    gr.Markdown("1. Select language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan and Click boxes.")

    # Detections from the latest scan, shared between the scan and click handlers.
    current_dets = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
            btn_random = gr.Button("π² Get Random Scene", variant="secondary")
            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)
            custom_tags = gr.Textbox(label="π What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
            btn_scan = gr.Button("π Scan Vocabulary", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Interactive Discovery")
            display_img = gr.Image(label="Touch a box to practice that word", interactive=True)
            status_lab = gr.Markdown("Status: Ready.")
            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### π€ Practice Area")
            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
            btn_play = gr.Button("π Listen to Native", scale=0)
            audio_out = gr.Audio(label="Native Reference", type="filepath")
        with gr.Column():
            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
            btn_eval = gr.Button("π Analyze Accent", variant="primary")
            res_heard = gr.Textbox(label="What AI heard")
            # Dedicated target for the second value returned by run_feedback;
            # previously res_heard was listed twice in the outputs.
            res_ipa = gr.Textbox(label="Your pronunciation (IPA)")
            res_fb = gr.Markdown(elem_classes=["feedback-box"])

    # --- EVENTS ---
    btn_random.click(get_random_coco_image, outputs=input_img)
    btn_scan.click(
        scan_scene,
        inputs=[input_img, lang_drop, custom_tags],
        outputs=[display_img, vocab_list, current_dets]
    )
    display_img.select(
        on_image_click,
        inputs=[current_dets],
        outputs=[status_lab, practice_word]
    )
    # tts_task is a coroutine function; it is registered directly instead of
    # being wrapped in a lambda that calls asyncio.run().
    btn_play.click(tts_task, [practice_word, lang_drop], audio_out)
    # run_feedback returns (heard text, IPA, feedback): one component each.
    btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_ipa, res_fb])

# Launch
# NOTE(review): css is passed to gr.Blocks while theme is passed to launch();
# which call accepts these keywords depends on the installed Gradio version -
# confirm against the version this Space pins.
demo.launch(
    theme=gr.themes.Soft(primary_hue="blue"),
    ssr_mode=False
)