Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -16,14 +16,17 @@ from datasets import load_dataset
 
 # --- CONFIG & MODELS ---
 HF_TOKEN = os.getenv("HF_TOKEN")
-# Load YOLO World (Small) - efficient for CPU
 model_vision = YOLOWorld('yolov8s-world.pt')
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
 
 # Initialize COCO Dataset Streaming
 print("Initialising COCO Dataset streaming...")
-ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
-ds_iter = iter(ds)
+try:
+    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
+    ds_iter = iter(ds)
+except Exception as e:
+    print(f"Dataset init failed: {e}")
+    ds_iter = None
 
 LANG_CONFIG = {
     "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
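
The guard added here matters because load_dataset(..., streaming=True) returns an IterableDataset that yields samples lazily instead of downloading the whole COCO split. A minimal standalone sketch of the pattern, assuming only the datasets library and the detection-datasets/coco repo named above:

from datasets import load_dataset

# Streaming keeps startup cheap on a CPU Space; samples arrive one at a time.
ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
sample = next(iter(ds))
print(type(sample["image"]))  # a PIL image, per the 'image' field app.py returns
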

@@ -35,72 +38,79 @@ LANG_CONFIG = {
 # --- FUNCTIONS ---
 
 def get_random_coco_image():
-    global ds_iter # Declared at the top to avoid SyntaxError
+    global ds_iter
     try:
-        for _ in range(random.randint(1, 5)):
+        if ds_iter is None: raise ValueError("Dataset not ready")
+        for _ in range(random.randint(1, 3)):
             sample = next(ds_iter)
         return sample['image']
-    except:
-        ds_iter = iter(ds)
-        sample = next(ds_iter)
-        return sample['image']
+    except Exception as e:
+        return "http://images.cocodataset.org/val2017/000000000632.jpg"
 
-def scan_scene(img, lang_name):
+def scan_scene(img, lang_name, custom_tags):
     if img is None:
         return None, "Please get a scene first.", []
 
+    # 1. SET VOCABULARY (Open Vocabulary Feature)
+    if custom_tags and len(custom_tags.strip()) > 0:
+        # User defined search
+        classes = [x.strip() for x in custom_tags.split(",")]
+    else:
+        # General discovery mode
+        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
+                   "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]
+
     model_vision.set_classes(classes)
 
+    # 2. PREDICT
     results = model_vision.predict(img, conf=0.25)
-    annotated_img = results[0].plot()[..., ::-1] #
+    annotated_img = results[0].plot()[..., ::-1] # BGR to RGB
 
+    # 3. EXTRACT AND TRANSLATE
+    eng_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
 
-    # Translate via LLM
     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
     trans_map = {}
 
-    if
+    if eng_labels:
+        # Prompt LLM to create a translation dictionary
+        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
         try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
+            # Parse pairs like 'table:der Tisch'
            for pair in res_text.split(","):
                if ":" in pair:
                    eng, trans = pair.split(":")
                    trans_map[eng.strip().lower()] = trans.strip()
-        except:
+        except Exception as e:
+            print(f"Translation Error: {e}")
+            trans_map = {lbl.lower(): lbl for lbl in eng_labels}
 
+    # 4. MAP DETECTIONS (Link box to translated word)
     detections = []
     for box in results[0].boxes:
-        translated_label = trans_map.get(
+        eng_label = model_vision.names[int(box.cls)].lower()
+        translated_label = trans_map.get(eng_label, eng_label)
         coords = box.xyxy[0].tolist()
-        detections.append({"
+        detections.append({"translated": translated_label, "english": eng_label, "box": coords})
 
+    vocab_display = ", ".join(trans_map.values())
+    return annotated_img, vocab_display, detections
 
 def on_image_click(evt: gr.SelectData, detections):
-    """Triggered when user clicks the
+    """Triggered when user clicks an object in the annotated image"""
     if not detections:
-        return "
+        return "Scan the image first!", ""
 
     click_x, click_y = evt.index
     for det in detections:
         x1, y1, x2, y2 = det["box"]
-        # Check if click is inside the
+        # Check if click point is inside the detection box
         if x1 <= click_x <= x2 and y1 <= click_y <= y2:
+            translated_word = det['translated']
+            return f"🎯 Selected: **{translated_word}** ({det['english']})", translated_word
 
-    return "💡 Click inside a colored box!", ""
+    return "💡 Click directly inside a colored box!", ""
 
 async def tts_task(text, lang_name):
     if not text: return None
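
The click-to-select logic above is a plain point-in-rectangle test: Gradio's evt.index carries the click position in image pixel coordinates, and each detection keeps the box.xyxy corners at the same scale because results[0].plot() draws on the frame at its original resolution. A standalone sketch with hypothetical values:

# Hypothetical detection shaped like the dicts scan_scene produces
detections = [{"translated": "la taza", "english": "cup", "box": [40.0, 60.0, 120.0, 180.0]}]

click_x, click_y = 85, 100  # hypothetical click position, as evt.index would report
for det in detections:
    x1, y1, x2, y2 = det["box"]
    if x1 <= click_x <= x2 and y1 <= click_y <= y2:
        print(f"Selected: {det['translated']} ({det['english']})")  # Selected: la taza (cup)
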

@@ -111,7 +121,7 @@ async def tts_task(text, lang_name):
 
 def run_feedback(target, lang_name, audio_path):
     if not audio_path or not target:
-        return "
+        return "Select a word and record audio.", "", ""
 
     asr_res = asr_pipe(audio_path)["text"].strip()
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
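
tts_task's body sits outside the changed hunks, so only its signature is visible here. Given the es-ES-ElviraNeural style voice names in LANG_CONFIG and the asyncio.run wrapper in the UI wiring, it presumably wraps edge-tts; the coroutine below is a sketch under that assumption, not the app's actual code, and the output path is hypothetical:

import edge_tts

async def tts_sketch(text, voice="es-ES-ElviraNeural"):
    # Synthesise to an mp3 and return the path, matching gr.Audio(type="filepath")
    out_path = "native_ref.mp3"  # hypothetical output location
    await edge_tts.Communicate(text, voice).save(out_path)
    return out_path
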

@@ -123,52 +133,71 @@ def run_feedback(target, lang_name, audio_path):
     t_ipa, u_ipa = "N/A", "N/A"
 
     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/.
+    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
     try:
         fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
         return asr_res, f"/{u_ipa}/", fb
     except:
-        return asr_res, f"/{u_ipa}/", "Coach is busy
+        return asr_res, f"/{u_ipa}/", "Coach is busy."
 
 # --- UI ---
+CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"
+
+with gr.Blocks(css=CSS) as demo:
+    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>🏞️ PANINI Flashcards</h1>")
+    gr.Markdown("1. Select language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan and Click boxes.")
 
     current_dets = gr.State([])
 
     with gr.Row():
         with gr.Column(scale=1):
-            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="
+            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
             btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
-            input_img = gr.Image(type="
+            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)
+
+            custom_tags = gr.Textbox(label="🔍 What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
             btn_scan = gr.Button("🔍 Scan Vocabulary", variant="primary")
 
         with gr.Column(scale=2):
             gr.Markdown("### Interactive Discovery")
-            display_img = gr.Image(label="
+            display_img = gr.Image(label="Touch a box to practice that word", interactive=True)
-            status_lab = gr.Markdown("
+            status_lab = gr.Markdown("Status: Ready.")
-            vocab_list = gr.Textbox(label="Words
+            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Practice Area")
+            gr.Markdown("### 🎤 Practice Area")
-            practice_word = gr.Textbox(label="Word to Practice (
+            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
             btn_play = gr.Button("🔊 Listen to Native", scale=0)
-            audio_out = gr.Audio(label="Native
+            audio_out = gr.Audio(label="Native Reference", type="filepath")
 
         with gr.Column():
-            audio_in = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
+            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
-            btn_eval = gr.Button("🔍
+            btn_eval = gr.Button("🔍 Analyze Accent", variant="primary")
-            res_heard = gr.Textbox(label="AI
+            res_heard = gr.Textbox(label="What AI heard")
-            res_fb = gr.Markdown()
+            res_fb = gr.Markdown(elem_classes=["feedback-box"])
 
-    # ---
+    # --- EVENTS ---
     btn_random.click(get_random_coco_image, outputs=input_img)
+
+    btn_scan.click(
+        scan_scene,
+        inputs=[input_img, lang_drop, custom_tags],
+        outputs=[display_img, vocab_list, current_dets]
+    )
+
+    display_img.select(
+        on_image_click,
+        inputs=[current_dets],
+        outputs=[status_lab, practice_word]
+    )
+
     btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)
+
     btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_heard, res_fb])
 
-# Launch
-demo.launch(
+# Launch
+demo.launch(
+    theme=gr.themes.Soft(primary_hue="blue"),
+    ssr_mode=False
+)
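
A standalone check of the word:translation contract that scan_scene imposes on the LLM. The sample response string is hypothetical; note that pair.split(":") assumes exactly one colon per pair, which the prompt's "Return ONLY in this format" instruction tries to enforce:

res_text = "table: la mesa, cup: la taza, dog: el perro"  # hypothetical LLM reply

trans_map = {}
for pair in res_text.split(","):
    if ":" in pair:
        eng, trans = pair.split(":")  # would raise ValueError if a pair had two colons
        trans_map[eng.strip().lower()] = trans.strip()

print(trans_map)  # {'table': 'la mesa', 'cup': 'la taza', 'dog': 'el perro'}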