Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -16,14 +16,17 @@ from datasets import load_dataset
 
 # --- CONFIG & MODELS ---
 HF_TOKEN = os.getenv("HF_TOKEN")
-# Load YOLO World (Small) - efficient for CPU
 model_vision = YOLOWorld('yolov8s-world.pt')
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
 
 # Initialize COCO Dataset Streaming
 print("Initialising COCO Dataset streaming...")
-ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
-ds_iter = iter(ds)
+try:
+    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
+    ds_iter = iter(ds)
+except Exception as e:
+    print(f"Dataset init failed: {e}")
+    ds_iter = None
 
 LANG_CONFIG = {
     "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
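
The guard added here matters because load_dataset(..., streaming=True) returns an IterableDataset that yields samples lazily instead of downloading the whole COCO split. A minimal standalone sketch of the pattern, assuming only the datasets library and the detection-datasets/coco repo named above:

from datasets import load_dataset

# Streaming keeps startup cheap on a CPU Space; samples arrive one at a time.
ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
sample = next(iter(ds))
print(type(sample["image"]))  # a PIL image, per the 'image' field app.py returns
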

@@ -35,72 +38,79 @@ LANG_CONFIG = {
 # --- FUNCTIONS ---
 
 def get_random_coco_image():
-    global ds_iter # Declared at the top to avoid SyntaxError
+    global ds_iter
     try:
-        for _ in range(random.randint(1, 5)):
+        if ds_iter is None: raise ValueError("Dataset not ready")
+        for _ in range(random.randint(1, 3)):
             sample = next(ds_iter)
         return sample['image']
-    except:
-        ds_iter = iter(ds)
-        sample = next(ds_iter)
-        return sample['image']
+    except Exception as e:
+        return "http://images.cocodataset.org/val2017/000000000632.jpg"
 
-def scan_scene(img, lang_name):
+def scan_scene(img, lang_name, custom_tags):
     if img is None:
         return None, "Please get a scene first.", []
 
+    # 1. SET VOCABULARY (Open Vocabulary Feature)
+    if custom_tags and len(custom_tags.strip()) > 0:
+        # User defined search
+        classes = [x.strip() for x in custom_tags.split(",")]
+    else:
+        # General discovery mode
+        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
+                   "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]
+
     model_vision.set_classes(classes)
 
+    # 2. PREDICT
     results = model_vision.predict(img, conf=0.25)
-    annotated_img = results[0].plot()[..., ::-1] #
+    annotated_img = results[0].plot()[..., ::-1] # BGR to RGB
 
+    # 3. EXTRACT AND TRANSLATE
+    eng_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
 
-    # Translate via LLM
     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
     trans_map = {}
 
-    if
+    if eng_labels:
+        # Prompt LLM to create a translation dictionary
+        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
         try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
+            # Parse pairs like 'table:der Tisch'
            for pair in res_text.split(","):
                if ":" in pair:
                    eng, trans = pair.split(":")
                    trans_map[eng.strip().lower()] = trans.strip()
-        except:
+        except Exception as e:
+            print(f"Translation Error: {e}")
+            trans_map = {lbl.lower(): lbl for lbl in eng_labels}
 
+    # 4. MAP DETECTIONS (Link box to translated word)
     detections = []
     for box in results[0].boxes:
-        translated_label = trans_map.get(
+        eng_label = model_vision.names[int(box.cls)].lower()
+        translated_label = trans_map.get(eng_label, eng_label)
         coords = box.xyxy[0].tolist()
-        detections.append({"
+        detections.append({"translated": translated_label, "english": eng_label, "box": coords})
 
+    vocab_display = ", ".join(trans_map.values())
+    return annotated_img, vocab_display, detections
 
 def on_image_click(evt: gr.SelectData, detections):
-    """Triggered when user clicks the
+    """Triggered when user clicks an object in the annotated image"""
     if not detections:
-        return "
+        return "Scan the image first!", ""
 
     click_x, click_y = evt.index
     for det in detections:
         x1, y1, x2, y2 = det["box"]
-        # Check if click is inside the
+        # Check if click point is inside the detection box
         if x1 <= click_x <= x2 and y1 <= click_y <= y2:
+            translated_word = det['translated']
+            return f"🎯 Selected: **{translated_word}** ({det['english']})", translated_word
 
-    return "💡 Click inside a colored box!", ""
+    return "💡 Click directly inside a colored box!", ""
 
 async def tts_task(text, lang_name):
     if not text: return None
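
The click-to-select logic above is a plain point-in-rectangle test: Gradio's evt.index carries the click position in image pixel coordinates, and each detection keeps the box.xyxy corners at the same scale because results[0].plot() draws on the frame at its original resolution. A standalone sketch with hypothetical values:

# Hypothetical detection shaped like the dicts scan_scene produces
detections = [{"translated": "la taza", "english": "cup", "box": [40.0, 60.0, 120.0, 180.0]}]

click_x, click_y = 85, 100  # hypothetical click position, as evt.index would report
for det in detections:
    x1, y1, x2, y2 = det["box"]
    if x1 <= click_x <= x2 and y1 <= click_y <= y2:
        print(f"Selected: {det['translated']} ({det['english']})")  # Selected: la taza (cup)
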

@@ -111,7 +121,7 @@ async def tts_task(text, lang_name):
 
 def run_feedback(target, lang_name, audio_path):
     if not audio_path or not target:
-        return "
+        return "Select a word and record audio.", "", ""
 
     asr_res = asr_pipe(audio_path)["text"].strip()
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
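
tts_task's body sits outside the changed hunks, so only its signature is visible here. Given the es-ES-ElviraNeural style voice names in LANG_CONFIG and the asyncio.run wrapper in the UI wiring, it presumably wraps edge-tts; the coroutine below is a sketch under that assumption, not the app's actual code, and the output path is hypothetical:

import edge_tts

async def tts_sketch(text, voice="es-ES-ElviraNeural"):
    # Synthesise to an mp3 and return the path, matching gr.Audio(type="filepath")
    out_path = "native_ref.mp3"  # hypothetical output location
    await edge_tts.Communicate(text, voice).save(out_path)
    return out_path
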

@@ -123,52 +133,71 @@ def run_feedback(target, lang_name, audio_path):
     t_ipa, u_ipa = "N/A", "N/A"
 
     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/.
+    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
     try:
         fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
         return asr_res, f"/{u_ipa}/", fb
     except:
-        return asr_res, f"/{u_ipa}/", "Coach is busy
+        return asr_res, f"/{u_ipa}/", "Coach is busy."
 
 # --- UI ---
+CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"
+
+with gr.Blocks(css=CSS) as demo:
+    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>🏞️ PANINI Flashcards</h1>")
+    gr.Markdown("1. Select language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan and Click boxes.")
 
     current_dets = gr.State([])
 
     with gr.Row():
         with gr.Column(scale=1):
-            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="
+            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
             btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
-            input_img = gr.Image(type="
+            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)
+
+            custom_tags = gr.Textbox(label="🔍 What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
             btn_scan = gr.Button("🔍 Scan Vocabulary", variant="primary")
 
         with gr.Column(scale=2):
             gr.Markdown("### Interactive Discovery")
-            display_img = gr.Image(label="
+            display_img = gr.Image(label="Touch a box to practice that word", interactive=True)
-            status_lab = gr.Markdown("
+            status_lab = gr.Markdown("Status: Ready.")
-            vocab_list = gr.Textbox(label="Words
+            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Practice Area")
+            gr.Markdown("### 🎤 Practice Area")
-            practice_word = gr.Textbox(label="Word to Practice (
+            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
             btn_play = gr.Button("🔊 Listen to Native", scale=0)
-            audio_out = gr.Audio(label="Native
+            audio_out = gr.Audio(label="Native Reference", type="filepath")
 
         with gr.Column():
-            audio_in = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
+            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
-            btn_eval = gr.Button("🔍
+            btn_eval = gr.Button("🔍 Analyze Accent", variant="primary")
-            res_heard = gr.Textbox(label="AI
+            res_heard = gr.Textbox(label="What AI heard")
-            res_fb = gr.Markdown()
+            res_fb = gr.Markdown(elem_classes=["feedback-box"])
 
-    # ---
+    # --- EVENTS ---
     btn_random.click(get_random_coco_image, outputs=input_img)
+
+    btn_scan.click(
+        scan_scene,
+        inputs=[input_img, lang_drop, custom_tags],
+        outputs=[display_img, vocab_list, current_dets]
+    )
+
+    display_img.select(
+        on_image_click,
+        inputs=[current_dets],
+        outputs=[status_lab, practice_word]
+    )
+
     btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)
+
     btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_heard, res_fb])
 
-# Launch
-demo.launch(
+# Launch
+demo.launch(
+    theme=gr.themes.Soft(primary_hue="blue"),
+    ssr_mode=False
+)
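
A standalone check of the word:translation contract that scan_scene imposes on the LLM. The sample response string is hypothetical; note that pair.split(":") assumes exactly one colon per pair, which the prompt's "Return ONLY in this format" instruction tries to enforce:

res_text = "table: la mesa, cup: la taza, dog: el perro"  # hypothetical LLM reply

trans_map = {}
for pair in res_text.split(","):
    if ":" in pair:
        eng, trans = pair.split(":")  # would raise ValueError if a pair had two colons
        trans_map[eng.strip().lower()] = trans.strip()

print(trans_map)  # {'table': 'la mesa', 'cup': 'la taza', 'dog': 'el perro'}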