Update app.py

app.py CHANGED
@@ -18,12 +18,12 @@ HF_TOKEN = os.getenv("HF_TOKEN")
@@ -38,116 +38,137 @@ LANG_CONFIG = {

Removed (old version):

-    found_labels = []
-        label = model_vision.names[int(box.cls)]
-    # 3. Translate via LLM
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-    prompt = f"Translate these objects to {lang_name}: {', '.join(found_unique)}. Return ONLY a comma-separated list."
-    try:
-        translated = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=100).choices[0].message.content
-    except:
-        translated = ", ".join(found_unique)
-    return annotated_img, translated, detections
-    # evt.index gives [x, y] of click
-    if not audio_path: return "No recording.", "", ""
-    gr.HTML("<h1 style='text-align: center;'>π PANINI Flashcards</h1>")
-    gr.Markdown("Discover vocabulary in your environment or use our sample scenes.")
-    def load_scene(name):
-        return SAMPLE_SCENES[name]
-    scene_gallery.change(load_scene, scene_gallery, upload_input)
New version:

model_vision = YOLOWorld('yolov8s-world.pt')
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

+# Robust Scene Library (Using stable Wikimedia/Pixabay direct links)
SAMPLE_SCENES = {
+    "🍳 The Kitchen": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Kitchen_in_the_White_House.jpg/1280px-Kitchen_in_the_White_House.jpg",
+    "🛋️ Living Room": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Interior_of_a_living_room.jpg/1280px-Interior_of_a_living_room.jpg",
+    "🏙️ City Street": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d8/London_Regent_Street_2.jpg/1280px-London_Regent_Street_2.jpg",
+    "🛒 Supermarket": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Produce_department_of_a_supermarket.jpg/1280px-Produce_department_of_a_supermarket.jpg"
}

LANG_CONFIG = {
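The hunk stops at the opening line of LANG_CONFIG, but the functions further down read a "voice" (an edge-tts voice name) and an "ipa" (an espeak language code) per language. A minimal sketch of what one entry presumably looks like; the voice names and codes below are illustrative assumptions, not taken from this commit:

# Hypothetical example only; the real entries live outside this hunk in app.py
LANG_CONFIG_EXAMPLE = {
    "Spanish": {"voice": "es-ES-ElviraNeural", "ipa": "es"},
    "French": {"voice": "fr-FR-DeniseNeural", "ipa": "fr"},
}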
def scan_scene(img, lang_name):
    if img is None: return None, "Please select a scene.", []

+    # 1. Broad Vocabulary Scan
+    classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
+               "refrigerator", "oven", "car", "person", "tree", "backpack", "clock"]
+    model_vision.set_classes(classes)

+    # Prediction
+    results = model_vision.predict(img, conf=0.25)
+    annotated_img = results[0].plot()[..., ::-1]  # BGR to RGB

+    # 2. Map Detections & Translate
+    found_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
+
+    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+    trans_map = {}
+
+    if found_labels:
+        prompt = f"Translate these English nouns to {lang_name}: {', '.join(found_labels)}. Format: English:Translated, English:Translated"
+        try:
+            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
+            # Create a dictionary for quick lookup
+            for pair in res_text.split(","):
+                if ":" in pair:
+                    eng, trans = pair.split(":")
+                    trans_map[eng.strip().lower()] = trans.strip()
+        except:
+            trans_map = {lbl: lbl for lbl in found_labels}  # Fallback
+
+    # 3. Build Detection Objects with Translations
    detections = []
    for box in results[0].boxes:
+        label = model_vision.names[int(box.cls)].lower()
+        translated_label = trans_map.get(label, label)
+        coords = box.xyxy[0].tolist()
+        detections.append({"label": translated_label, "box": coords})

+    return annotated_img, ", ".join(trans_map.values()), detections
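As a rough illustration of the parsing step above, a reply in the requested "English:Translated" format becomes a lookup table keyed by the lowercased English noun (the reply string here is made up):

# Made-up model reply, for illustration only
res_text = "Cup: taza, Chair: silla, Laptop: portátil"
trans_map = {}
for pair in res_text.split(","):
    if ":" in pair:
        eng, trans = pair.split(":")
        trans_map[eng.strip().lower()] = trans.strip()
# trans_map == {"cup": "taza", "chair": "silla", "laptop": "portátil"}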
+def check_point_and_update(evt: gr.SelectData, detections):
+    # evt.index gives [x, y] of the click
    click_x, click_y = evt.index

    for det in detections:
        x1, y1, x2, y2 = det["box"]
+        # Check if click is inside the bounding box
        if x1 <= click_x <= x2 and y1 <= click_y <= y2:
+            translated_word = det['label']
+            return f"🎯 Found: **{translated_word}**", translated_word

+    return "β Try clicking exactly on an object box!", ""
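For context, each entry in detections pairs a translated label with an [x1, y1, x2, y2] box in image pixels, so the lookup above is a plain bounds check (the numbers below are made up):

# Made-up detection and click, for illustration only
detections = [{"label": "taza", "box": [40.0, 60.0, 210.0, 300.0]}]
click_x, click_y = 120, 150  # falls inside the box, so the word "taza" is returned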
async def run_tts(text, lang_name):
+    if not text: return None
    voice = LANG_CONFIG[lang_name]["voice"]
    path = f"speech_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path
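A quick standalone usage sketch, mirroring how the click handler further down drives the coroutine (the word and the "Spanish" key follow the UI defaults and are assumptions):

# e.g., from a synchronous context:
mp3_path = asyncio.run(run_tts("taza", "Spanish"))  # writes speech_<timestamp>.mp3 and returns the path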
def run_speech_analysis(target, lang_name, audio_path):
+    if not audio_path or not target: return "No recording or target.", "", ""
    asr_res = asr_pipe(audio_path)["text"].strip()
    ipa_code = LANG_CONFIG[lang_name]["ipa"]

+    try:
+        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
+        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    except:
+        t_ipa, u_ipa = "N/A", "N/A"

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 anatomical tip in English."
+    try:
+        feedback = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
+    except:
+        feedback = "Analysis busy."

    return asr_res, f"/{u_ipa}/", feedback
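phonemize here is the phonemizer package's top-level helper (presumably imported near the top of app.py) and needs an espeak backend available on the Space; a rough standalone example, with the output only approximate:

# Standalone sketch; the exact output depends on the installed espeak-ng
from phonemizer import phonemize
print(phonemize("hola", language="es", backend="espeak", strip=True))  # something like "ola"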
# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="orange")) as demo:
+    gr.HTML("<h1 style='text-align: center; color: #d97706;'>π PANINI Flashcards</h1>")

+    # State stores detections for the current scan
    current_detections = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            lang_choice = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
+            gr.Markdown("### Step 1: Choose a Scene")
+            scene_radio = gr.Radio(choices=list(SAMPLE_SCENES.keys()), label="Library")
+            img_input = gr.Image(type="pil", label="Scene Preview / Upload")
+            btn_scan = gr.Button("π Discover Vocabulary", variant="primary")

        with gr.Column(scale=2):
+            gr.Markdown("### Step 2: Point & Identify")
+            # The interactive image where the user clicks
+            display_img = gr.Image(label="Click an object to practice it!", interactive=True)
+            click_info = gr.Markdown("Click an object in the scanned image above.")
+            vocab_list = gr.Textbox(label="Detected Vocabulary", interactive=False)

    with gr.Row():
        with gr.Column():
+            gr.Markdown("### Step 3: Speak & Learn")
+            # This box gets filled automatically when the user clicks the image
+            practice_word = gr.Textbox(label="Word to Practice", placeholder="Click an object in the picture...")
+            btn_play = gr.Button("π Native Pronunciation", scale=0)
+            audio_ref = gr.Audio(label="Reference", type="filepath")

        with gr.Column():
+            audio_user = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
+            btn_analyze = gr.Button("π Analyze Accent", variant="primary")
            out_heard = gr.Textbox(label="AI Heard")
            out_feedback = gr.Markdown()

    # --- ACTIONS ---
+    # Handle Scene Selection
+    scene_radio.change(lambda name: SAMPLE_SCENES[name], scene_radio, img_input)

+    # Handle Scan
    btn_scan.click(
        scan_scene,
+        inputs=[img_input, lang_choice],
+        outputs=[display_img, vocab_list, current_detections]
    )

+    # Handle Image Pointing (This updates the practice box!)
+    display_img.select(
+        check_point_and_update,
+        inputs=[current_detections],
+        outputs=[click_info, practice_word]
+    )

    # Speech Logic
    btn_play.click(lambda t, l: asyncio.run(run_tts(t, l)), [practice_word, lang_choice], audio_ref)
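The hunk ends at line 174, before btn_analyze is wired up. Since run_speech_analysis returns three values (transcript, IPA string, feedback), its click handler presumably looks roughly like the sketch below; the third output component is not visible in this diff and is assumed:

# Hypothetical wiring; the actual call sits below line 174 of app.py
btn_analyze.click(
    run_speech_analysis,
    inputs=[practice_word, lang_choice, audio_user],
    outputs=[out_heard, out_ipa, out_feedback],  # out_ipa is an assumed, unseen component
)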