Update app.py
app.py
CHANGED

@@ -6,6 +6,8 @@ import torch
import numpy as np
import pandas as pd
import gradio as gr
import re
import time
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize

@@ -15,11 +17,10 @@ from huggingface_hub import InferenceClient
# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")

# Load YOLO World (Small); on first run it downloads the weights automatically
model_vision = YOLOWorld('yolov8s-world.pt')

# Whisper for ASR (tiny checkpoint for speed; device=-1 keeps the pipeline on CPU)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

LANG_CONFIG = {  # per-language config read below: "voice" (edge-tts voice id) and "ipa" (espeak language code)

@@ -33,127 +34,151 @@ LANG_CONFIG = {
# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    if img is None:
        return None, "Please upload an image first."

    # 1. Reset/set the vocabulary
    if target_queries and len(target_queries.strip()) > 0:
        classes = [x.strip() for x in target_queries.split(",")]
    else:
        # Balanced default list to prevent "bottle" bias
        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase",
                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut",
                   "cake", "chair", "couch", "potted plant", "bed", "dining table",
                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]

    # Force YOLO to update its internal class list
    model_vision.set_classes(classes)

    # 2. Prediction (slightly higher confidence to reduce noise)
    results = model_vision.predict(img, conf=0.4)

    # 3. Process the image: plot() returns BGR (OpenCV convention), flip to RGB for Gradio
    annotated_img = results[0].plot()
    annotated_img = annotated_img[..., ::-1]

    # 4. Extract labels (deduplicated; set() does not preserve detection order)
    found_labels = []
    for c in results[0].boxes.cls:
        found_labels.append(model_vision.names[int(c)])

    label_list = ", ".join(set(found_labels)) if found_labels else "No objects found. Try adjusting 'Custom Tags'."

    return annotated_img, label_list
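
# (Illustrative sketch, not wired into the UI.) The open-vocabulary flow above
# can be exercised standalone: set_classes() swaps the detector's prompt list
# and predict() then only searches for those classes. The queries and image
# path below are hypothetical examples.
def _demo_open_vocab(image_path="desk.jpg"):
    model_vision.set_classes(["guitar", "mug", "water bottle"])
    res = model_vision.predict(image_path, conf=0.4)
    return sorted({model_vision.names[int(c)] for c in res[0].boxes.cls})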

# --- TRANSLATION ---
def translate_labels(lang_name, labels_str):
    if not labels_str or "No objects" in labels_str:
        return "No objects detected to translate."

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return ONLY the translated words as a comma-separated list."

    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception as e:
        return f"Translation Error: {str(e)}"
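
# (Hypothetical helper.) This commit adds `import re`, but none of the shown
# hunks use it; one plausible consumer is normalizing the LLM's comma-separated
# reply (stripping stray numbering or bullets) before it reaches the UI.
def _clean_vocab(reply):
    items = [re.sub(r"^\s*[\d\.\-\*\)]+\s*", "", w).strip() for w in reply.split(",")]
    return ", ".join(i for i in items if i)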

# --- SPEECH LOGIC (FIXED) ---
async def tts_core(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    # Use a timestamped filename to prevent browser audio-caching issues
    filename = f"ref_{int(time.time())}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)
    return filename

def handle_tts(text, lang_name):
    if not text:
        return None
    # edge-tts is async, but Gradio click handlers are plain functions,
    # so drive the coroutine to completion here.
    return asyncio.run(tts_core(text, lang_name))

def analyze_speech(lang_name, target_text, audio_path):
    if not audio_path or not target_text:
        return "Missing recording or target word.", "", "Please provide both."

    # ASR: transcribe the learner's recording
    asr_res = asr_pipe(audio_path)["text"].strip()

    # IPA: phonemizer's espeak backend requires the espeak-ng library
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        target_ipa = "Error"
        user_ipa = "Error"

    # LLM feedback on the gap between target and produced IPA
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. "
              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")

    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        feedback = fb.choices[0].message.content
    except Exception:
        feedback = "Speech analysis busy. Try again."

    return asr_res, f"/{user_ipa}/", feedback
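
# (Illustrative sketch, not used by the UI.) A cheap numeric companion to the
# LLM feedback: character-level similarity of the two IPA strings via difflib
# from the standard library.
import difflib

def _ipa_similarity(target_ipa, user_ipa):
    return round(difflib.SequenceMatcher(None, target_ipa, user_ipa).ratio(), 2)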

# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center;'>PANINI Vision</h1>")

    with gr.Accordion("How to use (Instructions)", open=False):
        gr.Markdown("""
        ### 1. Vision Step
        * **Upload a photo** of your room or desk.
        * **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender` and the AI will look *specifically* for those items. If you leave it blank, a general list is used.
        * Click **Scan Environment**.

        ### 2. Translation & Speech Step
        * Select your **Learning Language**.
        * Click **Translate Labels** to turn the English names into your learning language.
        * **Copy** one of those words into the 'Word to Practice' box.
        * **Listen** to the AI, then **Record** yourself to get feedback!
        """)

    with gr.Tab("1. Discover Objects"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Capture your world")
                target_tags = gr.Textbox(label="Target specific things? (Comma separated)", placeholder="e.g. guitar, plant, blue book")
                btn_scan = gr.Button("Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="AI Detection")
                detected_list = gr.Textbox(label="Detected Objects (English)")

    with gr.Tab("2. Language Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
            btn_trans = gr.Button("Translate Labels")

        vocab_output = gr.Textbox(label="Translated Vocabulary")

        with gr.Row():
            practice_word = gr.Textbox(label="Word to Practice")
            btn_listen = gr.Button("Listen", scale=0)
            audio_ref = gr.Audio(label="Native Reference", type="filepath")

        with gr.Row():
            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("Analyze Pronunciation", variant="primary")

        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your IPA")
            out_feedback = gr.Markdown()

    # --- BUTTON LOGIC ---
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)

    # Fixed speech logic
    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
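
# (Hypothetical housekeeping sketch.) tts_core() writes a new ref_<timestamp>.mp3
# into the working directory on every Listen click; something like this would
# keep the Space's disk usage bounded.
import glob

def _prune_refs(keep_latest=5):
    files = sorted(glob.glob("ref_*.mp3"), key=os.path.getmtime)
    for f in files[:-keep_latest]:
        os.remove(f)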

demo.launch()