st192011 committed
Commit 9471780 · verified · 1 Parent(s): 7132f70

Update app.py

Files changed (1)
  1. app.py +59 -52
app.py CHANGED
@@ -11,14 +11,15 @@ from ultralytics import YOLOWorld
 from phonemizer import phonemize
 from transformers import pipeline
 from huggingface_hub import InferenceClient
-from torch.nn.functional import cosine_similarity

 # --- INITIALIZATION ---
 HF_TOKEN = os.getenv("HF_TOKEN")
-# Load a small YOLO World model for CPU efficiency
+
+# Load a small YOLO World model
+# Note: On first run, it downloads the weights automatically
 model_vision = YOLOWorld('yolov8s-world.pt')

-# Whisper for ASR
+# Whisper for ASR (Transcription)
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

 LANG_CONFIG = {
@@ -31,60 +32,54 @@ LANG_CONFIG = {

 # --- VISION LOGIC ---
 def detect_objects(img, target_queries):
+    if img is None:
+        return None, "Please upload an image."
+
     # Set custom classes based on user input
     if target_queries:
         classes = [x.strip() for x in target_queries.split(",")]
         model_vision.set_classes(classes)
     else:
-        # Default common objects
-        model_vision.set_classes(["chair", "table", "person", "bottle", "cup", "fruit", "book"])
+        # Default common objects for language learning
+        model_vision.set_classes(["chair", "table", "bottle", "cup", "fruit", "book", "laptop", "backpack"])

     results = model_vision.predict(img, conf=0.3)

     # Draw results on image
     annotated_img = results[0].plot()

+    # Convert BGR (OpenCV format) to RGB for Gradio
+    annotated_img = annotated_img[..., ::-1]
+
     # Extract unique labels
     detected_labels = []
     for c in results[0].boxes.cls:
         detected_labels.append(model_vision.names[int(c)])

-    return annotated_img, list(set(detected_labels))
+    return annotated_img, ", ".join(list(set(detected_labels)))

 # --- TRANSLATION & FEEDBACK LOGIC ---
-def get_llm_feedback(lang_name, english_word, student_speech, student_ipa, target_ipa):
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-
-    prompt = f"""
-    Target Word: {english_word} in {lang_name}.
-    Native IPA: /{target_ipa}/
-    Student IPA: /{student_ipa}/
-    Student said: "{student_speech}"
-
-    The student is learning {lang_name}. Identify the main pronunciation error and give 1 short anatomical tip (tongue/lip placement) in English.
-    """
-
+def get_llm_response(model_id, system_prompt, user_prompt):
+    client = InferenceClient(model=model_id, token=HF_TOKEN)
     try:
-        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+        output = client.chat_completion(messages, max_tokens=200)
         return output.choices[0].message.content
-    except:
-        return "LLM Busy. Try again in a moment."
+    except Exception as e:
+        return f"AI Error: {str(e)}"

-def translate_labels(lang_name, labels):
-    if not labels: return "No objects detected."
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+def translate_labels(lang_name, labels_str):
+    if not labels_str or labels_str == "No objects detected.":
+        return "Nothing to translate."

-    labels_str = ", ".join(labels)
-    prompt = f"Translate these English object labels into {lang_name}. Provide the results as a comma-separated list. Labels: {labels_str}"
+    system = f"You are a helpful translator for a language learning app."
+    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return only a comma-separated list."

-    try:
-        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
-        return output.choices[0].message.content
-    except:
-        return labels_str  # Fallback to English
+    return get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)

 # --- AUDIO LOGIC ---
 async def play_tts(text, lang_name):
+    if not text: return None
     voice = LANG_CONFIG[lang_name]["voice"]
     path = "ref.mp3"
     communicate = edge_tts.Communicate(text, voice)
@@ -92,55 +87,67 @@ async def play_tts(text, lang_name):
     return path

 def analyze_audio(lang_name, target_text, audio_path):
-    if not audio_path: return "Record your voice!", "", ""
+    if not audio_path or not target_text:
+        return "Record your voice and provide text!", "", ""

     # 1. ASR
     asr_res = asr_pipe(audio_path)["text"].strip()

     # 2. IPA
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
-    target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
-    user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    try:
+        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
+        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    except:
+        target_ipa = "IPA Error"
+        user_ipa = "IPA Error"
+
+    # 3. LLM Anatomical Feedback
+    system = "You are a professional Phonetics Coach."
+    prompt = (f"Target: '{target_text}' (IPA: /{target_ipa}/). "
+              f"Student said: '{asr_res}' (IPA: /{user_ipa}/). "
+              f"Identify the main pronunciation error and give 1 anatomical tip in English.")

-    # 3. LLM Feedback
-    feedback = get_llm_feedback(lang_name, target_text, asr_res, user_ipa, target_ipa)
+    feedback = get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)

     return asr_res, f"/{user_ipa}/", feedback

 # --- UI ---
+# Moved theme into Blocks constructor (or it can go in launch)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 👁️ PANINI Vision: Visual Language Coach")
-    gr.Markdown("Identify objects in your world and master their names in any language.")
+    gr.HTML("<h1 style='text-align: center;'>👁️ PANINI Vision</h1>")
+    gr.HTML("<p style='text-align: center;'>Discover your world in any language.</p>")

-    with gr.Tab("Step 1: Visual Discovery"):
+    with gr.Tab("1. Scan & Discover"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(type="pill", label="Upload or Capture Photo")
-                target_tags = gr.Textbox(label="Custom Tags (Optional)", placeholder="e.g. coffee, snacks, cat")
+                # FIXED: type="pill" -> type="pil"
+                input_img = gr.Image(type="pil", label="Upload or Capture Photo")
+                target_tags = gr.Textbox(label="Target specific things?", placeholder="e.g. apple, dog, keyboard")
                 btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
             with gr.Column():
-                output_img = gr.Image(label="Identified Objects")
-                detected_list = gr.Textbox(label="Detected English Objects")
+                output_img = gr.Image(label="Annotated View")
+                detected_list = gr.Textbox(label="Objects Found (English)")

-    with gr.Tab("Step 2: Naming & Practice"):
+    with gr.Tab("2. Practice Naming"):
         with gr.Row():
-            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Target Language", value="Spanish")
+            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learn in...", value="Spanish")
             btn_trans = gr.Button("🌐 Translate Labels")

-        translated_box = gr.Textbox(label="Vocabulary List (Study these!)")
+        translated_box = gr.Textbox(label="Vocabulary List")

         with gr.Row():
-            target_word = gr.Textbox(label="Type word to practice")
-            btn_play = gr.Button("🔊 Hear Native", scale=0)
-            audio_ref = gr.Audio(label="Reference Audio", type="filepath")
+            target_word = gr.Textbox(label="Word to Practice")
+            btn_play = gr.Button("🔊 Listen", scale=0)
+            audio_ref = gr.Audio(label="Reference", type="filepath")

         with gr.Row():
-            audio_user = gr.Audio(label="Record Your Pronunciation", sources=["microphone"], type="filepath")
-            btn_analyze = gr.Button("🚀 Analyze My Speech", variant="primary")
+            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
+            btn_analyze = gr.Button("🚀 Analyze Speech", variant="primary")

         with gr.Row():
             out_heard = gr.Textbox(label="AI Heard")
-            out_ipa = gr.Textbox(label="Your Phonetics (IPA)")
+            out_ipa = gr.Textbox(label="Your IPA")
             out_feedback = gr.Markdown()

     # --- ACTIONS ---
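
The diff ends where the ACTIONS section begins, so the event wiring itself is not shown. For context, a minimal sketch of how that section might connect the handlers and components defined above, assuming standard Gradio .click() listeners (the actual wiring in app.py may differ; this snippet is not part of the commit):

    # --- ACTIONS --- (hypothetical sketch, placed inside the gr.Blocks context)
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=translated_box)
    # Gradio accepts async handlers such as play_tts directly
    btn_play.click(play_tts, inputs=[target_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_audio, inputs=[lang_drop, target_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])

demo.launch()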