Create app.py
app.py ADDED
@@ -0,0 +1,152 @@
import os
import asyncio
import edge_tts
import librosa
import torch
import numpy as np
import pandas as pd
import gradio as gr
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient
from torch.nn.functional import cosine_similarity

# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")
# Load a small YOLO World model for CPU efficiency
model_vision = YOLOWorld('yolov8s-world.pt')

# Whisper (tiny) for ASR; device=-1 keeps it on CPU
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

# Per-language settings: phonemizer (espeak) language code and edge-tts voice
LANG_CONFIG = {
    "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
}

# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    # Set custom open-vocabulary classes based on user input
    if target_queries:
        classes = [x.strip() for x in target_queries.split(",")]
        model_vision.set_classes(classes)
    else:
        # Default common objects
        model_vision.set_classes(["chair", "table", "person", "bottle", "cup", "fruit", "book"])

    results = model_vision.predict(img, conf=0.3)

    # Draw results on image (Ultralytics plots in BGR; flip to RGB for Gradio)
    annotated_img = results[0].plot()[:, :, ::-1]

    # Extract unique labels
    detected_labels = []
    for c in results[0].boxes.cls:
        detected_labels.append(model_vision.names[int(c)])

    # Return a comma-separated string so the Textbox (and translate_labels) receive clean text
    return annotated_img, ", ".join(sorted(set(detected_labels)))

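# Hypothetical smoke test outside the UI (file name and tags are examples only):
#   annotated, labels = detect_objects("desk.jpg", "laptop, mug")
#   print(labels)  # e.g. "laptop, mug"
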
# --- TRANSLATION & FEEDBACK LOGIC ---
def get_llm_feedback(lang_name, english_word, student_speech, student_ipa, target_ipa):
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)

    prompt = f"""
    Target Word: {english_word} in {lang_name}.
    Native IPA: /{target_ipa}/
    Student IPA: /{student_ipa}/
    Student said: "{student_speech}"

    The student is learning {lang_name}. Identify the main pronunciation error and give 1 short anatomical tip (tongue/lip placement) in English.
    """

    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        return output.choices[0].message.content
    except Exception:
        return "LLM busy. Try again in a moment."

def translate_labels(lang_name, labels):
    if not labels:
        return "No objects detected."
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)

    # `labels` arrives from a Textbox as an already comma-separated string
    labels_str = labels if isinstance(labels, str) else ", ".join(labels)
    prompt = f"Translate these English object labels into {lang_name}. Provide the results as a comma-separated list. Labels: {labels_str}"

    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception:
        return labels_str  # Fallback to English

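# Hypothetical call (exact output depends on the LLM):
#   translate_labels("Spanish", "cup, chair")  # -> "taza, silla"
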
# --- AUDIO LOGIC ---
async def play_tts(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    path = "ref.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(path)
    return path

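# Note: edge-tts synthesizes via Microsoft's online service, so network access is required.
# Example: asyncio.run(play_tts("hola", "Spanish")) writes ref.mp3 and returns its path.
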
def analyze_audio(lang_name, target_text, audio_path):
    if not audio_path:
        return "Record your voice!", "", ""

    # 1. ASR: transcribe the recording with Whisper
    asr_res = asr_pipe(audio_path)["text"].strip()

    # 2. IPA: phonemize target and transcription (the espeak backend needs the espeak-ng system package)
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
    user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)

    # 3. LLM feedback
    feedback = get_llm_feedback(lang_name, target_text, asr_res, user_ipa, target_ipa)

    return asr_res, f"/{user_ipa}/", feedback

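# Hypothetical end-to-end check (values are illustrative only):
#   heard, ipa, tip = analyze_audio("Spanish", "gato", "recording.wav")
#   -> ("gato", "/ɡatˈo/", "a one-sentence articulation tip from the LLM")
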
# --- UI ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# PANINI Vision: Visual Language Coach")
    gr.Markdown("Identify objects in your world and master their names in any language.")

    with gr.Tab("Step 1: Visual Discovery"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Upload or Capture Photo")
                target_tags = gr.Textbox(label="Custom Tags (Optional)", placeholder="e.g. coffee, snacks, cat")
                btn_scan = gr.Button("Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="Identified Objects")
                detected_list = gr.Textbox(label="Detected English Objects")

    with gr.Tab("Step 2: Naming & Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Target Language", value="Spanish")
            btn_trans = gr.Button("Translate Labels")

        translated_box = gr.Textbox(label="Vocabulary List (Study these!)")

        with gr.Row():
            target_word = gr.Textbox(label="Type word to practice")
            btn_play = gr.Button("Hear Native", scale=0)
            audio_ref = gr.Audio(label="Reference Audio", type="filepath")

        with gr.Row():
            audio_user = gr.Audio(label="Record Your Pronunciation", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("Analyze My Speech", variant="primary")

        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your Phonetics (IPA)")
        out_feedback = gr.Markdown()

+
# --- ACTIONS ---
|
| 147 |
+
btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
|
| 148 |
+
btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=translated_box)
|
| 149 |
+
btn_play.click(fn=lambda t, l: asyncio.run(play_tts(t, l)), inputs=[target_word, lang_drop], outputs=audio_ref)
|
| 150 |
+
btn_analyze.click(analyze_audio, inputs=[lang_drop, target_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
|
| 151 |
+
|
| 152 |
+
demo.launch()
|