Update app.py
app.py
CHANGED

@@ -6,6 +6,8 @@ import torch
import numpy as np
import pandas as pd
import gradio as gr
import re
import time
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize

@@ -15,11 +17,10 @@ from huggingface_hub import InferenceClient
# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")

# Load YOLO World (Small); on first run it downloads the weights automatically
model_vision = YOLOWorld('yolov8s-world.pt')

# Whisper for ASR (tiny checkpoint for speed; device=-1 keeps the pipeline on CPU)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

LANG_CONFIG = {  # per-language config read below: "voice" (edge-tts voice id) and "ipa" (espeak language code)

@@ -33,127 +34,151 @@ LANG_CONFIG = {
# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    if img is None:
        return None, "Please upload an image first."

    # 1. Reset/set the vocabulary
    if target_queries and len(target_queries.strip()) > 0:
        classes = [x.strip() for x in target_queries.split(",")]
    else:
        # Balanced default list to prevent "bottle" bias
        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase",
                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut",
                   "cake", "chair", "couch", "potted plant", "bed", "dining table",
                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]

    # Force YOLO to update its internal class list
    model_vision.set_classes(classes)

    # 2. Prediction (slightly higher confidence to reduce noise)
    results = model_vision.predict(img, conf=0.4)

    # 3. Process the image: plot() returns BGR (OpenCV convention), flip to RGB for Gradio
    annotated_img = results[0].plot()
    annotated_img = annotated_img[..., ::-1]

    # 4. Extract labels (deduplicated; set() does not preserve detection order)
    found_labels = []
    for c in results[0].boxes.cls:
        found_labels.append(model_vision.names[int(c)])

    label_list = ", ".join(set(found_labels)) if found_labels else "No objects found. Try adjusting 'Custom Tags'."

    return annotated_img, label_list
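
# (Illustrative sketch, not wired into the UI.) The open-vocabulary flow above
# can be exercised standalone: set_classes() swaps the detector's prompt list
# and predict() then only searches for those classes. The queries and image
# path below are hypothetical examples.
def _demo_open_vocab(image_path="desk.jpg"):
    model_vision.set_classes(["guitar", "mug", "water bottle"])
    res = model_vision.predict(image_path, conf=0.4)
    return sorted({model_vision.names[int(c)] for c in res[0].boxes.cls})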

# --- TRANSLATION ---
def translate_labels(lang_name, labels_str):
    if not labels_str or "No objects" in labels_str:
        return "No objects detected to translate."

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return ONLY the translated words as a comma-separated list."

    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception as e:
        return f"Translation Error: {str(e)}"
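
# (Hypothetical helper.) This commit adds `import re`, but none of the shown
# hunks use it; one plausible consumer is normalizing the LLM's comma-separated
# reply (stripping stray numbering or bullets) before it reaches the UI.
def _clean_vocab(reply):
    items = [re.sub(r"^\s*[\d\.\-\*\)]+\s*", "", w).strip() for w in reply.split(",")]
    return ", ".join(i for i in items if i)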

# --- SPEECH LOGIC (FIXED) ---
async def tts_core(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    # Use a timestamped filename to prevent browser audio-caching issues
    filename = f"ref_{int(time.time())}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)
    return filename

def handle_tts(text, lang_name):
    if not text:
        return None
    # edge-tts is async, but Gradio click handlers are plain functions,
    # so drive the coroutine to completion here.
    return asyncio.run(tts_core(text, lang_name))

def analyze_speech(lang_name, target_text, audio_path):
    if not audio_path or not target_text:
        return "Missing recording or target word.", "", "Please provide both."

    # ASR: transcribe the learner's recording
    asr_res = asr_pipe(audio_path)["text"].strip()

    # IPA: phonemizer's espeak backend requires the espeak-ng library
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        target_ipa = "Error"
        user_ipa = "Error"

    # LLM feedback on the gap between target and produced IPA
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. "
              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")

    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        feedback = fb.choices[0].message.content
    except Exception:
        feedback = "Speech analysis busy. Try again."

    return asr_res, f"/{user_ipa}/", feedback
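
# (Illustrative sketch, not used by the UI.) A cheap numeric companion to the
# LLM feedback: character-level similarity of the two IPA strings via difflib
# from the standard library.
import difflib

def _ipa_similarity(target_ipa, user_ipa):
    return round(difflib.SequenceMatcher(None, target_ipa, user_ipa).ratio(), 2)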

# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center;'>PANINI Vision</h1>")

    with gr.Accordion("How to use (Instructions)", open=False):
        gr.Markdown("""
        ### 1. Vision Step
        * **Upload a photo** of your room or desk.
        * **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender` and the AI will look *specifically* for those items. If you leave it blank, a general list is used.
        * Click **Scan Environment**.

        ### 2. Translation & Speech Step
        * Select your **Learning Language**.
        * Click **Translate Labels** to turn the English names into your learning language.
        * **Copy** one of those words into the 'Word to Practice' box.
        * **Listen** to the AI, then **Record** yourself to get feedback!
        """)

    with gr.Tab("1. Discover Objects"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Capture your world")
                target_tags = gr.Textbox(label="Target specific things? (Comma separated)", placeholder="e.g. guitar, plant, blue book")
                btn_scan = gr.Button("Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="AI Detection")
                detected_list = gr.Textbox(label="Detected Objects (English)")

    with gr.Tab("2. Language Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
            btn_trans = gr.Button("Translate Labels")

        vocab_output = gr.Textbox(label="Translated Vocabulary")

        with gr.Row():
            practice_word = gr.Textbox(label="Word to Practice")
            btn_listen = gr.Button("Listen", scale=0)
            audio_ref = gr.Audio(label="Native Reference", type="filepath")

        with gr.Row():
            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("Analyze Pronunciation", variant="primary")

        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your IPA")
            out_feedback = gr.Markdown()

    # --- BUTTON LOGIC ---
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)

    # Fixed speech logic
    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
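
# (Hypothetical housekeeping sketch.) tts_core() writes a new ref_<timestamp>.mp3
# into the working directory on every Listen click; something like this would
# keep the Space's disk usage bounded.
import glob

def _prune_refs(keep_latest=5):
    files = sorted(glob.glob("ref_*.mp3"), key=os.path.getmtime)
    for f in files[:-keep_latest]:
        os.remove(f)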

demo.launch()