st192011 committed on
Commit 038431f · verified · 1 Parent(s): 9471780

Update app.py

Files changed (1)
  1. app.py +97 -72
app.py CHANGED
@@ -6,6 +6,8 @@ import torch
 import numpy as np
 import pandas as pd
 import gradio as gr
+import re
+import time
 from PIL import Image
 from ultralytics import YOLOWorld
 from phonemizer import phonemize
@@ -15,11 +17,10 @@ from huggingface_hub import InferenceClient
 # --- INITIALIZATION ---
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# Load a small YOLO World model
-# Note: On first run, it downloads the weights automatically
+# Load YOLO World (Small)
 model_vision = YOLOWorld('yolov8s-world.pt')
 
-# Whisper for ASR (Transcription)
+# Whisper for ASR (Using tiny for speed)
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
 
 LANG_CONFIG = {
@@ -33,127 +34,151 @@ LANG_CONFIG = {
 # --- VISION LOGIC ---
 def detect_objects(img, target_queries):
     if img is None:
-        return None, "Please upload an image."
+        return None, "Please upload an image first."
 
-    # Set custom classes based on user input
-    if target_queries:
+    # 1. Reset/Set Vocabulary
+    if target_queries and len(target_queries.strip()) > 0:
         classes = [x.strip() for x in target_queries.split(",")]
-        model_vision.set_classes(classes)
     else:
-        # Default common objects for language learning
-        model_vision.set_classes(["chair", "table", "bottle", "cup", "fruit", "book", "laptop", "backpack"])
+        # Balanced default list to prevent "bottle" bias
+        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase",
+                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut",
+                   "cake", "chair", "couch", "potted plant", "bed", "dining table",
+                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]
+
+    # Force YOLO to update its internal class list
+    model_vision.set_classes(classes)
 
-    results = model_vision.predict(img, conf=0.3)
+    # 2. Prediction (Slightly higher confidence to reduce noise)
+    results = model_vision.predict(img, conf=0.4)
 
-    # Draw results on image
+    # 3. Process Image
     annotated_img = results[0].plot()
-
-    # Convert BGR (OpenCV format) to RGB for Gradio
+    # Flip BGR to RGB
     annotated_img = annotated_img[..., ::-1]
 
-    # Extract unique labels
-    detected_labels = []
+    # 4. Extract Labels
+    found_labels = []
     for c in results[0].boxes.cls:
-        detected_labels.append(model_vision.names[int(c)])
+        found_labels.append(model_vision.names[int(c)])
 
-    return annotated_img, ", ".join(list(set(detected_labels)))
-
-# --- TRANSLATION & FEEDBACK LOGIC ---
-def get_llm_response(model_id, system_prompt, user_prompt):
-    client = InferenceClient(model=model_id, token=HF_TOKEN)
-    try:
-        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
-        output = client.chat_completion(messages, max_tokens=200)
-        return output.choices[0].message.content
-    except Exception as e:
-        return f"AI Error: {str(e)}"
+    label_list = ", ".join(list(set(found_labels))) if found_labels else "No objects found. Try adjusting 'Custom Tags'."
+
+    return annotated_img, label_list
 
+# --- TRANSLATION ---
 def translate_labels(lang_name, labels_str):
-    if not labels_str or labels_str == "No objects detected.":
-        return "Nothing to translate."
+    if not labels_str or "No objects" in labels_str:
+        return "No objects detected to translate."
 
-    system = f"You are a helpful translator for a language learning app."
-    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return only a comma-separated list."
+    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return ONLY the translated words as a comma-separated list."
 
-    return get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)
+    try:
+        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
+        return output.choices[0].message.content
+    except Exception as e:
+        return f"Translation Error: {str(e)}"
 
-# --- AUDIO LOGIC ---
-async def play_tts(text, lang_name):
-    if not text: return None
+# --- SPEECH LOGIC (FIXED) ---
+async def tts_core(text, lang_name):
     voice = LANG_CONFIG[lang_name]["voice"]
-    path = "ref.mp3"
+    # Use timestamp to prevent browser audio caching issues
+    filename = f"ref_{int(time.time())}.mp3"
     communicate = edge_tts.Communicate(text, voice)
-    await communicate.save(path)
-    return path
+    await communicate.save(filename)
+    return filename
+
+def handle_tts(text, lang_name):
+    if not text: return None
+    return asyncio.run(tts_core(text, lang_name))
 
-def analyze_audio(lang_name, target_text, audio_path):
+def analyze_speech(lang_name, target_text, audio_path):
     if not audio_path or not target_text:
-        return "Record your voice and provide text!", "", ""
+        return "Missing recording or target word.", "", "Please provide both."
 
-    # 1. ASR
+    # ASR
     asr_res = asr_pipe(audio_path)["text"].strip()
 
-    # 2. IPA
+    # IPA
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
     try:
         target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
         user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
     except:
-        target_ipa = "IPA Error"
-        user_ipa = "IPA Error"
+        target_ipa = "Error"
+        user_ipa = "Error"
 
-    # 3. LLM Anatomical Feedback
-    system = "You are a professional Phonetics Coach."
-    prompt = (f"Target: '{target_text}' (IPA: /{target_ipa}/). "
-              f"Student said: '{asr_res}' (IPA: /{user_ipa}/). "
-              f"Identify the main pronunciation error and give 1 anatomical tip in English.")
-
-    feedback = get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)
+    # LLM Feedback
+    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. "
+              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")
+
+    try:
+        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
+        feedback = fb.choices[0].message.content
+    except:
+        feedback = "Speech analysis busy. Try again."
+
     return asr_res, f"/{user_ipa}/", feedback
 
 # --- UI ---
-# Moved theme into Blocks constructor (or it can go in launch)
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.HTML("<h1 style='text-align: center;'>👁️ PANINI Vision</h1>")
-    gr.HTML("<p style='text-align: center;'>Discover your world in any language.</p>")
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
+    gr.HTML("<h1 style='text-align: center;'>🎙️ PANINI Vision</h1>")
+
+    with gr.Accordion("📖 How to use (Instruction)", open=False):
+        gr.Markdown("""
+        ### 1. Vision Step
+        * **Upload a photo** of your room or desk.
+        * **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender`. The AI will look *specifically* for those items. If you leave it blank, it uses a general list.
+        * Click **Scan Environment**.
+
+        ### 2. Translation & Speech Step
+        * Select your **Target Language**.
+        * Click **Translate Labels** to turn the English names into your learning language.
+        * **Copy** one of those words into the 'Word to Practice' box.
+        * **Listen** to the AI, then **Record** yourself to get feedback!
+        """)
 
-    with gr.Tab("1. Scan & Discover"):
+    with gr.Tab("1. Discover Objects"):
         with gr.Row():
             with gr.Column():
-                # FIXED: type="pill" -> type="pil"
-                input_img = gr.Image(type="pil", label="Upload or Capture Photo")
-                target_tags = gr.Textbox(label="Target specific things?", placeholder="e.g. apple, dog, keyboard")
+                input_img = gr.Image(type="pil", label="Capture your world")
+                target_tags = gr.Textbox(label="Target specific things? (Comma separated)", placeholder="e.g. guitar, plant, blue book")
                 btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
             with gr.Column():
-                output_img = gr.Image(label="Annotated View")
-                detected_list = gr.Textbox(label="Objects Found (English)")
+                output_img = gr.Image(label="AI Detection")
+                detected_list = gr.Textbox(label="Detected Objects (English)")
 
-    with gr.Tab("2. Practice Naming"):
+    with gr.Tab("2. Language Practice"):
         with gr.Row():
-            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learn in...", value="Spanish")
+            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
             btn_trans = gr.Button("🌐 Translate Labels")
 
-        translated_box = gr.Textbox(label="Vocabulary List")
+        vocab_output = gr.Textbox(label="Translated Vocabulary")
 
         with gr.Row():
-            target_word = gr.Textbox(label="Word to Practice")
-            btn_play = gr.Button("🔊 Listen", scale=0)
-            audio_ref = gr.Audio(label="Reference", type="filepath")
+            practice_word = gr.Textbox(label="Word to Practice")
+            btn_listen = gr.Button("🔊 Listen", scale=0)
+            audio_ref = gr.Audio(label="Native Reference", type="filepath")
 
         with gr.Row():
             audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
-            btn_analyze = gr.Button("🚀 Analyze Speech", variant="primary")
+            btn_analyze = gr.Button("🚀 Analyze Pronunciation", variant="primary")
 
         with gr.Row():
             out_heard = gr.Textbox(label="AI Heard")
             out_ipa = gr.Textbox(label="Your IPA")
             out_feedback = gr.Markdown()
 
-    # --- ACTIONS ---
+    # --- BUTTON LOGIC ---
     btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
-    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=translated_box)
-    btn_play.click(fn=lambda t, l: asyncio.run(play_tts(t, l)), inputs=[target_word, lang_drop], outputs=audio_ref)
-    btn_analyze.click(analyze_audio, inputs=[lang_drop, target_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
+    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)
+
+    # Fixed Speech logic
+    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
+    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
 
 demo.launch()
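
Reviewer note: a minimal sketch for exercising the two code paths this commit changes (open-vocabulary detection and the timestamped TTS file) outside Gradio. It reuses only calls that appear in the diff above (YOLOWorld.set_classes / predict, results[0].plot() / boxes.cls, edge_tts.Communicate); the script name, the image path "test.jpg", and the voice name "es-ES-ElviraNeural" are placeholder assumptions, since LANG_CONFIG is not shown in this hunk.

# sketch_check.py (hypothetical) -- assumes the same dependencies as app.py: ultralytics, edge_tts
import asyncio
import time

import edge_tts
from ultralytics import YOLOWorld

# 1. Open-vocabulary detection: set a custom class list, then predict at conf=0.4 as in the new detect_objects().
model = YOLOWorld("yolov8s-world.pt")            # downloads the weights on first run
model.set_classes(["cup", "laptop", "book"])     # free-text labels, like the app's Custom Tags box
results = model.predict("test.jpg", conf=0.4)    # "test.jpg" is a placeholder image path
labels = {model.names[int(c)] for c in results[0].boxes.cls}
print("detected:", ", ".join(labels) or "nothing above conf=0.4")

# 2. TTS with a timestamped filename, mirroring tts_core()/handle_tts() above,
#    so the browser never replays a stale cached ref.mp3.
async def tts(text, voice):
    path = f"ref_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path

print("audio saved to:", asyncio.run(tts("la taza", "es-ES-ElviraNeural")))  # voice name is an assumption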