st192011 committed on
Commit
1677275
Β·
verified Β·
1 Parent(s): b4fe004

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -57
app.py CHANGED
@@ -16,14 +16,17 @@ from datasets import load_dataset
16
 
17
  # --- CONFIG & MODELS ---
18
  HF_TOKEN = os.getenv("HF_TOKEN")
19
- # Load YOLO World (Small) - efficient for CPU
20
  model_vision = YOLOWorld('yolov8s-world.pt')
21
  asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
22
 
23
  # Initialize COCO Dataset Streaming
24
  print("Initialising COCO Dataset streaming...")
25
- ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
26
- ds_iter = iter(ds)
 
 
 
 
27
 
28
  LANG_CONFIG = {
29
  "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
@@ -35,72 +38,79 @@ LANG_CONFIG = {
35
  # --- FUNCTIONS ---
36
 
37
  def get_random_coco_image():
38
- """Pulls a random image from the COCO dataset on Hugging Face"""
39
- global ds_iter # Declared at the top to avoid SyntaxError
40
  try:
41
- # Skip a few items for variety
42
- sample = None
43
- for _ in range(random.randint(1, 5)):
44
  sample = next(ds_iter)
45
  return sample['image']
46
- except (StopIteration, NameError):
47
- # Re-initialize if we hit the end of the stream
48
- ds_iter = iter(ds)
49
- sample = next(ds_iter)
50
- return sample['image']
51
 
52
- def scan_scene(img, lang_name):
53
  if img is None:
54
  return None, "Please get a scene first.", []
55
 
56
- # Define broad vocabulary classes for discovery
57
- classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
58
- "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]
 
 
 
 
 
 
59
  model_vision.set_classes(classes)
60
 
 
61
  results = model_vision.predict(img, conf=0.25)
62
- annotated_img = results[0].plot()[..., ::-1] # Convert BGR to RGB
63
 
64
- found_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
 
65
 
66
- # Translate via LLM
67
  client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
68
  trans_map = {}
69
 
70
- if found_labels:
71
- prompt = f"Translate these English nouns to {lang_name}: {', '.join(found_labels)}. Format strictly as English:Translated, English:Translated"
 
72
  try:
73
  res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
 
74
  for pair in res_text.split(","):
75
  if ":" in pair:
76
  eng, trans = pair.split(":")
77
  trans_map[eng.strip().lower()] = trans.strip()
78
- except:
79
- trans_map = {lbl: lbl for lbl in found_labels}
 
80
 
81
- # Map detections for coordinate-based clicking
82
  detections = []
83
  for box in results[0].boxes:
84
- label = model_vision.names[int(box.cls)].lower()
85
- translated_label = trans_map.get(label, label)
86
  coords = box.xyxy[0].tolist()
87
- detections.append({"label": translated_label, "box": coords})
88
 
89
- return annotated_img, ", ".join(trans_map.values()), detections
 
90
 
91
  def on_image_click(evt: gr.SelectData, detections):
92
- """Triggered when user clicks the image results"""
93
  if not detections:
94
- return "Please scan the image first!", ""
95
 
96
  click_x, click_y = evt.index
97
  for det in detections:
98
  x1, y1, x2, y2 = det["box"]
99
- # Check if click is inside the bounding box
100
  if x1 <= click_x <= x2 and y1 <= click_y <= y2:
101
- return f"🎯 Selected: **{det['label']}**", det['label']
 
102
 
103
- return "πŸ’‘ Click inside a colored box!", ""
104
 
105
  async def tts_task(text, lang_name):
106
  if not text: return None
@@ -111,7 +121,7 @@ async def tts_task(text, lang_name):
111
 
112
  def run_feedback(target, lang_name, audio_path):
113
  if not audio_path or not target:
114
- return "Record audio and select a word first.", "", ""
115
 
116
  asr_res = asr_pipe(audio_path)["text"].strip()
117
  ipa_code = LANG_CONFIG[lang_name]["ipa"]
@@ -123,52 +133,71 @@ def run_feedback(target, lang_name, audio_path):
123
  t_ipa, u_ipa = "N/A", "N/A"
124
 
125
  client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
126
- prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Identify the error and give 1 anatomical tip in English."
127
  try:
128
  fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
129
  return asr_res, f"/{u_ipa}/", fb
130
  except:
131
- return asr_res, f"/{u_ipa}/", "Coach is busy, please try again."
132
 
133
  # --- UI ---
134
- with gr.Blocks(css=".gradio-container {max-width: 1000px !important}") as demo:
135
- gr.HTML("<h1 style='text-align: center; color: #2563eb;'>πŸ“‡ PANINI Flashcards</h1>")
136
- gr.Markdown("Click a scene, scan it, and click objects to learn and practice.")
 
 
137
 
138
  current_dets = gr.State([])
139
 
140
  with gr.Row():
141
  with gr.Column(scale=1):
142
- lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Target Language", value="Spanish")
143
  btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
144
- input_img = gr.Image(type="pil", label="Initial Scene", interactive=False)
 
 
145
  btn_scan = gr.Button("πŸ” Scan Vocabulary", variant="primary")
146
 
147
  with gr.Column(scale=2):
148
  gr.Markdown("### Interactive Discovery")
149
- display_img = gr.Image(label="Click an object box to practice", interactive=True)
150
- status_lab = gr.Markdown("1. Get a scene. 2. Scan. 3. Click an object!")
151
- vocab_list = gr.Textbox(label="Words Found in this Scene", interactive=False)
152
 
153
  with gr.Row():
154
  with gr.Column():
155
- gr.Markdown("### Practice Area")
156
- practice_word = gr.Textbox(label="Word to Practice (Autofilled on click)")
157
  btn_play = gr.Button("πŸ”Š Listen to Native", scale=0)
158
- audio_out = gr.Audio(label="Native Audio", type="filepath")
159
 
160
  with gr.Column():
161
- audio_in = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
162
- btn_eval = gr.Button("πŸš€ Check My Accent", variant="primary")
163
- res_heard = gr.Textbox(label="AI Transcription")
164
- res_fb = gr.Markdown()
165
 
166
- # --- ACTIONS ---
167
  btn_random.click(get_random_coco_image, outputs=input_img)
168
- btn_scan.click(scan_scene, [input_img, lang_drop], [display_img, vocab_list, current_dets])
169
- display_img.select(on_image_click, [current_dets], [status_lab, practice_word])
 
 
 
 
 
 
 
 
 
 
 
170
  btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)
 
171
  btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_heard, res_fb])
172
 
173
- # Launch with theme and SSR settings
174
- demo.launch(theme=gr.themes.Soft(primary_hue="blue"), ssr_mode=False)
 
 
 
 
16
 
17
# --- CONFIG & MODELS ---
# Hugging Face API token used by the InferenceClient calls below (may be None if unset).
HF_TOKEN = os.getenv("HF_TOKEN")
# Open-vocabulary YOLO-World detector, small variant — per the earlier revision's
# comment this was chosen as "efficient for CPU"; TODO confirm on target hardware.
model_vision = YOLOWorld('yolov8s-world.pt')
# Whisper-tiny speech-to-text pipeline; device=-1 forces CPU inference.
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
 
22
# Initialize COCO Dataset Streaming
print("Initialising COCO Dataset streaming...")
try:
    # Stream the validation split so nothing is downloaded up front.
    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
    ds_iter = iter(ds)
except Exception as e:
    # Keep the app usable even without the dataset: ds_iter is set to None,
    # which get_random_coco_image() checks before falling back to a static URL.
    print(f"Dataset init failed: {e}")
    ds_iter = None
30
 
31
  LANG_CONFIG = {
32
  "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
 
38
  # --- FUNCTIONS ---
39
 
40
def get_random_coco_image():
    """Pull a semi-random image from the streamed COCO dataset.

    Returns:
        A PIL image from the stream, or a known-good static image URL when
        the stream is unavailable or exhausted.
    """
    global ds_iter
    try:
        if ds_iter is None: raise ValueError("Dataset not ready")
        # Skip 1-3 items so consecutive clicks don't surface the same image.
        for _ in range(random.randint(1, 3)):
            sample = next(ds_iter)
        return sample['image']
    except Exception as e:
        # Previously the exception was captured but never used (silent swallow);
        # log it so stream failures are visible, then serve the fallback image.
        print(f"COCO stream unavailable ({e}); using fallback image.")
        return "http://images.cocodataset.org/val2017/000000000632.jpg"
 
 
 
49
 
50
def scan_scene(img, lang_name, custom_tags):
    """Detect objects in *img*, translate their labels to *lang_name*, and
    return (annotated image, comma-joined vocab string, detection list).

    Args:
        img: scene image (None triggers the "get a scene first" guard).
        lang_name: target language key, used in the translation prompt.
        custom_tags: optional comma-separated vocabulary to search for;
            when empty, a general discovery vocabulary is used.
    """
    if img is None:
        return None, "Please get a scene first.", []

    # 1. SET VOCABULARY (Open Vocabulary Feature)
    if custom_tags and custom_tags.strip():
        # User-defined search; drop empty entries left by stray commas ("a,,b", "a, b,").
        classes = [x.strip() for x in custom_tags.split(",") if x.strip()]
    else:
        # General discovery mode
        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
                   "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]

    model_vision.set_classes(classes)

    # 2. PREDICT
    results = model_vision.predict(img, conf=0.25)
    annotated_img = results[0].plot()[..., ::-1]  # BGR to RGB

    # 3. EXTRACT AND TRANSLATE
    eng_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    trans_map = {}

    if eng_labels:
        # Prompt LLM to create a translation dictionary
        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
        try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
            # Parse pairs like 'table:der Tisch'. Split on the FIRST colon only:
            # a bare split(":") raises ValueError on unpacking whenever the
            # translation itself contains a colon.
            for pair in res_text.split(","):
                if ":" in pair:
                    eng, trans = pair.split(":", 1)
                    trans_map[eng.strip().lower()] = trans.strip()
        except Exception as e:
            print(f"Translation Error: {e}")
            # Identity mapping so the UI still shows the English words.
            trans_map = {lbl.lower(): lbl for lbl in eng_labels}

    # 4. MAP DETECTIONS (Link box to translated word)
    detections = []
    for box in results[0].boxes:
        eng_label = model_vision.names[int(box.cls)].lower()
        translated_label = trans_map.get(eng_label, eng_label)
        coords = box.xyxy[0].tolist()
        detections.append({"translated": translated_label, "english": eng_label, "box": coords})

    vocab_display = ", ".join(trans_map.values())
    return annotated_img, vocab_display, detections
99
 
100
def on_image_click(evt: gr.SelectData, detections):
    """Resolve a click on the annotated image to the detection box it landed in.

    Returns a (status markdown, practice word) pair; the word is "" when the
    click misses every box or nothing has been scanned yet.
    """
    if not detections:
        return "Scan the image first!", ""

    px, py = evt.index
    # First detection (in scan order) whose bounding box contains the click point.
    hit = next(
        (d for d in detections
         if d["box"][0] <= px <= d["box"][2] and d["box"][1] <= py <= d["box"][3]),
        None,
    )
    if hit is None:
        return "πŸ’‘ Click directly inside a colored box!", ""

    translated_word = hit['translated']
    return f"🎯 Selected: **{translated_word}** ({hit['english']})", translated_word
114
 
115
  async def tts_task(text, lang_name):
116
  if not text: return None
 
121
 
122
  def run_feedback(target, lang_name, audio_path):
123
  if not audio_path or not target:
124
+ return "Select a word and record audio.", "", ""
125
 
126
  asr_res = asr_pipe(audio_path)["text"].strip()
127
  ipa_code = LANG_CONFIG[lang_name]["ipa"]
 
133
  t_ipa, u_ipa = "N/A", "N/A"
134
 
135
  client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
136
+ prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
137
  try:
138
  fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
139
  return asr_res, f"/{u_ipa}/", fb
140
  except:
141
+ return asr_res, f"/{u_ipa}/", "Coach is busy."
142
 
143
# --- UI ---
CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"

# `theme` is a gr.Blocks() constructor argument, not a launch() argument,
# so it is applied here rather than in demo.launch() below.
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>πŸŽ™οΈ PANINI Flashcards</h1>")
    gr.Markdown("1. Select language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan and Click boxes.")

    # Detection list from the last scan (translated/english labels + box coords).
    current_dets = gr.State([])

    with gr.Row():
        with gr.Column(scale=1):
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
            btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)

            custom_tags = gr.Textbox(label="πŸ” What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
            btn_scan = gr.Button("πŸ” Scan Vocabulary", variant="primary")

        with gr.Column(scale=2):
            gr.Markdown("### Interactive Discovery")
            display_img = gr.Image(label="Touch a box to practice that word", interactive=True)
            status_lab = gr.Markdown("Status: Ready.")
            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎀 Practice Area")
            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
            btn_play = gr.Button("πŸ”Š Listen to Native", scale=0)
            audio_out = gr.Audio(label="Native Reference", type="filepath")

        with gr.Column():
            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
            btn_eval = gr.Button("πŸš€ Analyze Accent", variant="primary")
            res_heard = gr.Textbox(label="What AI heard")
            # Dedicated output for run_feedback's second return value (the learner's
            # IPA). Previously the outputs list named res_heard twice, so the IPA
            # string overwrote the transcription on screen.
            res_ipa = gr.Textbox(label="Your IPA")
            res_fb = gr.Markdown(elem_classes=["feedback-box"])

    # --- EVENTS ---
    btn_random.click(get_random_coco_image, outputs=input_img)

    btn_scan.click(
        scan_scene,
        inputs=[input_img, lang_drop, custom_tags],
        outputs=[display_img, vocab_list, current_dets]
    )

    display_img.select(
        on_image_click,
        inputs=[current_dets],
        outputs=[status_lab, practice_word]
    )

    btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)

    btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_ipa, res_fb])

# Launch
demo.launch(ssr_mode=False)