st192011 committed
Commit b3d5547 · verified · 1 Parent(s): 25c274d

Update app.py

Files changed (1)
  1. app.py +82 -61
app.py CHANGED
@@ -18,12 +18,12 @@ HF_TOKEN = os.getenv("HF_TOKEN")
 model_vision = YOLOWorld('yolov8s-world.pt')
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
 
-# Sample Scenes (Public Domain Images)
+# Robust Scene Library (Using stable Wikimedia/Pixabay direct links)
 SAMPLE_SCENES = {
-    "🍎 Kitchen": "https://images.unsplash.com/photo-1556910103-1c02745aae4d?w=800",
-    "🌳 Park": "https://images.unsplash.com/photo-1588714477688-cf28a50e94f7?w=800",
-    "🏢 Office": "https://images.unsplash.com/photo-1497215728101-856f4ea42174?w=800",
-    "🛋️ Living Room": "https://images.unsplash.com/photo-1583847268964-b28dc2f51ac9?w=800"
+    "🍳 The Kitchen": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/31/Kitchen_in_the_White_House.jpg/1280px-Kitchen_in_the_White_House.jpg",
+    "🛋️ Living Room": "https://upload.wikimedia.org/wikipedia/commons/thumb/5/5a/Interior_of_a_living_room.jpg/1280px-Interior_of_a_living_room.jpg",
+    "🏙️ City Street": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/d8/London_Regent_Street_2.jpg/1280px-London_Regent_Street_2.jpg",
+    "🛒 Supermarket": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/00/Produce_department_of_a_supermarket.jpg/1280px-Produce_department_of_a_supermarket.jpg"
 }
 
 LANG_CONFIG = {
@@ -38,116 +38,137 @@ LANG_CONFIG = {
 def scan_scene(img, lang_name):
     if img is None: return None, "Please select a scene.", []
 
-    # 1. YOLO Scan
-    # Default vocabulary for discovery
-    model_vision.set_classes(["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink", "refrigerator", "oven"])
-    results = model_vision.predict(img, conf=0.3)
+    # 1. Broad Vocabulary Scan
+    classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
+               "refrigerator", "oven", "car", "person", "tree", "backpack", "clock"]
+    model_vision.set_classes(classes)
 
-    annotated_img = results[0].plot()[..., ::-1]
+    # Prediction
+    results = model_vision.predict(img, conf=0.25)
+    annotated_img = results[0].plot()[..., ::-1]  # BGR to RGB
 
-    # 2. Extract Data
+    # 2. Map Detections & Translate
+    found_labels = list(set([model_vision.names[int(box.cls)] for box in results[0].boxes]))
+
+    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+    trans_map = {}
+
+    if found_labels:
+        prompt = f"Translate these English nouns to {lang_name}: {', '.join(found_labels)}. Format: English:Translated, English:Translated"
+        try:
+            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
+            # Create a dictionary for quick lookup
+            for pair in res_text.split(","):
+                if ":" in pair:
+                    eng, trans = pair.split(":")
+                    trans_map[eng.strip().lower()] = trans.strip()
+        except:
+            trans_map = {lbl: lbl for lbl in found_labels}  # Fallback
+
+    # 3. Build Detection Objects with Translations
     detections = []
-    found_labels = []
     for box in results[0].boxes:
-        label = model_vision.names[int(box.cls)]
-        coords = box.xyxy[0].tolist()  # [x1, y1, x2, y2]
-        detections.append({"label": label, "box": coords})
-        found_labels.append(label)
+        label = model_vision.names[int(box.cls)].lower()
+        translated_label = trans_map.get(label, label)
+        coords = box.xyxy[0].tolist()
+        detections.append({"label": translated_label, "box": coords})
 
-    found_unique = list(set(found_labels))
-
-    # 3. Translate via LLM
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-    prompt = f"Translate these objects to {lang_name}: {', '.join(found_unique)}. Return ONLY a comma-separated list."
-    try:
-        translated = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=100).choices[0].message.content
-    except:
-        translated = ", ".join(found_unique)
-
-    return annotated_img, translated, detections
+    return annotated_img, ", ".join(trans_map.values()), detections
 
-def check_point(evt: gr.SelectData, detections, lang_name):
-    # evt.index gives [x, y] of click
+def check_point_and_update(evt: gr.SelectData, detections):
+    # evt.index gives [x, y] of the click
     click_x, click_y = evt.index
 
     for det in detections:
         x1, y1, x2, y2 = det["box"]
+        # Check if click is inside the bounding box
         if x1 <= click_x <= x2 and y1 <= click_y <= y2:
-            return f"🎯 You found the **{det['label']}**!"
+            translated_word = det['label']
+            return f"🎯 Found: **{translated_word}**", translated_word
 
-    return "❌ Try clicking exactly on an object."
+    return "❌ Try clicking exactly on an object box!", ""
 
 async def run_tts(text, lang_name):
+    if not text: return None
     voice = LANG_CONFIG[lang_name]["voice"]
     path = f"speech_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path
 
 def run_speech_analysis(target, lang_name, audio_path):
-    if not audio_path: return "No recording.", "", ""
+    if not audio_path or not target: return "No recording or target.", "", ""
     asr_res = asr_pipe(audio_path)["text"].strip()
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
 
-    t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
-    u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    try:
+        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
+        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    except:
+        t_ipa, u_ipa = "N/A", "N/A"
 
     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
     prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 anatomical tip in English."
-    feedback = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=100).choices[0].message.content
+    try:
+        feedback = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
+    except:
+        feedback = "Analysis busy."
 
     return asr_res, f"/{u_ipa}/", feedback
 
 # --- UI ---
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="orange")) as demo:
-    gr.HTML("<h1 style='text-align: center;'>📇 PANINI Flashcards</h1>")
-    gr.Markdown("Discover vocabulary in your environment or use our sample scenes.")
+    gr.HTML("<h1 style='text-align: center; color: #d97706;'>📇 PANINI Flashcards</h1>")
 
-    # State to hold detection data for the current image
+    # State stores detections for the current scan
     current_detections = gr.State([])
 
     with gr.Row():
         with gr.Column(scale=1):
             lang_choice = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
-            gr.Markdown("### Step 1: Pick a Scene")
-            scene_gallery = gr.Radio(choices=list(SAMPLE_SCENES.keys()), label="Sample Scenes")
-            upload_input = gr.Image(type="pil", label="OR Upload Your Own")
-            btn_scan = gr.Button("🔍 Scan for Vocabulary", variant="primary")
+            gr.Markdown("### Step 1: Choose a Scene")
+            scene_radio = gr.Radio(choices=list(SAMPLE_SCENES.keys()), label="Library")
+            img_input = gr.Image(type="pil", label="Scene Preview / Upload")
+            btn_scan = gr.Button("🔍 Discover Vocabulary", variant="primary")
 
         with gr.Column(scale=2):
-            gr.Markdown("### Step 2: Interactive Discovery")
-            main_display = gr.Image(label="Click an object to identify it", interactive=True)
-            click_feedback = gr.Markdown("*Detections will appear here...*")
-            found_vocab = gr.Textbox(label="Vocabulary List (Generated by AI)")
+            gr.Markdown("### Step 2: Point & Identify")
+            # The interactive image where the user clicks
+            display_img = gr.Image(label="Click an object to practice it!", interactive=True)
+            click_info = gr.Markdown("Click an object in the scanned image above.")
+            vocab_list = gr.Textbox(label="Detected Vocabulary", interactive=False)
 
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Step 3: Pronunciation Practice")
-            practice_word = gr.Textbox(label="Word to practice (Copy from list above)")
-            btn_play = gr.Button("🔊 Hear Native", scale=0)
-            audio_ref = gr.Audio(label="Native Reference", type="filepath")
+            gr.Markdown("### Step 3: Speak & Learn")
+            # This box gets filled automatically when the user clicks the image
+            practice_word = gr.Textbox(label="Word to Practice", placeholder="Click an object in the picture...")
+            btn_play = gr.Button("🔊 Native Pronunciation", scale=0)
+            audio_ref = gr.Audio(label="Reference", type="filepath")
 
         with gr.Column():
-            audio_user = gr.Audio(label="Record Yourself", sources=["microphone"], type="filepath")
-            btn_analyze = gr.Button("🚀 Analyze My Speech")
+            audio_user = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
+            btn_analyze = gr.Button("🚀 Analyze Accent", variant="primary")
             out_heard = gr.Textbox(label="AI Heard")
             out_feedback = gr.Markdown()
 
     # --- ACTIONS ---
 
-    # Scene selection logic
-    def load_scene(name):
-        return SAMPLE_SCENES[name]
-    scene_gallery.change(load_scene, scene_gallery, upload_input)
+    # Handle Scene Selection
+    scene_radio.change(lambda name: SAMPLE_SCENES[name], scene_radio, img_input)
 
-    # Scan logic
     btn_scan.click(
         scan_scene,
-        inputs=[upload_input, lang_choice],
-        outputs=[main_display, found_vocab, current_detections]
+        inputs=[img_input, lang_choice],
+        outputs=[display_img, vocab_list, current_detections]
     )
 
-    # Pointing Logic
-    main_display.select(check_point, [current_detections, lang_choice], click_feedback)
+    # Handle Image Pointing (This updates the practice box!)
+    display_img.select(
+        check_point_and_update,
+        inputs=[current_detections],
+        outputs=[click_info, practice_word]
+    )
 
     # Speech Logic
     btn_play.click(lambda t, l: asyncio.run(run_tts(t, l)), [practice_word, lang_choice], audio_ref)
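
A note on the translation step added in this commit: scan_scene now asks the model for pairs in an "English:Translated, English:Translated" format and parses the reply into a lookup dictionary keyed by the lower-cased English label. The sketch below isolates that parsing so it can be tried without an API call; the sample reply is hypothetical, and it splits on the first colon only, which is slightly more defensive than the committed pair.split(":").

# Standalone sketch of the trans_map parsing used in the new scan_scene.
def parse_translation_reply(reply: str) -> dict:
    trans_map = {}
    for pair in reply.split(","):
        if ":" in pair:
            eng, trans = pair.split(":", 1)  # split once, in case the translation itself contains ":"
            trans_map[eng.strip().lower()] = trans.strip()
    return trans_map

sample_reply = "bottle:botella, cup:taza, chair:silla"  # hypothetical LLM output
print(parse_translation_reply(sample_reply))
# -> {'bottle': 'botella', 'cup': 'taza', 'chair': 'silla'}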
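
Similarly, the new click handler only needs the [x, y] coordinates that Gradio passes via gr.SelectData plus the boxes cached in gr.State, so the containment test can be exercised outside the UI. The detections below are made up for illustration; in the app they come from scan_scene.

# Sketch of the bounding-box hit-test behind check_point_and_update.
def find_label_at(click_x, click_y, detections):
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        if x1 <= click_x <= x2 and y1 <= click_y <= y2:
            return det["label"]
    return ""

demo_detections = [
    {"label": "botella", "box": [100, 50, 220, 300]},  # hypothetical coordinates
    {"label": "taza", "box": [300, 120, 380, 200]},
]
print(find_label_at(150, 100, demo_detections))  # -> botella
print(find_label_at(10, 10, demo_detections))    # -> "" (no box under the click)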