st192011 committed
Commit 9471780 · verified · 1 Parent(s): 7132f70

Update app.py

Files changed (1)
  1. app.py +59 -52
app.py CHANGED
@@ -11,14 +11,15 @@ from ultralytics import YOLOWorld
 from phonemizer import phonemize
 from transformers import pipeline
 from huggingface_hub import InferenceClient
-from torch.nn.functional import cosine_similarity

 # --- INITIALIZATION ---
 HF_TOKEN = os.getenv("HF_TOKEN")
-# Load a small YOLO World model for CPU efficiency
+
+# Load a small YOLO World model
+# Note: On first run, it downloads the weights automatically
 model_vision = YOLOWorld('yolov8s-world.pt')

-# Whisper for ASR
+# Whisper for ASR (Transcription)
 asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)

 LANG_CONFIG = {
@@ -31,60 +32,54 @@ LANG_CONFIG = {

 # --- VISION LOGIC ---
 def detect_objects(img, target_queries):
+    if img is None:
+        return None, "Please upload an image."
+
     # Set custom classes based on user input
     if target_queries:
         classes = [x.strip() for x in target_queries.split(",")]
         model_vision.set_classes(classes)
     else:
-        # Default common objects
-        model_vision.set_classes(["chair", "table", "person", "bottle", "cup", "fruit", "book"])
+        # Default common objects for language learning
+        model_vision.set_classes(["chair", "table", "bottle", "cup", "fruit", "book", "laptop", "backpack"])

     results = model_vision.predict(img, conf=0.3)

     # Draw results on image
     annotated_img = results[0].plot()

+    # Convert BGR (OpenCV format) to RGB for Gradio
+    annotated_img = annotated_img[..., ::-1]
+
     # Extract unique labels
     detected_labels = []
     for c in results[0].boxes.cls:
         detected_labels.append(model_vision.names[int(c)])

-    return annotated_img, list(set(detected_labels))
+    return annotated_img, ", ".join(list(set(detected_labels)))

 # --- TRANSLATION & FEEDBACK LOGIC ---
-def get_llm_feedback(lang_name, english_word, student_speech, student_ipa, target_ipa):
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
-
-    prompt = f"""
-    Target Word: {english_word} in {lang_name}.
-    Native IPA: /{target_ipa}/
-    Student IPA: /{student_ipa}/
-    Student said: "{student_speech}"
-
-    The student is learning {lang_name}. Identify the main pronunciation error and give 1 short anatomical tip (tongue/lip placement) in English.
-    """
-
+def get_llm_response(model_id, system_prompt, user_prompt):
+    client = InferenceClient(model=model_id, token=HF_TOKEN)
     try:
-        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
+        messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}]
+        output = client.chat_completion(messages, max_tokens=200)
         return output.choices[0].message.content
-    except:
-        return "LLM Busy. Try again in a moment."
+    except Exception as e:
+        return f"AI Error: {str(e)}"

-def translate_labels(lang_name, labels):
-    if not labels: return "No objects detected."
-    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+def translate_labels(lang_name, labels_str):
+    if not labels_str or labels_str == "No objects detected.":
+        return "Nothing to translate."

-    labels_str = ", ".join(labels)
-    prompt = f"Translate these English object labels into {lang_name}. Provide the results as a comma-separated list. Labels: {labels_str}"
+    system = f"You are a helpful translator for a language learning app."
+    prompt = f"Translate these English object labels into {lang_name}: {labels_str}. Return only a comma-separated list."

-    try:
-        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
-        return output.choices[0].message.content
-    except:
-        return labels_str  # Fallback to English
+    return get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)

 # --- AUDIO LOGIC ---
 async def play_tts(text, lang_name):
+    if not text: return None
     voice = LANG_CONFIG[lang_name]["voice"]
     path = "ref.mp3"
     communicate = edge_tts.Communicate(text, voice)
@@ -92,55 +87,67 @@ async def play_tts(text, lang_name):
     return path

 def analyze_audio(lang_name, target_text, audio_path):
-    if not audio_path: return "Record your voice!", "", ""
+    if not audio_path or not target_text:
+        return "Record your voice and provide text!", "", ""

     # 1. ASR
     asr_res = asr_pipe(audio_path)["text"].strip()

     # 2. IPA
     ipa_code = LANG_CONFIG[lang_name]["ipa"]
-    target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
-    user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    try:
+        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
+        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+    except:
+        target_ipa = "IPA Error"
+        user_ipa = "IPA Error"
+
+    # 3. LLM Anatomical Feedback
+    system = "You are a professional Phonetics Coach."
+    prompt = (f"Target: '{target_text}' (IPA: /{target_ipa}/). "
+              f"Student said: '{asr_res}' (IPA: /{user_ipa}/). "
+              f"Identify the main pronunciation error and give 1 anatomical tip in English.")

-    # 3. LLM Feedback
-    feedback = get_llm_feedback(lang_name, target_text, asr_res, user_ipa, target_ipa)
+    feedback = get_llm_response("Qwen/Qwen2.5-7B-Instruct", system, prompt)

     return asr_res, f"/{user_ipa}/", feedback

 # --- UI ---
+# Moved theme into Blocks constructor (or it can go in launch)
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 👁️ PANINI Vision: Visual Language Coach")
-    gr.Markdown("Identify objects in your world and master their names in any language.")
+    gr.HTML("<h1 style='text-align: center;'>👁️ PANINI Vision</h1>")
+    gr.HTML("<p style='text-align: center;'>Discover your world in any language.</p>")

-    with gr.Tab("Step 1: Visual Discovery"):
+    with gr.Tab("1. Scan & Discover"):
         with gr.Row():
             with gr.Column():
-                input_img = gr.Image(type="pill", label="Upload or Capture Photo")
-                target_tags = gr.Textbox(label="Custom Tags (Optional)", placeholder="e.g. coffee, snacks, cat")
+                # FIXED: type="pill" -> type="pil"
+                input_img = gr.Image(type="pil", label="Upload or Capture Photo")
+                target_tags = gr.Textbox(label="Target specific things?", placeholder="e.g. apple, dog, keyboard")
                 btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
             with gr.Column():
-                output_img = gr.Image(label="Identified Objects")
-                detected_list = gr.Textbox(label="Detected English Objects")
+                output_img = gr.Image(label="Annotated View")
+                detected_list = gr.Textbox(label="Objects Found (English)")

-    with gr.Tab("Step 2: Naming & Practice"):
+    with gr.Tab("2. Practice Naming"):
         with gr.Row():
-            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Target Language", value="Spanish")
+            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learn in...", value="Spanish")
             btn_trans = gr.Button("🌐 Translate Labels")

-        translated_box = gr.Textbox(label="Vocabulary List (Study these!)")
+        translated_box = gr.Textbox(label="Vocabulary List")

         with gr.Row():
-            target_word = gr.Textbox(label="Type word to practice")
-            btn_play = gr.Button("🔊 Hear Native", scale=0)
-            audio_ref = gr.Audio(label="Reference Audio", type="filepath")
+            target_word = gr.Textbox(label="Word to Practice")
+            btn_play = gr.Button("🔊 Listen", scale=0)
+            audio_ref = gr.Audio(label="Reference", type="filepath")

         with gr.Row():
-            audio_user = gr.Audio(label="Record Your Pronunciation", sources=["microphone"], type="filepath")
-            btn_analyze = gr.Button("🚀 Analyze My Speech", variant="primary")
+            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
+            btn_analyze = gr.Button("🚀 Analyze Speech", variant="primary")

         with gr.Row():
             out_heard = gr.Textbox(label="AI Heard")
-            out_ipa = gr.Textbox(label="Your Phonetics (IPA)")
+            out_ipa = gr.Textbox(label="Your IPA")
             out_feedback = gr.Markdown()

     # --- ACTIONS ---
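
The diff ends where the ACTIONS section begins, so the event wiring itself is not shown. For context, a minimal sketch of how that section might connect the handlers and components defined above, assuming standard Gradio .click() listeners (the actual wiring in app.py may differ; this snippet is not part of the commit):

    # --- ACTIONS --- (hypothetical sketch, placed inside the gr.Blocks context)
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=translated_box)
    # Gradio accepts async handlers such as play_tts directly
    btn_play.click(play_tts, inputs=[target_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_audio, inputs=[lang_drop, target_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])

demo.launch()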