st192011 committed · verified
Commit 7132f70 · Parent(s): 7374052

Create app.py

Files changed (1): app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
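+ # PANINI Vision: Visual Language Coach.
+ # Pipeline: YOLO-World open-vocabulary detection -> Qwen2.5 translation via the
+ # HF Inference API -> edge-tts reference audio -> Whisper ASR + phonemizer IPA
+ # -> Qwen2.5 pronunciation feedback.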
+ import os
+ import asyncio
+ import tempfile
+ import edge_tts
+ import gradio as gr
+ from ultralytics import YOLOWorld
+ from phonemizer import phonemize
+ from transformers import pipeline
+ from huggingface_hub import InferenceClient
+
+ # --- INITIALIZATION ---
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ # Load a small YOLO-World model for CPU efficiency
+ model_vision = YOLOWorld('yolov8s-world.pt')
+
+ # Whisper for ASR (device=-1 runs inference on CPU)
+ asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
+
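+ # Per-language settings: "ipa" is the language code handed to phonemizer's
+ # espeak backend, "voice" is the edge-tts neural voice used for reference audio.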
+ LANG_CONFIG = {
+     "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"},
+     "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
+     "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
+     "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
+     "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
+ }
+
+ # --- VISION LOGIC ---
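+ # YOLO-World is an open-vocabulary detector: set_classes() re-targets it to
+ # arbitrary text labels at inference time, with no retraining.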
+ def detect_objects(img, target_queries):
+     if img is None:
+         return None, ""
+     # Set custom classes based on user input
+     if target_queries:
+         classes = [x.strip() for x in target_queries.split(",") if x.strip()]
+         model_vision.set_classes(classes)
+     else:
+         # Default common objects
+         model_vision.set_classes(["chair", "table", "person", "bottle", "cup", "fruit", "book"])
+
+     results = model_vision.predict(img, conf=0.3)
+
+     # Draw results on the image (plot() returns BGR, so flip to RGB for display)
+     annotated_img = results[0].plot()[:, :, ::-1]
+
+     # Extract unique labels and return them as comma-separated text, since the
+     # output Textbox (and translate_labels downstream) expects a plain string
+     detected_labels = {model_vision.names[int(c)] for c in results[0].boxes.cls}
+     return annotated_img, ", ".join(sorted(detected_labels))
+
+ # --- TRANSLATION & FEEDBACK LOGIC ---
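+ # Both helpers below call Qwen2.5-7B-Instruct through the serverless HF
+ # Inference API; without a valid HF_TOKEN the calls may be rate-limited or
+ # rejected, which the try/except fallbacks absorb.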
+ def get_llm_feedback(lang_name, english_word, student_speech, student_ipa, target_ipa):
+     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+
+     prompt = f"""
+ Target Word: {english_word} in {lang_name}.
+ Native IPA: /{target_ipa}/
+ Student IPA: /{student_ipa}/
+ Student said: "{student_speech}"
+
+ The student is learning {lang_name}. Identify the main pronunciation error and give 1 short anatomical tip (tongue/lip placement) in English.
+ """
+
+     try:
+         output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
+         return output.choices[0].message.content
+     except Exception:
+         return "LLM busy. Try again in a moment."
+
+ def translate_labels(lang_name, labels):
+     # `labels` arrives as the comma-separated string shown in the Textbox
+     if not labels or not labels.strip():
+         return "No objects detected."
+     client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
+
+     prompt = f"Translate these English object labels into {lang_name}. Provide the results as a comma-separated list. Labels: {labels}"
+
+     try:
+         output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
+         return output.choices[0].message.content
+     except Exception:
+         return labels  # Fallback to English
+
+ # --- AUDIO LOGIC ---
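+ # edge_tts.Communicate.save() is a coroutine, so play_tts is async; the click
+ # handler bridges it with asyncio.run() (see ACTIONS below).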
+ async def play_tts(text, lang_name):
+     voice = LANG_CONFIG[lang_name]["voice"]
+     # Unique temp file instead of a fixed "ref.mp3", so concurrent users
+     # don't overwrite each other's reference audio
+     path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
+     communicate = edge_tts.Communicate(text, voice)
+     await communicate.save(path)
+     return path
+
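+ # Note: phonemizer's espeak backend needs the espeak-ng system package
+ # installed (on Spaces, typically via packages.txt).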
+ def analyze_audio(lang_name, target_text, audio_path):
+     if not audio_path:
+         return "Record your voice!", "", ""
+
+     # 1. ASR
+     asr_res = asr_pipe(audio_path)["text"].strip()
+
+     # 2. IPA
+     ipa_code = LANG_CONFIG[lang_name]["ipa"]
+     target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
+     user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
+
+     # 3. LLM Feedback
+     feedback = get_llm_feedback(lang_name, target_text, asr_res, user_ipa, target_ipa)
+
+     return asr_res, f"/{user_ipa}/", feedback
+
+ # --- UI ---
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
+     gr.Markdown("# 👁️ PANINI Vision: Visual Language Coach")
+     gr.Markdown("Identify objects in your world and master their names in any language.")
+
+     with gr.Tab("Step 1: Visual Discovery"):
+         with gr.Row():
+             with gr.Column():
+                 input_img = gr.Image(type="pil", label="Upload or Capture Photo")
+                 target_tags = gr.Textbox(label="Custom Tags (Optional)", placeholder="e.g. coffee, snacks, cat")
+                 btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
+             with gr.Column():
+                 output_img = gr.Image(label="Identified Objects")
+                 detected_list = gr.Textbox(label="Detected English Objects")
+
+     with gr.Tab("Step 2: Naming & Practice"):
+         with gr.Row():
+             lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Target Language", value="Spanish")
+             btn_trans = gr.Button("🌐 Translate Labels")
+
+         translated_box = gr.Textbox(label="Vocabulary List (Study these!)")
+
+         with gr.Row():
+             target_word = gr.Textbox(label="Type word to practice")
+             btn_play = gr.Button("🔊 Hear Native", scale=0)
+             audio_ref = gr.Audio(label="Reference Audio", type="filepath")
+
+         with gr.Row():
+             audio_user = gr.Audio(label="Record Your Pronunciation", sources=["microphone"], type="filepath")
+             btn_analyze = gr.Button("🚀 Analyze My Speech", variant="primary")
+
+         with gr.Row():
+             out_heard = gr.Textbox(label="AI Heard")
+             out_ipa = gr.Textbox(label="Your Phonetics (IPA)")
+             out_feedback = gr.Markdown()
+
+     # --- ACTIONS ---
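+     # Gradio runs handlers in worker threads with no running event loop, so
+     # asyncio.run() is a safe bridge below; passing the async play_tts directly
+     # would also work, since Gradio accepts coroutine handlers.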
+     btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
+     btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=translated_box)
+     btn_play.click(fn=lambda t, l: asyncio.run(play_tts(t, l)), inputs=[target_word, lang_drop], outputs=audio_ref)
+     btn_analyze.click(analyze_audio, inputs=[lang_drop, target_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
+
+ demo.launch()
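+ # Assumed requirements.txt for this Space (not part of this commit):
+ # gradio, ultralytics, edge-tts, transformers, torch, phonemizer, huggingface_hub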