import os
import asyncio
import edge_tts
import librosa
import torch
import numpy as np
import pandas as pd
import gradio as gr
import re
import time
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient
# --- INITIALIZATION ---
HF_TOKEN = os.getenv("HF_TOKEN")
# Load YOLO World (Small)
model_vision = YOLOWorld('yolov8s-world.pt')
# Whisper for ASR (Using tiny for speed)
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
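# Note (suggestion, not part of the original setup): device=-1 keeps Whisper on CPU;
# a larger checkpoint such as "openai/whisper-base" or a GPU device index (e.g. device=0)
# can be swapped in if the Space has the headroom.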
LANG_CONFIG = {
    "English (US)": {"ipa": "en-us", "voice": "en-US-ChristopherNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"},
}
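# To support another language, the pattern is an espeak-ng code for phonemizer plus an
# edge-tts voice name. Illustrative entry (voice name should be verified with
# `edge-tts --list-voices` before use):
#   "Italian": {"ipa": "it", "voice": "it-IT-ElsaNeural"}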
# --- VISION LOGIC ---
def detect_objects(img, target_queries):
    if img is None:
        return None, "Please upload an image first."

    # 1. Set the open-vocabulary class list
    if target_queries and len(target_queries.strip()) > 0:
        classes = [x.strip() for x in target_queries.split(",")]
    else:
        # Balanced default list to prevent "bottle" bias
        classes = ["person", "backpack", "umbrella", "handbag", "tie", "suitcase",
                   "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
                   "sandwich", "orange", "broccoli", "carrot", "pizza", "donut",
                   "cake", "chair", "couch", "potted plant", "bed", "dining table",
                   "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
                   "microwave", "oven", "sink", "refrigerator", "book", "clock", "vase"]

    # Force YOLO-World to update its internal class list
    model_vision.set_classes(classes)

    # 2. Prediction (slightly higher confidence threshold to reduce noise)
    results = model_vision.predict(img, conf=0.4)

    # 3. Annotated image: Ultralytics plots in BGR, so flip to RGB (copy keeps the array contiguous)
    annotated_img = results[0].plot()[..., ::-1].copy()

    # 4. Extract the detected labels
    found_labels = [model_vision.names[int(c)] for c in results[0].boxes.cls]
    label_list = ", ".join(sorted(set(found_labels))) if found_labels else "No objects found. Try adjusting 'Custom Tags'."
    return annotated_img, label_list
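# Quick local sanity check (illustrative only; "desk.jpg" is a placeholder path):
#   from PIL import Image
#   img, labels = detect_objects(Image.open("desk.jpg"), "laptop, mug")
#   print(labels)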
# --- TRANSLATION ---
def translate_labels(lang_name, labels_str):
    if not labels_str or "No objects" in labels_str:
        return "No objects detected to translate."

    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"Translate these English object labels into {lang_name}: {labels_str}. "
              "Return ONLY the translated words as a comma-separated list.")
    try:
        output = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200)
        return output.choices[0].message.content
    except Exception as e:
        return f"Translation Error: {str(e)}"
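# Illustrative flow: translate_labels("Spanish", "laptop, cup, book") is expected to
# return something like "portátil, taza, libro", though the exact wording depends on the LLM.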
# --- SPEECH LOGIC ---
async def tts_core(text, lang_name):
    voice = LANG_CONFIG[lang_name]["voice"]
    # Use a timestamp in the filename to prevent browser audio caching issues
    filename = f"ref_{int(time.time())}.mp3"
    communicate = edge_tts.Communicate(text, voice)
    await communicate.save(filename)
    return filename
def handle_tts(text, lang_name):
    if not text:
        return None
    return asyncio.run(tts_core(text, lang_name))
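# Note: asyncio.run() works here because Gradio executes sync handlers in a worker
# thread with no running event loop; declaring handle_tts itself as async would be
# an equivalent alternative.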
def analyze_speech(lang_name, target_text, audio_path):
    if not audio_path or not target_text:
        return "Missing recording or target word.", "", "Please provide both."

    # ASR: transcribe the learner's recording
    asr_res = asr_pipe(audio_path)["text"].strip()

    # IPA: phonemize both the target word and what the learner actually said
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        target_ipa = phonemize(target_text, language=ipa_code, backend='espeak', strip=True)
        user_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        target_ipa = "Error"
        user_ipa = "Error"

    # LLM feedback on the gap between target and produced pronunciation
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = (f"In {lang_name}, the target IPA is /{target_ipa}/. The student said '{asr_res}' with IPA /{user_ipa}/. "
              "Identify the error and give 1 specific anatomical tip for tongue/lips in English.")
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150)
        feedback = fb.choices[0].message.content
    except Exception:
        feedback = "Speech analysis busy. Try again."

    return asr_res, f"/{user_ipa}/", feedback
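# Illustrative phonemizer output (exact symbols vary by espeak-ng version):
#   phonemize("gato", language="es", backend="espeak", strip=True)  ->  roughly "ɡˈato"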
# --- UI ---
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center;'>🎙️ PANINI Vision</h1>")

    with gr.Accordion("📖 How to use (Instructions)", open=False):
        gr.Markdown("""
### 1. Vision Step
* **Upload a photo** of your room or desk.
* **Custom Tags (Open Vocabulary):** This is the magic of YOLO World. If you are in a kitchen, type `spatula, whisk, blender` and the AI will look *specifically* for those items. If you leave it blank, a general default list is used.
* Click **Scan Environment**.

### 2. Translation & Speech Step
* Select your **Target Language**.
* Click **Translate Labels** to turn the English names into your learning language.
* **Copy** one of those words into the 'Word to Practice' box.
* **Listen** to the AI reference, then **Record** yourself to get feedback!
""")

    with gr.Tab("1. Discover Objects"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(type="pil", label="Capture your world")
                target_tags = gr.Textbox(label="Target specific things? (Comma separated)", placeholder="e.g. guitar, plant, blue book")
                btn_scan = gr.Button("🔍 Scan Environment", variant="primary")
            with gr.Column():
                output_img = gr.Image(label="AI Detection")
                detected_list = gr.Textbox(label="Detected Objects (English)")

    with gr.Tab("2. Language Practice"):
        with gr.Row():
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Learning Language", value="Spanish")
            btn_trans = gr.Button("🌐 Translate Labels")
            vocab_output = gr.Textbox(label="Translated Vocabulary")
        with gr.Row():
            practice_word = gr.Textbox(label="Word to Practice")
            btn_listen = gr.Button("🔊 Listen", scale=0)
            audio_ref = gr.Audio(label="Native Reference", type="filepath")
        with gr.Row():
            audio_user = gr.Audio(label="Your Voice", sources=["microphone"], type="filepath")
            btn_analyze = gr.Button("🚀 Analyze Pronunciation", variant="primary")
        with gr.Row():
            out_heard = gr.Textbox(label="AI Heard")
            out_ipa = gr.Textbox(label="Your IPA")
            out_feedback = gr.Markdown()

    # --- BUTTON LOGIC ---
    btn_scan.click(detect_objects, inputs=[input_img, target_tags], outputs=[output_img, detected_list])
    btn_trans.click(translate_labels, inputs=[lang_drop, detected_list], outputs=vocab_output)
    btn_listen.click(handle_tts, inputs=[practice_word, lang_drop], outputs=audio_ref)
    btn_analyze.click(analyze_speech, inputs=[lang_drop, practice_word, audio_user], outputs=[out_heard, out_ipa, out_feedback])
demo.launch()