import os
import asyncio
import edge_tts
import librosa
import torch
import time
import random
import numpy as np
import gradio as gr
from PIL import Image
from ultralytics import YOLOWorld
from phonemizer import phonemize
from transformers import pipeline
from huggingface_hub import InferenceClient
from datasets import load_dataset
# --- CONFIG & MODELS ---
HF_TOKEN = os.getenv("HF_TOKEN")
model_vision = YOLOWorld('yolov8s-world.pt')
asr_pipe = pipeline("automatic-speech-recognition", model="openai/whisper-tiny", device=-1)
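# Note: device=-1 keeps the Whisper-tiny ASR pipeline on CPU; pass device=0 to use the first GPU if available.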
# Initialize COCO Dataset Streaming
print("Initialising COCO Dataset streaming...")
try:
    ds = load_dataset("detection-datasets/coco", split="val", streaming=True)
    ds_iter = iter(ds)
except Exception as e:
    print(f"Dataset init failed: {e}")
    ds_iter = None
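# Note: streaming=True yields COCO samples lazily over the network, so the full val split is never downloaded to disk.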
LANG_CONFIG = {
    "Spanish": {"ipa": "es", "voice": "es-ES-ElviraNeural"},
    "German": {"ipa": "de", "voice": "de-DE-KatjaNeural"},
    "French": {"ipa": "fr-fr", "voice": "fr-FR-DeniseNeural"},
    "Chinese": {"ipa": "cmn", "voice": "zh-CN-XiaoxiaoNeural"}
}
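# "ipa" is the espeak language code passed to phonemizer; "voice" is the edge-tts neural voice used for the native reference audio.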
# --- FUNCTIONS ---
def get_random_coco_image():
    global ds_iter
    try:
        if ds_iter is None:
            raise ValueError("Dataset not ready")
        # Skip ahead a few samples so repeated clicks return different images
        for _ in range(random.randint(1, 3)):
            sample = next(ds_iter)
        return sample['image']
    except Exception as e:
        print(f"COCO stream error: {e}")
        # Fall back to a fixed COCO val2017 image if streaming is unavailable
        return "http://images.cocodataset.org/val2017/000000000632.jpg"
def scan_scene(img, lang_name, custom_tags):
    if img is None:
        return None, "Please get a scene first.", []
    # 1. SET VOCABULARY (open-vocabulary feature)
    if custom_tags and len(custom_tags.strip()) > 0:
        # User-defined search terms
        classes = [x.strip() for x in custom_tags.split(",") if x.strip()]
    else:
        # General discovery mode
        classes = ["bottle", "cup", "chair", "table", "laptop", "fruit", "book", "vase", "sink",
                   "refrigerator", "oven", "car", "person", "tree", "backpack", "clock", "dog", "cat"]
    model_vision.set_classes(classes)
    # 2. PREDICT
    results = model_vision.predict(img, conf=0.25)
    annotated_img = results[0].plot()[..., ::-1]  # BGR to RGB
    # 3. EXTRACT AND TRANSLATE
    eng_labels = list(set(model_vision.names[int(box.cls)] for box in results[0].boxes))
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    trans_map = {}
    if eng_labels:
        # Prompt the LLM to produce a translation dictionary
        prompt = f"Translate these English words to {lang_name}: {', '.join(eng_labels)}. Return ONLY in this format: 'word:translation, word:translation'."
        try:
            res_text = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=200).choices[0].message.content
            # Parse pairs like 'table:der Tisch'
            for pair in res_text.split(","):
                if ":" in pair:
                    eng, trans = pair.split(":", 1)
                    trans_map[eng.strip().lower()] = trans.strip()
        except Exception as e:
            print(f"Translation error: {e}")
            trans_map = {lbl.lower(): lbl for lbl in eng_labels}
    # 4. MAP DETECTIONS (link each box to its translated word)
    detections = []
    for box in results[0].boxes:
        eng_label = model_vision.names[int(box.cls)].lower()
        translated_label = trans_map.get(eng_label, eng_label)
        coords = box.xyxy[0].tolist()
        detections.append({"translated": translated_label, "english": eng_label, "box": coords})
    vocab_display = ", ".join(trans_map.values())
    return annotated_img, vocab_display, detections
def on_image_click(evt: gr.SelectData, detections):
    """Triggered when the user clicks an object in the annotated image."""
    if not detections:
        return "Scan the image first!", ""
    click_x, click_y = evt.index
    for det in detections:
        x1, y1, x2, y2 = det["box"]
        # Check whether the click point lies inside this detection box
        if x1 <= click_x <= x2 and y1 <= click_y <= y2:
            translated_word = det['translated']
            return f"🎯 Selected: **{translated_word}** ({det['english']})", translated_word
    return "💡 Click directly inside a colored box!", ""
async def tts_task(text, lang_name):
    if not text:
        return None
    voice = LANG_CONFIG[lang_name]["voice"]
    path = f"speech_{int(time.time())}.mp3"
    await edge_tts.Communicate(text, voice).save(path)
    return path
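# edge-tts synthesizes speech through Microsoft's online Edge TTS service, so outbound network access is required.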
def run_feedback(target, lang_name, audio_path):
    if not audio_path or not target:
        return "Select a word and record audio.", "", ""
    asr_res = asr_pipe(audio_path)["text"].strip()
    ipa_code = LANG_CONFIG[lang_name]["ipa"]
    try:
        t_ipa = phonemize(target, language=ipa_code, backend='espeak', strip=True)
        u_ipa = phonemize(asr_res, language=ipa_code, backend='espeak', strip=True)
    except Exception:
        t_ipa, u_ipa = "N/A", "N/A"
    client = InferenceClient(model="Qwen/Qwen2.5-7B-Instruct", token=HF_TOKEN)
    prompt = f"Target {lang_name} IPA: /{t_ipa}/. Student IPA: /{u_ipa}/. Give 1 short anatomical tip in English."
    try:
        fb = client.chat_completion([{"role": "user", "content": prompt}], max_tokens=150).choices[0].message.content
        return asr_res, f"/{u_ipa}/", fb
    except Exception:
        return asr_res, f"/{u_ipa}/", "Coach is busy."
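# phonemizer's espeak backend needs the espeak-ng system package installed (e.g. listed in packages.txt on a
# Hugging Face Space); the try/except above falls back to "N/A" IPA strings when it is missing.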
# --- UI ---
CSS = ".gradio-container {max-width: 1050px !important} .feedback-box { background-color: #f8fafc; padding: 15px; border-radius: 10px; }"
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.HTML("<h1 style='text-align: center; color: #1e40af;'>🎙️ PANINI Flashcards</h1>")
    gr.Markdown("1. Select a language. 2. Get a scene. 3. Enter items to find (or leave blank). 4. Scan, then click the boxes.")
    current_dets = gr.State([])
    with gr.Row():
        with gr.Column(scale=1):
            lang_drop = gr.Dropdown(list(LANG_CONFIG.keys()), label="Language to Learn", value="Spanish")
            btn_random = gr.Button("🎲 Get Random Scene", variant="secondary")
            input_img = gr.Image(type="filepath", label="Scene Image", interactive=False)
            custom_tags = gr.Textbox(label="🔍 What should the AI find?", placeholder="e.g. guitar, cat, red book (optional)")
            btn_scan = gr.Button("🔍 Scan Vocabulary", variant="primary")
        with gr.Column(scale=2):
            gr.Markdown("### Interactive Discovery")
            display_img = gr.Image(label="Touch a box to practice that word", interactive=True)
            status_lab = gr.Markdown("Status: Ready.")
            vocab_list = gr.Textbox(label="Detected Words (Translated)", interactive=False)
    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎤 Practice Area")
            practice_word = gr.Textbox(label="Word to Practice (Click an object above)", placeholder="Waiting for selection...")
            btn_play = gr.Button("🔊 Listen to Native", scale=0)
            audio_out = gr.Audio(label="Native Reference", type="filepath")
        with gr.Column():
            audio_in = gr.Audio(label="Record Your Voice", sources=["microphone"], type="filepath")
            btn_eval = gr.Button("🚀 Analyze Accent", variant="primary")
            res_heard = gr.Textbox(label="What AI heard")
            res_ipa = gr.Textbox(label="Your Pronunciation (IPA)")
            res_fb = gr.Markdown(elem_classes=["feedback-box"])
    # --- EVENTS ---
    btn_random.click(get_random_coco_image, outputs=input_img)
    btn_scan.click(
        scan_scene,
        inputs=[input_img, lang_drop, custom_tags],
        outputs=[display_img, vocab_list, current_dets]
    )
    display_img.select(
        on_image_click,
        inputs=[current_dets],
        outputs=[status_lab, practice_word]
    )
    btn_play.click(lambda t, l: asyncio.run(tts_task(t, l)), [practice_word, lang_drop], audio_out)
    btn_eval.click(run_feedback, [practice_word, lang_drop, audio_in], [res_heard, res_ipa, res_fb])
# Launch (theme is set on gr.Blocks above; launch() does not accept a theme argument)
demo.launch(ssr_mode=False)