Spaces:

kawaa99
/

idioms

Sleeping

App Files Files Community

idioms / utils.py

kawaa99

Upload 3 files

88d75c9 verified about 1 month ago

raw

history blame contribute delete

6.3 kB

	import hashlib
	import json
	import logging
	import pathlib
	import random
	import re
	import sqlite3
	import tempfile
	from functools import lru_cache

	import streamlit as st
	from datasets import load_dataset
	from gtts import gTTS
	from transformers import MarianMTModel, MarianTokenizer


	# LOAD IDIOMS
	@st.cache_data
	def load_idioms(path="idioms.json"):
	    """Read the idiom JSON file and return it keyed by lower-cased idiom.

	    Args:
	        path: Location of a UTF-8 JSON file whose top level is an object.

	    Returns:
	        A dict with the same values as the file, each key lower-cased.
	        If two keys differ only by case, the later one wins.
	    """
	    with open(path, "r", encoding="utf-8") as handle:
	        raw = json.load(handle)

	    normalized = {}
	    for idiom, entry in raw.items():
	        normalized[idiom.lower()] = entry
	    return normalized

	# AUDIO
	# Folder (relative to the current working directory) for generated MP3s.
	CACHE_DIR = pathlib.Path() / "audio_cache"
	# Created at import time; an already existing directory is not an error.
	CACHE_DIR.mkdir(parents=False, exist_ok=True)

	@st.cache_data
	def generate_audio(text, lang="en"):
	    """Synthesize *text* to an MP3 in ``CACHE_DIR`` and return its path.

	    The file name is a sanitized slug of the text plus a digest of
	    ``(lang, text)``. This keeps names valid for text containing characters
	    such as "/" or "?", and keeps the same text in different languages
	    from sharing one file.

	    Args:
	        text: Text to speak.
	        lang: Language code passed to gTTS.

	    Returns:
	        The path of the MP3 file as a string.
	    """
	    # Readable prefix: collapse anything that is not a word character or
	    # hyphen, and cap the length so long sentences stay within file-name limits.
	    slug = re.sub(r"[^\w-]+", "_", text).strip("_")[:50]
	    # The digest is what makes the name unique per (lang, text) pair.
	    digest = hashlib.sha256(f"{lang}\0{text}".encode("utf-8")).hexdigest()[:16]
	    file_path = CACHE_DIR / f"{slug}_{digest}.mp3"

	    if not file_path.exists():
	        # Write to a temporary name first so a failed synthesis cannot leave
	        # a truncated file that later calls would treat as a cache hit.
	        part_path = file_path.with_name(file_path.name + ".part")
	        try:
	            tts = gTTS(text=text, lang=lang)
	            tts.save(str(part_path))
	            part_path.replace(file_path)
	        finally:
	            # No-op after a successful rename; removes leftovers on failure.
	            part_path.unlink(missing_ok=True)

	    return str(file_path)

	# SQLITE DATABASE
	def init_db(db_path="idioms.db"):
	    """Open the SQLite database and make sure both tables exist.

	    Args:
	        db_path: Database location passed to ``sqlite3.connect``. Defaults to
	            ``"idioms.db"``; ``":memory:"`` gives a throwaway database.

	    Returns:
	        An open ``sqlite3.Connection`` created with
	        ``check_same_thread=False``. The caller owns it.

	    Raises:
	        sqlite3.Error: If the schema cannot be created. The connection is
	            closed before the error propagates.
	    """
	    conn = sqlite3.connect(db_path, check_same_thread=False)
	    try:
	        c = conn.cursor()

	        c.execute("""
	        CREATE TABLE IF NOT EXISTS favorite (
	        idiom TEXT PRIMARY KEY,
	        timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
	        )
	        """)

	        c.execute("""
	        CREATE TABLE IF NOT EXISTS analytics (
	        idiom TEXT PRIMARY KEY,
	        attempts INTEGER DEFAULT 0,
	        correct INTEGER DEFAULT 0
	        )
	        """)

	        conn.commit()
	    except sqlite3.Error:
	        # Do not leak the handle when setup fails part-way.
	        conn.close()
	        raise
	    return conn

	def add_favorite(conn, idiom):
	    """Store *idiom* in the ``favorite`` table and commit.

	    Uses ``INSERT OR REPLACE``, so saving an idiom that is already present
	    replaces its row instead of raising on the primary key.
	    """
	    params = (idiom,)
	    conn.cursor().execute(
	        "INSERT OR REPLACE INTO favorite (idiom) VALUES (?)", params
	    )
	    conn.commit()

	def get_favorite(conn):
	    """Return the saved idioms, most recently saved first.

	    ``CURRENT_TIMESTAMP`` has one-second resolution, so favorites saved
	    within the same second tie on ``timestamp``. ``rowid`` is used as a
	    tie-breaker so the later insert reliably comes first.

	    Returns:
	        A list of idiom strings (empty when nothing is saved).
	    """
	    rows = conn.execute(
	        "SELECT idiom FROM favorite ORDER BY timestamp DESC, rowid DESC"
	    ).fetchall()
	    return [idiom for (idiom,) in rows]

	def remove_favorite(conn, idiom):
	    """Delete *idiom* from the ``favorite`` table and commit.

	    Removing an idiom that is not stored is a no-op.
	    """
	    params = (idiom,)
	    conn.cursor().execute("DELETE FROM favorite WHERE idiom = ?", params)
	    conn.commit()

	# ANALYTICS
	def update_analytics(conn, idiom, is_correct):
	    """Record one quiz attempt for *idiom* and commit.

	    Args:
	        conn: Open SQLite connection that has the ``analytics`` table.
	        idiom: Idiom the attempt was about.
	        is_correct: Truthy when the answer was right.
	    """
	    if is_correct:
	        gained = 1
	    else:
	        gained = 0

	    cursor = conn.cursor()

	    # Make sure a zeroed row exists so the UPDATE below always has a target.
	    cursor.execute("""
	    INSERT INTO analytics (idiom, attempts, correct)
	    VALUES (?, 0, 0)
	    ON CONFLICT(idiom) DO NOTHING
	    """, (idiom,))

	    # Every call counts as one attempt; `correct` only grows on a right answer.
	    cursor.execute("""
	    UPDATE analytics
	    SET attempts = attempts + 1,
	    correct = correct + ?
	    WHERE idiom = ?
	    """, (gained, idiom))

	    conn.commit()

	def get_learning_stats(conn):
	    """Return per-idiom quiz statistics from the ``analytics`` table.

	    Returns:
	        A list of ``(idiom, attempts, correct, accuracy)`` tuples, where
	        accuracy is ``correct / attempts``, or ``0`` for an idiom with no
	        attempts.
	    """
	    cursor = conn.cursor()
	    cursor.execute("SELECT idiom, attempts, correct FROM analytics")
	    return [
	        (name, tries, hits, (hits / tries if tries else 0))
	        for name, tries, hits in cursor.fetchall()
	    ]

	def get_weak_idioms(conn):
	    """Return idioms the learner is struggling with.

	    An idiom counts as weak when it has been attempted at least twice and
	    fewer than 60% of those attempts were correct.

	    Returns:
	        A list of idiom strings.
	    """
	    cursor = conn.cursor()
	    cursor.execute("""
	    SELECT idiom, attempts, correct
	    FROM analytics
	    WHERE attempts >= 2
	    """)
	    return [
	        name
	        for name, tries, hits in cursor.fetchall()
	        if (hits / tries if tries else 0) < 0.6
	    ]

	# TRANSLATION MODEL
	@st.cache_resource
	def load_translation_model():
	    """Load the ``Helsinki-NLP/opus-mt-en-es`` tokenizer and model.

	    Returns:
	        A ``(tokenizer, model)`` tuple.
	    """
	    checkpoint = "Helsinki-NLP/opus-mt-en-es"
	    return (
	        MarianTokenizer.from_pretrained(checkpoint),
	        MarianMTModel.from_pretrained(checkpoint),
	    )

	def translate_literal(text):
	    """Translate *text* with the model from ``load_translation_model``.

	    The input is truncated by the tokenizer and generation is limited to
	    100 new tokens. Only the first generated sequence is decoded.

	    Returns:
	        The decoded translation with special tokens removed.
	    """
	    tokenizer, model = load_translation_model()
	    encoded = tokenizer(text, return_tensors="pt", truncation=True)
	    output_ids = model.generate(**encoded, max_new_tokens=100)
	    first_sequence = output_ids[0]
	    return tokenizer.decode(first_sequence, skip_special_tokens=True)

	# EXAMPLES DATASETS
	@st.cache_data(show_spinner=True)
	def build_examples_map():
	    """Collect example sentences per idiom from two Hugging Face datasets.

	    Loading is best-effort. Each dataset is loaded independently, so a
	    failure in one (network, schema change) is logged and does not stop the
	    other from contributing.

	    Returns:
	        A dict mapping a lower-cased, stripped idiom to a list of
	        ``{"en": ..., "es": ...}`` dicts. An idiom seen without any example
	        text still gets an entry with an empty list. Rows without an idiom
	        are skipped.
	    """
	    logger = logging.getLogger(__name__)
	    examples_map = {}

	    def add_example(key, en, es):
	        # Tolerate None/empty idioms instead of crashing or storing a ""
	        # key, which would later "match" every sentence.
	        key = (key or "").lower().strip()
	        if not key:
	            return
	        bucket = examples_map.setdefault(key, [])
	        if en or es:
	            bucket.append({"en": en or "", "es": es or ""})

	    try:
	        ds1 = load_dataset("fdelucaf/IdioTS")
	        for row in ds1["train"]:
	            if row["sentence_has_idiom"]:
	                add_example(row["idiom"], row.get("en", ""), row.get("es", ""))
	    except Exception:
	        # Broad on purpose: dataset loading can fail in many ways and the
	        # app should still start. Log it so the failure is visible.
	        logger.exception("Could not load examples from %s", "fdelucaf/IdioTS")

	    try:
	        ds2 = load_dataset("UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30")
	        for row in ds2["train"]:
	            # Column names are probed under two spellings.
	            idiom = row.get("idiom") or row.get("Idiomatic Expression") or ""
	            en = row.get("en") or row.get("English") or ""
	            es = row.get("es") or row.get("Spanish") or ""
	            if idiom:
	                add_example(idiom, en, es)
	    except Exception:
	        logger.exception(
	            "Could not load examples from %s",
	            "UCSC-Admire/idiom-SFT-dataset-561-2024-12-06_00-40-30",
	        )

	    return examples_map

	# QUIZ GENERATION
	def generate_adaptive_quiz(conn, idiom_map, examples_map):
	    """Build one fill-in-the-blank multiple-choice question.

	    A random example sentence that actually contains its idiom is chosen,
	    the idiom is blanked out, and up to three other idioms from *idiom_map*
	    are offered as distractors. If any weak idioms (per ``get_weak_idioms``)
	    have usable sentences, the choice is restricted to those.

	    Args:
	        conn: SQLite connection passed to ``get_weak_idioms``.
	        idiom_map: Mapping whose keys are the known idioms.
	        examples_map: Mapping of idiom to a list of dicts with an ``"en"``
	            sentence.

	    Returns:
	        A dict with ``"question"`` (sentence with ``_____`` in place of the
	        idiom), ``"options"`` (shuffled list containing the answer) and
	        ``"answer"`` (the idiom).

	    Raises:
	        ValueError: If no example sentence contains its own idiom.
	    """
	    # (idiom, sentence, pattern) for every example that contains its idiom.
	    candidates = []
	    for idiom, examples in examples_map.items():
	        if not idiom:
	            # An empty idiom would match every sentence.
	            continue
	        # Compiled once per idiom rather than once per example. Lookarounds
	        # are used instead of \b so idioms that start or end with
	        # punctuation are still matched as whole phrases.
	        pattern = re.compile(
	            r"(?<!\w)" + re.escape(idiom) + r"(?!\w)", re.IGNORECASE
	        )
	        for ex in examples:
	            sentence = ex.get("en", "")
	            if sentence and pattern.search(sentence):
	                candidates.append((idiom, sentence, pattern))

	    if not candidates:
	        raise ValueError("No valid dataset sentences contain their idioms.")

	    # Adaptive filtering: prefer weak idioms when any of them is usable.
	    weak = set(get_weak_idioms(conn))
	    if weak:
	        prioritized = [entry for entry in candidates if entry[0] in weak]
	        if prioritized:
	            candidates = prioritized

	    idiom, sentence, pattern = random.choice(candidates)

	    # Reuse the pattern that selected the sentence to blank the idiom out.
	    question_sentence = pattern.sub("_____", sentence)

	    # Distractors: up to three other known idioms.
	    wrong_pool = [other for other in idiom_map if other != idiom]
	    wrong_options = random.sample(wrong_pool, min(3, len(wrong_pool)))

	    options = wrong_options + [idiom]
	    random.shuffle(options)

	    return {
	        "question": question_sentence,
	        "options": options,
	        "answer": idiom
	    }

	# DETECT IDIOMS
	def detect_idioms(text, idioms):
	    """Return the entries of *idioms* that occur in *text* as whole phrases.

	    Matching is case-insensitive. An idiom must not be directly preceded or
	    followed by a word character, so "at sea" is not reported for
	    "great seat". Empty idioms are ignored.

	    Args:
	        text: Text to scan.
	        idioms: Iterable of idiom strings.

	    Returns:
	        A list of the matching idioms, unchanged and in input order.
	    """
	    text_lower = text.lower()
	    found = []
	    for idiom in idioms:
	        needle = idiom.lower()
	        # Cheap substring test first; the regex runs only on likely hits.
	        if not needle or needle not in text_lower:
	            continue
	        if re.search(r"(?<!\w)" + re.escape(needle) + r"(?!\w)", text_lower):
	            found.append(idiom)
	    return found