# agentic-language-partner / src/app/flashcards_tools.py
# (provenance: uploaded by mastefan — "Update src/app/flashcards_tools.py", commit c904774, verified)
# src/app/flashcards_tools.py
import json
import re
from pathlib import Path
from typing import Dict, List, Tuple, Optional
from deep_translator import GoogleTranslator
from .config import get_user_dir
def _get_decks_dir(username: str) -> Path:
    """
    Return the per-user directory that holds all flashcard decks.

    The directory (``<user_dir>/decks``) is created on demand, so callers
    can rely on it existing after this call returns.
    """
    decks_dir = get_user_dir(username) / "decks"
    decks_dir.mkdir(parents=True, exist_ok=True)
    return decks_dir
def list_user_decks(username: str) -> Dict[str, Path]:
    """
    Return a mapping of deck display name -> path of the deck's JSON file.

    The display name is the deck's "name" field when the file contains
    readable JSON; otherwise the filename stem. When two files would claim
    the same name, the later one is disambiguated by appending its stem.
    """
    decks: Dict[str, Path] = {}
    for path in sorted(_get_decks_dir(username).glob("*.json")):
        try:
            payload = json.loads(path.read_text(encoding="utf-8"))
            label = payload.get("name") or path.stem
        except Exception:
            # Unreadable/invalid JSON: fall back to the filename.
            label = path.stem
        if label in decks and decks[label] != path:
            label = f"{label} ({path.stem})"
        decks[label] = path
    return decks
def _ensure_card_stats(card: Dict) -> None:
"""
Ensure that a card has simple spaced-repetition stats.
"""
if "score" not in card: # learning strength
card["score"] = 0
if "reviews" not in card:
card["reviews"] = 0
def load_deck(path: Path) -> Dict:
    """
    Load a deck from a JSON file, normalizing its structure.

    Guarantees on the returned dict:
      - "cards" is a list, and every entry is a dict (malformed entries
        are dropped instead of crashing the stats pass);
      - "name" is present (falls back to the filename stem);
      - "tags" is a list;
      - every card carries "score" and "reviews" counters.

    Unreadable files and invalid JSON yield an empty deck rather than
    raising, matching the best-effort style of the rest of this module.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        data = {}
    # A top-level JSON array/scalar would break the key assignments below
    # (the original code raised TypeError on e.g. a list) — reset it.
    if not isinstance(data, dict):
        data = {}
    cards = data.get("cards")
    if isinstance(cards, list):
        # Keep only dict entries; anything else cannot hold card stats.
        data["cards"] = [c for c in cards if isinstance(c, dict)]
    else:
        data["cards"] = []
    if "name" not in data:
        data["name"] = path.stem
    if "tags" not in data or not isinstance(data["tags"], list):
        data["tags"] = []
    for card in data["cards"]:
        card.setdefault("score", 0)    # learning strength
        card.setdefault("reviews", 0)  # review count
    return data
def save_deck(path: Path, deck: Dict) -> None:
    """
    Persist *deck* to *path* as pretty-printed UTF-8 JSON.

    Before writing, the deck is normalized in place: "cards" defaults to
    an empty list, "name" falls back to the filename stem, "tags" is
    forced to a list, and every card gets its spaced-repetition counters.
    """
    deck.setdefault("cards", [])
    deck.setdefault("name", path.stem)
    if not isinstance(deck.get("tags"), list):
        deck["tags"] = []
    for card in deck["cards"]:
        # Same defaults as _ensure_card_stats, inlined.
        card.setdefault("score", 0)
        card.setdefault("reviews", 0)
    serialized = json.dumps(deck, indent=2, ensure_ascii=False)
    path.write_text(serialized, encoding="utf-8")
# ------------------------------------------------------------
# Shared tokenization
# ------------------------------------------------------------
def _extract_candidate_words(text: str) -> List[str]:
"""
Simple tokenizer & filter for candidate vocab words.
"""
tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE)
out = []
seen = set()
for t in tokens:
t_norm = t.strip()
if len(t_norm) < 2:
continue
if any(ch.isdigit() for ch in t_norm):
continue
lower = t_norm.lower()
if lower in seen:
continue
seen.add(lower)
out.append(t_norm)
return out
# ------------------------------------------------------------
# OCR → Flashcards
# ------------------------------------------------------------
def generate_flashcards_from_ocr_results(
    username: str,
    ocr_results: List[Dict],
    deck_name: str = "ocr",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
) -> Path:
    """
    Build a vocab deck from OCR results and save it under the user's decks.

    Args:
        username: Deck owner; determines the storage directory.
        ocr_results: List of dicts as produced by
            ocr_tools.ocr_and_translate_batch. Only the "text" field (or
            "raw_text" as fallback) is read; other fields are ignored.
        deck_name: Filename stem and display name of the deck. Sanitized
            before use so it cannot escape the decks directory.
        target_lang: Target language code for translation.
        tags: Optional deck tags (defaults to an empty list).

    Returns:
        Path of the saved deck JSON file.

    Raises:
        ValueError: If no candidate words or no translatable words are found.
    """
    fragments = []
    for res in ocr_results:
        t = res.get("text") or res.get("raw_text") or ""
        if t:
            fragments.append(t)
    words = _extract_candidate_words("\n".join(fragments))
    if not words:
        raise ValueError("No candidate words found in OCR results.")
    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words:
        try:
            trans = translator.translate(w)
        except Exception:
            # Best-effort: skip words the translator cannot handle.
            continue
        if not trans:
            continue
        if trans.strip().lower() == w.strip().lower():
            # Untranslated (identical) words make useless cards.
            continue
        card = {
            "front": w,
            "back": trans,
            "content_type": "ocr_vocab",
            "language": target_lang,
        }
        _ensure_card_stats(card)
        cards.append(card)
    if not cards:
        raise ValueError("No translatable words found to build cards.")
    # Sanitize the filename component so a deck_name like "../x" cannot
    # write outside the user's decks directory.
    safe_name = re.sub(r"[^\w\- ]", "_", deck_name) or "ocr"
    deck_path = _get_decks_dir(username) / f"{safe_name}.json"
    deck = {
        "name": deck_name,
        "cards": cards,
        "tags": tags or [],
    }
    save_deck(deck_path, deck)
    return deck_path
# ------------------------------------------------------------
# Conversation/Text → Flashcards
# ------------------------------------------------------------
def generate_flashcards_from_text(
    username: str,
    text: str,
    deck_name: str = "conversation",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
) -> Path:
    """
    Build a vocab deck from raw conversation text and save it.

    Args:
        username: Deck owner; determines the storage directory.
        text: Raw conversation text to mine for vocabulary.
        deck_name: Filename stem and display name of the deck. Sanitized
            before use so it cannot escape the decks directory.
        target_lang: Target language code for translation.
        tags: Optional deck tags (defaults to ["conversation"]).

    Returns:
        Path of the saved deck JSON file.

    Raises:
        ValueError: If no candidate words or no translatable words are found.
    """
    words = _extract_candidate_words(text)
    if not words:
        raise ValueError("No candidate words found in text.")
    translator = GoogleTranslator(source="auto", target=target_lang)
    cards = []
    for w in words:
        try:
            trans = translator.translate(w)
        except Exception:
            # Best-effort: skip words the translator cannot handle.
            continue
        if not trans:
            continue
        if trans.strip().lower() == w.strip().lower():
            # Untranslated (identical) words make useless cards.
            continue
        card = {
            "front": w,
            "back": trans,
            "content_type": "conversation_vocab",
            "language": target_lang,
        }
        _ensure_card_stats(card)
        cards.append(card)
    if not cards:
        raise ValueError("No translatable words found to build cards.")
    # Sanitize the filename component so a deck_name like "../x" cannot
    # write outside the user's decks directory.
    safe_name = re.sub(r"[^\w\- ]", "_", deck_name) or "conversation"
    deck_path = _get_decks_dir(username) / f"{safe_name}.json"
    deck = {
        "name": deck_name,
        "cards": cards,
        "tags": tags or ["conversation"],
    }
    save_deck(deck_path, deck)
    return deck_path