Spaces:
Sleeping
Sleeping
| # src/app/flashcards_tools.py | |
| import json | |
| import re | |
| from pathlib import Path | |
| from typing import Dict, List, Tuple, Optional | |
| from deep_translator import GoogleTranslator | |
| from .config import get_user_dir | |
def _get_decks_dir(username: str) -> Path:
    """Return the per-user deck directory, creating it on first use."""
    decks_path = get_user_dir(username) / "decks"
    decks_path.mkdir(parents=True, exist_ok=True)
    return decks_path
def list_user_decks(username: str) -> Dict[str, Path]:
    """
    Map each deck's display name to its JSON file path.

    The display name is read from the deck's "name" field when the file
    parses cleanly, otherwise the file stem is used. Colliding names are
    disambiguated by appending the file stem in parentheses.
    """
    decks: Dict[str, Path] = {}
    for deck_path in sorted(_get_decks_dir(username).glob("*.json")):
        try:
            payload = json.loads(deck_path.read_text(encoding="utf-8"))
            label = payload.get("name") or deck_path.stem
        except Exception:
            # Unreadable/malformed deck: fall back to the filename.
            label = deck_path.stem
        if label in decks and decks[label] != deck_path:
            label = f"{label} ({deck_path.stem})"
        decks[label] = deck_path
    return decks
| def _ensure_card_stats(card: Dict) -> None: | |
| """ | |
| Ensure that a card has simple spaced-repetition stats. | |
| """ | |
| if "score" not in card: # learning strength | |
| card["score"] = 0 | |
| if "reviews" not in card: | |
| card["reviews"] = 0 | |
def load_deck(path: Path) -> Dict:
    """
    Load a deck from JSON, normalising its structure.

    Guarantees the returned dict has:
      - "cards": a list of dicts, each with spaced-repetition stats
      - "name":  the deck name (file-stem fallback)
      - "tags":  a list of tags

    Unreadable or malformed files yield an empty deck rather than raising.
    """
    try:
        data = json.loads(path.read_text(encoding="utf-8"))
    except Exception:
        data = {}
    # A JSON file may legally contain a non-object (e.g. a bare list or
    # string); the previous code crashed on item assignment in that case.
    if not isinstance(data, dict):
        data = {}
    if not isinstance(data.get("cards"), list):
        data["cards"] = []
    else:
        # Drop non-dict entries so stat initialisation below cannot fail.
        data["cards"] = [card for card in data["cards"] if isinstance(card, dict)]
    if "name" not in data:
        data["name"] = path.stem
    if not isinstance(data.get("tags"), list):
        data["tags"] = []
    for card in data["cards"]:
        _ensure_card_stats(card)
    return data
def save_deck(path: Path, deck: Dict) -> None:
    """
    Normalise *deck* in place and write it to *path* as UTF-8 JSON.

    Enforces the same invariants as load_deck before writing: "cards"
    is a list, "name" is set (file-stem fallback) and "tags" is a list.
    """
    # Consistency with load_deck: the previous code only checked that the
    # key existed, so a non-list "cards" value crashed the stats loop.
    if not isinstance(deck.get("cards"), list):
        deck["cards"] = []
    if "name" not in deck:
        deck["name"] = path.stem
    if not isinstance(deck.get("tags"), list):
        deck["tags"] = []
    # Make sure spaced-repetition stats are present on every card.
    for card in deck["cards"]:
        _ensure_card_stats(card)
    path.write_text(json.dumps(deck, indent=2, ensure_ascii=False), encoding="utf-8")
| # ------------------------------------------------------------ | |
| # Shared tokenization | |
| # ------------------------------------------------------------ | |
| def _extract_candidate_words(text: str) -> List[str]: | |
| """ | |
| Simple tokenizer & filter for candidate vocab words. | |
| """ | |
| tokens = re.findall(r"\b\w+\b", text, flags=re.UNICODE) | |
| out = [] | |
| seen = set() | |
| for t in tokens: | |
| t_norm = t.strip() | |
| if len(t_norm) < 2: | |
| continue | |
| if any(ch.isdigit() for ch in t_norm): | |
| continue | |
| lower = t_norm.lower() | |
| if lower in seen: | |
| continue | |
| seen.add(lower) | |
| out.append(t_norm) | |
| return out | |
| # ------------------------------------------------------------ | |
| # OCR → Flashcards | |
| # ------------------------------------------------------------ | |
def generate_flashcards_from_ocr_results(
    username: str,
    ocr_results: List[Dict],
    deck_name: str = "ocr",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
) -> Path:
    """
    Build a vocabulary deck from OCR output and save it for *username*.

    Each entry of *ocr_results* supplies recognised text via its "text"
    key (or "raw_text" as a fallback); any other fields are ignored.

    Returns the path of the written deck JSON.
    Raises ValueError when no candidate words, or no translations, are found.
    """
    fragments = [res.get("text") or res.get("raw_text") or "" for res in ocr_results]
    joined = "\n".join(frag for frag in fragments if frag)
    words = _extract_candidate_words(joined)
    if not words:
        raise ValueError("No candidate words found in OCR results.")
    translator = GoogleTranslator(source="auto", target=target_lang)
    cards: List[Dict] = []
    for word in words:
        try:
            translated = translator.translate(word)
        except Exception:
            # Best effort: skip words the translation service rejects.
            continue
        if not translated:
            continue
        if translated.strip().lower() == word.strip().lower():
            # Untranslated word (same in both languages) — not useful.
            continue
        new_card = {
            "front": word,
            "back": translated,
            "content_type": "ocr_vocab",
            "language": target_lang,
        }
        _ensure_card_stats(new_card)
        cards.append(new_card)
    if not cards:
        raise ValueError("No translatable words found to build cards.")
    deck_path = _get_decks_dir(username) / f"{deck_name}.json"
    save_deck(deck_path, {"name": deck_name, "cards": cards, "tags": tags or []})
    return deck_path
| # ------------------------------------------------------------ | |
| # Conversation/Text → Flashcards | |
| # ------------------------------------------------------------ | |
def generate_flashcards_from_text(
    username: str,
    text: str,
    deck_name: str = "conversation",
    target_lang: str = "en",
    tags: Optional[List[str]] = None,
) -> Path:
    """
    Build a vocabulary deck from raw conversation text and save it.

    Returns the path of the written deck JSON.
    Raises ValueError when no candidate words, or no translations, are found.
    """
    words = _extract_candidate_words(text)
    if not words:
        raise ValueError("No candidate words found in text.")
    translator = GoogleTranslator(source="auto", target=target_lang)
    cards: List[Dict] = []
    for word in words:
        try:
            translated = translator.translate(word)
        except Exception:
            # Best effort: skip words the translation service rejects.
            continue
        if not translated:
            continue
        if translated.strip().lower() == word.strip().lower():
            # Untranslated word (same in both languages) — not useful.
            continue
        new_card = {
            "front": word,
            "back": translated,
            "content_type": "conversation_vocab",
            "language": target_lang,
        }
        _ensure_card_stats(new_card)
        cards.append(new_card)
    if not cards:
        raise ValueError("No translatable words found to build cards.")
    deck_path = _get_decks_dir(username) / f"{deck_name}.json"
    save_deck(deck_path, {"name": deck_name, "cards": cards, "tags": tags or ["conversation"]})
    return deck_path