# app/utils/nltk_bootstrap.py
"""Best-effort bootstrap of the NLTK Punkt tokenizer data."""
import logging
import os
import pathlib

_logger = logging.getLogger(__name__)


def _candidate_data_dirs():
    """Return absolute candidate NLTK data directories, most preferred first."""
    candidates = []
    # Prefer /data/nltk_data whenever a /data directory exists.
    if os.path.isdir("/data"):
        candidates.append("/data/nltk_data")
    # Otherwise (or as a fallback) use an "nltk_data" directory one level
    # above this module's directory.
    candidates.append(
        os.path.join(os.path.dirname(__file__), "..", "nltk_data")
    )
    return [os.path.abspath(path) for path in candidates]


def _first_creatable_dir(candidates):
    """Return the first path that exists or can be created, else ``None``."""
    for path in candidates:
        try:
            os.makedirs(path, exist_ok=True)
        except OSError as exc:
            _logger.debug("Cannot use %s for NLTK data: %s", path, exc)
            continue
        return path
    return None


def ensure_punkt():
    """Make the Punkt tokenizer data available to NLTK, if possible.

    Picks a data directory (``/data/nltk_data`` when ``/data`` exists, else an
    ``nltk_data`` directory one level above this module), creates it if
    needed, exports it as ``NLTK_DATA`` and puts it at the front of
    ``nltk.data.path``. Then, for ``punkt_tab`` first and the legacy ``punkt``
    second, returns as soon as the resource is already findable or has been
    downloaded into the chosen directory.

    This is best-effort: it returns ``None`` in every case and does not raise
    when NLTK cannot be imported, when no data directory can be created, or
    when a download fails. Callers are expected to have their own fallback.
    """
    try:
        import nltk  # imported here so the module is optional at build time
    except Exception:
        # Deliberately broad: any import failure means "NLTK unavailable",
        # and the caller should have a fallback.
        return

    data_dir = _first_creatable_dir(_candidate_data_dirs())
    if data_dir is not None:
        # Environment variable + explicit path injection (important in Spaces).
        os.environ["NLTK_DATA"] = data_dir
        if data_dir not in nltk.data.path:
            nltk.data.path.insert(0, data_dir)

    # Try the new resource name first, then the legacy one.
    for res in ("punkt_tab", "punkt"):
        try:
            nltk.data.find(f"tokenizers/{res}")
        except LookupError:
            pass
        else:
            return  # already present somewhere on nltk.data.path

        if data_dir is None:
            # Nowhere to download to; still check the next resource name in
            # case it is already installed elsewhere.
            continue

        try:
            nltk.download(
                res, download_dir=data_dir, quiet=True, raise_on_error=True
            )
        except Exception as exc:
            # Best-effort: fall through to the next resource name.
            _logger.debug("NLTK download of %r failed: %s", res, exc)
            continue
        return

    _logger.warning(
        "NLTK tokenizer data ('punkt_tab'/'punkt') is not available "
        "and could not be downloaded."
    )