Spaces:
Sleeping
Sleeping
| # app/utils/nltk_bootstrap.py | |
| import os | |
| import pathlib | |
def _first_writable_dir(candidates):
    """Return the first usable directory from *candidates*, or None.

    A candidate is usable when it already exists or can be created, and the
    current process may write into it. Paths are returned in absolute form.
    """
    for candidate in candidates:
        path = os.path.abspath(candidate)
        try:
            os.makedirs(path, exist_ok=True)
        except OSError:
            # Read-only filesystem, permission denied, path is a file, ...
            continue
        if os.access(path, os.W_OK | os.X_OK):
            return path
    return None


def ensure_punkt():
    """Make the NLTK Punkt tokenizer data available, downloading if needed.

    Picks a writable data directory, exports it through the ``NLTK_DATA``
    environment variable, puts it at the front of ``nltk.data.path``, and
    then makes sure either ``punkt_tab`` (tried first) or the legacy
    ``punkt`` resource can be found, downloading it into that directory
    when it is missing.

    The function is best-effort and never raises for an unusable
    environment: it returns silently if NLTK cannot be imported, if no
    writable directory is available, or if every download attempt fails.
    Callers are expected to have their own fallback in those cases.

    Returns:
        None
    """
    try:
        import nltk  # imported here so the module stays optional at build time
    except Exception:
        # Deliberately broad: a missing or broken NLTK install must not
        # take the caller down; the caller should have a fallback.
        return

    import tempfile

    # Candidate locations, most persistent first. "/data" is only offered
    # when it exists; the temp dir is a last resort so that a read-only app
    # directory does not crash start-up.
    candidates = []
    if os.path.isdir("/data"):
        candidates.append("/data/nltk_data")
    candidates.append(os.path.join(os.path.dirname(__file__), "..", "nltk_data"))
    candidates.append(os.path.join(tempfile.gettempdir(), "nltk_data"))

    nltd = _first_writable_dir(candidates)
    if nltd is None:
        return  # nowhere to store data; stay best-effort

    # Environment + explicit path injection (important in Spaces)
    os.environ["NLTK_DATA"] = nltd
    if nltd not in nltk.data.path:
        nltk.data.path.insert(0, nltd)

    # Try the new resource name first, then the legacy one
    for res in ("punkt_tab", "punkt"):
        try:
            nltk.data.find(f"tokenizers/{res}")
        except LookupError:
            pass
        else:
            return  # already present: nothing to do

        try:
            nltk.download(res, download_dir=nltd, quiet=True, raise_on_error=True)
        except Exception:
            # Unknown resource name, no network, disk error, ...:
            # fall through to the next resource name.
            continue

        # After download, make sure the directory is still on the search path
        if nltd not in nltk.data.path:
            nltk.data.path.insert(0, nltd)
        return