# Spaces: Sleeping
"""Central configuration: dataset paths plus preprocessing and embedding settings.

Importing this module has a side effect: the data directories listed below
are created on disk if they do not already exist.
"""

import os
from pathlib import Path

# --- Base paths -------------------------------------------------------------
# BASE_DIR is the parent of the directory that contains this file.
BASE_DIR = Path(__file__).resolve().parent.parent

# The data root can be relocated with the DATA_DIR environment variable;
# when it is unset, the "datasets" folder under BASE_DIR is used.
DATA_DIR = Path(os.environ.get("DATA_DIR", BASE_DIR / "datasets"))
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"

# --- Ensure directories exist -----------------------------------------------
# parents=True creates missing intermediate folders; exist_ok=True makes the
# call a no-op when the directory is already present, so re-imports are safe.
for path in [DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR]:
    path.mkdir(parents=True, exist_ok=True)

# --- Preprocessing settings -------------------------------------------------
# NOTE(review): presumably the earliest year kept and the cap on the number of
# texts processed -- confirm against the preprocessing code that reads them.
MIN_YEAR = 2020
MAX_TEXTS = 2000

# --- Embeddings configuration -----------------------------------------------
# NOTE(review): presumably a sentence-transformers model identifier -- confirm
# against the code that loads it.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
# Alternative model previously considered here: "allenai/specter2".