Spaces:
Sleeping
Sleeping
File size: 583 Bytes
532f1f0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 |
from pathlib import Path
import os
# Base paths.
# BASE_DIR is two directory levels above this file (the parent of the
# directory that contains it).
BASE_DIR = Path(__file__).resolve().parent.parent
# DATA_DIR may be overridden through the DATA_DIR environment variable. An
# unset *or empty* value falls back to <BASE_DIR>/datasets; without the `or`,
# DATA_DIR="" would become Path("") == "." and the sub-directories below
# would be created in the current working directory.
DATA_DIR = Path(os.environ.get("DATA_DIR") or BASE_DIR / "datasets")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"

# Ensure directories exist. This is an import-time side effect: importing this
# module creates any missing directory (including missing parents) and is a
# no-op for directories that already exist.
for path in (DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR):
    path.mkdir(parents=True, exist_ok=True)

# Preprocessing configs.
# NOTE(review): presumably the earliest year kept and a cap on the number of
# texts processed -- confirm against the preprocessing code that reads them.
MIN_YEAR = 2020
MAX_TEXTS = 2000

# Embeddings configuration.
# Alternative model name previously tried here: "allenai/specter2".
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
|