csds-project / src /config.py
beatrizpm's picture
Rename config.py to src/config.py
0650966 verified
raw
history blame contribute delete
583 Bytes
from pathlib import Path
import os
# Base paths.
# BASE_DIR is two directory levels above this file's resolved location.
BASE_DIR = Path(__file__).resolve().parent.parent
# The DATA_DIR environment variable overrides the default data location.
# An unset *or empty* value falls back to "<BASE_DIR>/datasets"; without the
# empty check, DATA_DIR="" would resolve to the current working directory.
DATA_DIR = Path(os.environ.get("DATA_DIR") or BASE_DIR / "datasets")
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"

# Ensure directories exist. This runs at import time; missing parents are
# created and already-existing directories do not raise.
for path in [DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR]:
    path.mkdir(parents=True, exist_ok=True)
# Preprocessing settings.
# NOTE(review): presumably a lower bound on publication year and a cap on the
# number of texts processed -- confirm against the code that reads them.
MIN_YEAR: int = 2020
MAX_TEXTS: int = 2000

# Embedding settings: identifier of the embedding model to use.
# Alternative model identifier kept for reference: "allenai/specter2".
EMBEDDING_MODEL_NAME: str = "all-MiniLM-L6-v2"