File size: 583 Bytes
532f1f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from pathlib import Path
import os

# Base paths
# BASE_DIR is the directory two levels above this file (the parent of the
# directory that contains it).
BASE_DIR = Path(__file__).resolve().parent.parent

# The data root can be overridden through the DATA_DIR environment variable.
# An unset *or empty* variable falls back to BASE_DIR / "datasets": without the
# `or`, an empty value would become Path(""), i.e. the current working
# directory, and the data tree would silently move to wherever the process runs.
DATA_DIR = Path(os.environ.get("DATA_DIR") or BASE_DIR / "datasets")

# Sub-directories of the data root.
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
EMBEDDINGS_DIR = DATA_DIR / "embeddings"

# Create the data directory tree at import time. Missing parent directories
# are created as well, and directories that already exist are left alone.
for path in (DATA_DIR, RAW_DIR, PROCESSED_DIR, EMBEDDINGS_DIR):
    path.mkdir(exist_ok=True, parents=True)

# Preprocessing settings
# NOTE(review): presumably the earliest year to keep and the cap on the number
# of texts to process — confirm against the preprocessing code.
MIN_YEAR: int = 2020
MAX_TEXTS: int = 2_000

# Embedding settings: identifier of the embedding model to use.
# NOTE(review): presumably a model id understood by the embedding loader —
# confirm against the code that consumes this constant.
EMBEDDING_MODEL_NAME: str = "all-MiniLM-L6-v2"

# Alternative model, currently disabled (uncomment to switch):
# EMBEDDING_MODEL_NAME = "allenai/specter2"