time-aware-rag / scripts /01_setup_environment.sh
manojarulmurugan's picture
Add full pipeline code + precomputed demo_data
46b9b58 verified
#!/bin/bash
# Step 1: Environment Setup and Dependency Installation
#
# Resolves the project root as the parent of the directory that contains this
# script and makes it the working directory, so the relative paths used by the
# rest of the script resolve the same way no matter where it is invoked from.

# Strict mode is enabled before the first command runs:
#   -e           abort on the first unhandled failing command
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

echo "Setting up Python environment and installing dependencies..."

# CDPATH is cleared for this one `cd` so an inherited CDPATH can neither send
# the relative lookup to a different directory nor make `cd` print the
# directory name into the captured output. `--` protects against paths that
# start with a dash.
PROJECT_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
echo "Environment setup - Project root: $PROJECT_ROOT"

cd -- "$PROJECT_ROOT"
echo "Changed to project directory: $(pwd)"
# Check Python version.
# With errexit active, a missing python3 makes this assignment fail and stops
# the script here, so every later python3 call runs an interpreter that exists.
python_version=$(python3 --version 2>&1)
echo "Using Python: $python_version"

# Skip virtual environment creation for now
echo "Skipping virtual environment creation (will be created later if needed)"
echo "Installing packages globally for now..."

# All installs go through `python3 -m pip` rather than a bare `pip`: the first
# `pip` on PATH may belong to a different interpreter than the python3 that was
# just checked and reported above.

# Upgrade pip
echo "Upgrading pip..."
python3 -m pip install --upgrade pip

# Install dependencies (path is relative to the current working directory)
echo "Installing dependencies from requirements.txt..."
python3 -m pip install -r requirements.txt

# Install additional development dependencies
echo "Installing additional development tools..."
python3 -m pip install jupyter ipykernel

# Install linting/formatting tools (optional).
# Only the packages are installed here; no git hooks are registered
# (`pre-commit install` is not run by this script).
echo "Setting up development tools..."
python3 -m pip install pre-commit black flake8
# Download NLTK data (required for some evaluations).
# Best-effort step: a failed download is reported but does not stop the setup.
# A failing `import nltk` is not caught, so it still ends the Python process
# with a non-zero status.
# NOTE(review): newer NLTK releases may additionally need the 'punkt_tab'
# resource for tokenization - confirm against the pinned NLTK version.
echo "Downloading NLTK data..."
python3 - <<'PY'
import nltk

try:
    # nltk.download() reports most failures by returning False instead of
    # raising, so the return value has to be checked; otherwise the success
    # message would be printed even when nothing was downloaded.
    failed = [pkg for pkg in ('punkt', 'stopwords')
              if not nltk.download(pkg, quiet=True)]
except Exception as e:
    print(f'Note: NLTK download failed: {e}')
else:
    if failed:
        print(f"Note: NLTK download failed: {', '.join(failed)}")
    else:
        print('NLTK data downloaded successfully')
PY
# Check if CUDA is available and report the device that would be used.
# Runs under python3 (the interpreter this script checks at startup) rather
# than a bare `python`, which is not guaranteed to exist on PATH.
# A failing `import torch` is not caught: the Python process then exits
# non-zero, which stops the script when errexit is active.
echo "Checking CUDA availability..."
python3 - <<'PY'
import torch

if torch.cuda.is_available():
    print(f'CUDA available: {torch.cuda.get_device_name(0)}')
    print(f'CUDA version: {torch.version.cuda}')
else:
    print('CUDA not available, will use CPU')
PY
# Set up Weights & Biases (optional)
echo "Setting up Weights & Biases (optional)..."
# Ask python3's own pip whether the package is installed. A `wandb` executable
# found on PATH would only prove that some environment ships the CLI, not that
# the interpreter used by this project can import the package.
# `pip show` exits non-zero when the package is not installed.
if python3 -m pip show wandb > /dev/null 2>&1; then
  echo "wandb is already installed"
else
  python3 -m pip install wandb
fi
echo "Note: To use wandb logging, run 'wandb login' and set your API key"
# Create the project's working directories, relative to the current directory.
# `mkdir -p` creates missing parents and succeeds for directories that already
# exist, so re-running this step is harmless.
echo "Creating project directories..."
project_dirs=(
  data/chroniclingqa
  data/temprageval
  data/generated_questions
  data/atlas_2021
  data/fineweb
  models/cache
  models/time_aware_contriever
  outputs/chroniclingqa
  outputs/temprageval
  outputs/mrag
  logs
)
mkdir -p "${project_dirs[@]}"
# Download FineWeb dataset (matches notebook exactly, per the original author).
# The loader path is relative, so this step expects the current directory to
# be the project root.
# NOTE(review): the passage count announced below comes from the message text
# only; confirm it against src/fineweb_loader.py.
echo ""
echo "Downloading and processing FineWeb dataset..."
echo "This will collect 500,000 temporal passages from FineWeb-edu (this may take some time)..."
# Run the loader with python3 so it uses the same interpreter that the summary
# line below reports.
python3 src/fineweb_loader.py

echo "Environment setup completed successfully!"
# `command -v` is the shell builtin way to resolve a command; `which` is an
# external tool whose presence and output format vary between systems.
echo "Using Python at: $(command -v python3)"
echo ""
echo "Note: Packages installed globally. You can create a virtual environment later if needed."
echo ""
echo "Next step: Run ./scripts/02_generate_questions.sh"