rdisipio's picture
Add master scraping orchestration system
8b437a3
#!/bin/bash
# Master Course Scraper - Bash Wrapper
# This is a convenience wrapper around the Python master scraper
set -e # Exit on any error
# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color
# Get the directory where this script is located
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# Default values
CONFIG_FILE="$PROJECT_ROOT/config/scraping_config.yaml"
DRY_RUN=false
TOPIC=""
PLATFORM=""
# Function to show usage
show_usage() {
echo -e "${BLUE}Master Course Scraper${NC}"
echo ""
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Options:"
echo " -c, --config FILE Use custom configuration file (default: config/scraping_config.yaml)"
echo " -d, --dry-run Show what would be scraped without actually running"
echo " -t, --topic TOPIC Only scrape courses for this topic (partial match)"
echo " -p, --platform PLATFORM Only scrape from this platform (coursera, udemy, edx)"
echo " -h, --help Show this help message"
echo ""
echo "Examples:"
echo " $0 # Scrape everything in config"
echo " $0 --dry-run # Show what would be scraped"
echo " $0 --topic \"machine learning\" # Only scrape ML courses"
echo " $0 --platform coursera # Only scrape from Coursera"
echo " $0 --topic python --platform udemy # Only Python courses from Udemy"
echo ""
}
# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
-c|--config)
CONFIG_FILE="$2"
shift 2
;;
-d|--dry-run)
DRY_RUN=true
shift
;;
-t|--topic)
TOPIC="$2"
shift 2
;;
-p|--platform)
PLATFORM="$2"
shift 2
;;
-h|--help)
show_usage
exit 0
;;
*)
echo -e "${RED}Error: Unknown option $1${NC}"
show_usage
exit 1
;;
esac
done
# Check if config file exists
if [[ ! -f "$CONFIG_FILE" ]]; then
echo -e "${RED}Error: Configuration file not found: $CONFIG_FILE${NC}"
exit 1
fi
# Find Python executable using pipenv
if ! command -v pipenv &> /dev/null; then
echo -e "${RED}Error: pipenv not found. Please install pipenv first.${NC}"
echo "Install with: pip install pipenv"
exit 1
fi
# Build the command using pipenv run
CMD=("pipenv" "run" "python" "$SCRIPT_DIR/master_scraper.py" "--config" "$CONFIG_FILE")
if [[ "$DRY_RUN" == true ]]; then
CMD+=("--dry-run")
fi
if [[ -n "$TOPIC" ]]; then
CMD+=("--topic" "$TOPIC")
fi
if [[ -n "$PLATFORM" ]]; then
CMD+=("--platform" "$PLATFORM")
fi
# Show what we're about to run
echo -e "${BLUE}Running master scraper...${NC}"
echo -e "${YELLOW}Config:${NC} $CONFIG_FILE"
if [[ "$DRY_RUN" == true ]]; then
echo -e "${YELLOW}Mode:${NC} DRY RUN"
fi
if [[ -n "$TOPIC" ]]; then
echo -e "${YELLOW}Topic filter:${NC} $TOPIC"
fi
if [[ -n "$PLATFORM" ]]; then
echo -e "${YELLOW}Platform filter:${NC} $PLATFORM"
fi
echo ""
# Change to project directory
cd "$PROJECT_ROOT"
# Run the command
if "${CMD[@]}"; then
echo -e "${GREEN}✅ Master scraper completed successfully${NC}"
else
echo -e "${RED}❌ Master scraper failed${NC}"
exit 1
fi