#!/bin/bash
#
# Extract Form 990 ZIPs for dev states - PARALLEL VERSION
#
# Unpacks every *.zip found directly under <project>/data/cache/form990,
# keeps only the files whose state element names one of the dev states, and
# copies them to <project>/data/cache/form990/xmls_dev_states/<STATE>/.
#
# Environment:
#   PARALLEL_JOBS  Number of ZIPs processed concurrently (default: 4).
#
# Exit status:
#   0  all ZIPs processed
#   1  no ZIP files found, or neither GNU parallel nor xargs is available
#   *  status reported by parallel/xargs when at least one ZIP failed
#      (the summary is still printed first)

set -eo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
DATA_DIR="${PROJECT_ROOT}/data/cache/form990"
ZIPS_DIR="${DATA_DIR}"
OUTPUT_DIR="${DATA_DIR}/xmls_dev_states"

# Dev states, as a regex alternation. This is the single source of truth:
# the per-state directory list and the banner text are derived from it.
DEV_STATES="WA|MA|AL|GA|WI"
IFS='|' read -r -a STATE_LIST <<< "$DEV_STATES"

# Parallel jobs (adjust based on CPU cores)
PARALLEL_JOBS=${PARALLEL_JOBS:-4}

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
BLUE='\033[0;34m'
YELLOW='\033[1;33m'
NC='\033[0m'

echo -e "${BLUE}========================================${NC}"
echo -e "${BLUE}Fast Extract - Form 990 Dev States${NC}"
echo -e "${BLUE}========================================${NC}"
echo ""
echo -e "${GREEN}States: ${DEV_STATES//|/, }${NC}"
echo -e "${YELLOW}Parallel jobs: $PARALLEL_JOBS${NC}"
echo -e "${BLUE}Output: $OUTPUT_DIR${NC}"
echo ""

# Create output directories (one per dev state)
mkdir -p "$OUTPUT_DIR"
for state in "${STATE_LIST[@]}"; do
  mkdir -p "$OUTPUT_DIR/$state"
done

# Check for parallel or fallback to xargs
if command -v parallel &>/dev/null; then
  PARALLEL_CMD="parallel"
  echo -e "${GREEN}✓ Using GNU parallel${NC}"
elif command -v xargs &>/dev/null; then
  PARALLEL_CMD="xargs"
  echo -e "${YELLOW}⚠ Using xargs (slower than GNU parallel)${NC}"
else
  echo -e "${RED}Error: Need GNU parallel or xargs${NC}" >&2
  exit 1
fi

#######################################
# Extract one ZIP into a scratch directory and copy the files that belong to
# a dev state into $OUTPUT_DIR/<STATE>/.
# Globals:
#   OUTPUT_DIR (read), DEV_STATES (read) - both must be exported because this
#   function runs in child shells started by parallel/xargs.
# Arguments:
#   $1 - path to the ZIP file
# Outputs:
#   One status line on stdout: "[zip] Total: N | Kept: M" or a failure note.
# Returns:
#   0 on success, 1 if the scratch directory or the extraction failed.
#######################################
process_zip() {
  local zipfile="$1"
  local zipname="${zipfile##*/}"

  # Match the state only where it appears as an element value, not as an
  # arbitrary two-letter substring (plain "AL" also matches "TOTAL").
  # NOTE(review): assumes the filings carry the state as
  # <StateAbbreviationCd>XX</...> or legacy <State>XX</...> - confirm against
  # the schema versions present in the cache.
  local state_regex="<(StateAbbreviationCd|State)>(${DEV_STATES})</"

  # Declaration is separate from assignment so a mktemp failure is not masked
  # by the exit status of `local`.
  local temp_dir
  if ! temp_dir=$(mktemp -d); then
    echo "[$zipname] Failed to create temp dir"
    return 1
  fi

  # Extract to temp
  if ! unzip -q -o "$zipfile" -d "$temp_dir" 2>/dev/null; then
    rm -rf -- "$temp_dir"
    echo "[$zipname] Failed to extract"
    return 1
  fi

  local total_xmls
  total_xmls=$(find "$temp_dir" -name "*.xml" -type f 2>/dev/null | wc -l)

  # List candidate files in one pass; "no match" is not an error here.
  local matching_files
  if command -v rg &>/dev/null; then
    # Use ripgrep (faster)
    matching_files=$(rg -l -e "$state_regex" "$temp_dir" 2>/dev/null || true)
  else
    # Use grep (slower but more compatible)
    matching_files=$(grep -rlE -e "$state_regex" "$temp_dir" 2>/dev/null || true)
  fi

  local kept=0
  local xml match state_code dest
  if [ -n "$matching_files" ]; then
    while IFS= read -r xml; do
      [ -f "$xml" ] || continue

      # First dev-state element in the file decides the destination bucket.
      match=$(grep -m 1 -oE -e "$state_regex" "$xml" 2>/dev/null | head -n 1) || true

      # "<Tag>WA</" -> "WA"
      state_code="${match#*>}"
      state_code="${state_code%%<*}"
      [ -n "$state_code" ] || continue

      dest="$OUTPUT_DIR/$state_code/${xml##*/}"

      # Copy if not already exists. Plain arithmetic assignment is used
      # because ((kept++)) returns status 1 when kept is 0.
      if [ ! -f "$dest" ] && cp -- "$xml" "$dest" 2>/dev/null; then
        kept=$((kept + 1))
      fi
    done <<< "$matching_files"
  fi

  # Cleanup
  rm -rf -- "$temp_dir"

  echo "[$zipname] Total: $total_xmls | Kept: $kept"
  return 0
}

# Export function and the globals it reads so child shells can see them
export -f process_zip
export OUTPUT_DIR DEV_STATES

# Get list of ZIPs
mapfile -t ZIPS < <(find "$ZIPS_DIR" -maxdepth 1 -name "*.zip" -type f | sort)
ZIP_COUNT=${#ZIPS[@]}

if [ "$ZIP_COUNT" -eq 0 ]; then
  echo -e "${YELLOW}No ZIP files found${NC}"
  exit 1
fi

echo -e "${GREEN}Found $ZIP_COUNT ZIP files${NC}"
echo ""

# Process ZIPs in parallel. The runner's status is captured instead of letting
# `set -e` abort, so one bad ZIP does not suppress the summary below.
run_status=0
if [ "$PARALLEL_CMD" = "parallel" ]; then
  # GNU parallel (best performance)
  printf '%s\n' "${ZIPS[@]}" \
    | parallel -j "$PARALLEL_JOBS" --bar process_zip || run_status=$?
else
  # xargs fallback; NUL-delimited so quotes, backslashes and blanks in paths
  # reach process_zip unmodified.
  printf '%s\0' "${ZIPS[@]}" \
    | xargs -0 -P "$PARALLEL_JOBS" -I {} bash -c 'process_zip "$@"' _ {} \
    || run_status=$?
fi

# Final stats
echo ""
echo -e "${GREEN}========================================${NC}"
if [ "$run_status" -eq 0 ]; then
  echo -e "${GREEN}✅ Extraction Complete!${NC}"
else
  echo -e "${YELLOW}⚠ Extraction finished, but some ZIPs failed (status $run_status)${NC}"
fi
echo -e "${GREEN}========================================${NC}"
echo ""

FINAL_COUNT=$(find "$OUTPUT_DIR" -name "*.xml" 2>/dev/null | wc -l)
FINAL_SIZE=$(du -sh "$OUTPUT_DIR" 2>/dev/null | cut -f1)

echo -e "${GREEN}Total XMLs:${NC} $FINAL_COUNT files ($FINAL_SIZE)"
echo ""
echo -e "${BLUE}XMLs per state:${NC}"

for state in "${STATE_LIST[@]}"; do
  count=$(find "$OUTPUT_DIR/$state" -name "*.xml" 2>/dev/null | wc -l)
  size=$(du -sh "$OUTPUT_DIR/$state" 2>/dev/null | cut -f1)
  echo -e "  ${GREEN}$state:${NC} $count files ($size)"
done

echo ""
echo -e "Next step: Build index of these XMLs"
echo -e "  ${BLUE}python scripts/build_990_local_index.py --xmls-dir $OUTPUT_DIR${NC}"
echo ""

# Propagate a processing failure to the caller after the summary is shown.
exit "$run_status"