Spaces:
Sleeping
Sleeping
File size: 3,604 Bytes
9ed2c7f 4542bf4 9ed2c7f 4542bf4 9ed2c7f 4542bf4 9ed2c7f 4542bf4 9ed2c7f 4542bf4 9ed2c7f 4542bf4 9ed2c7f 4542bf4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 | #!/bin/bash
# Script to build the vector store locally.
# Usage: ./scripts/build-vector-store.sh [--force-recreate] [--cleanup] [--help]
#
# Environment overrides read here:
#   OUTPUT_DIR  Directory for the build log and artifacts (default: ./artifacts)
#   DATA_DIR    Directory containing the source posts
#               (default: /home/mafzaal/source/d365stuff/posts/)
set -e # Exit on error

# Default values. Both directories fall back to their defaults only when the
# variable is unset or empty, so the script can run on machines that do not
# have the original author's directory layout.
OUTPUT_DIR=${OUTPUT_DIR:-"./artifacts"}
DATA_DIR=${DATA_DIR:-"/home/mafzaal/source/d365stuff/posts/"}

# One timestamp is shared by the log file name and (later) the zip file name.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$OUTPUT_DIR/build_${TIMESTAMP}.log"
# Print the usage summary to stdout, then terminate the script with status 0.
# Takes no arguments. Because it calls `exit`, it never returns to the caller.
show_help() {
  printf '%s\n' \
    'Usage: ./scripts/build-vector-store.sh [OPTIONS]' \
    'Options:' \
    '--force-recreate Force recreation of the vector store' \
    '--cleanup Clean up temporary files after build' \
    '--help Show this help message' \
    'Environment variables:' \
    'FORCE_RECREATE Set to "true" to force recreation of the vector store' \
    'OUTPUT_DIR Directory to save stats and artifacts (default: ./artifacts)' \
    'USE_CHUNKING Set to "false" to disable document chunking' \
    'SHOULD_SAVE_STATS Set to "false" to disable saving document statistics'
  exit 0
}
# Write a timestamped message to stdout and, when possible, append it to
# $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
# Returns:   status of the final write (tee or printf)
log() {
  local line
  line="[$(date '+%Y-%m-%d %H:%M:%S')] $1"

  # The log directory may not exist yet (e.g. an error is reported before the
  # output directory has been created). In that case tee would fail, and under
  # `set -e` that failure would abort the script, so fall back to stdout only.
  if [[ -n "${LOG_FILE:-}" && -d "$(dirname -- "$LOG_FILE")" ]]; then
    printf '%s\n' "$line" | tee -a "$LOG_FILE"
  else
    printf '%s\n' "$line"
  fi
}
# Verify that the external commands this script relies on (uv, zip) can be
# found on PATH.
# Arguments: none
# Outputs:   on failure, one error line (via log) listing every missing tool
# Returns:   0 when all tools are present; otherwise exits the script with 1
check_requirements() {
  # Keep the loop variable local so it cannot overwrite a global named `tool`.
  local tool
  local -a missing_tools=()

  for tool in uv zip; do
    command -v "$tool" > /dev/null 2>&1 || missing_tools+=("$tool")
  done

  if ((${#missing_tools[@]} > 0)); then
    log "Error: The following required tools are missing: ${missing_tools[*]}"
    exit 1
  fi
}
# Parse command line arguments
#
# FORCE_RECREATE holds the flag forwarded to the pipeline ("" when disabled).
# The help text documents FORCE_RECREATE=true as an environment switch, so an
# inherited value of "true" is translated into the flag instead of being
# discarded; any other inherited value is reset to "".
if [[ "${FORCE_RECREATE:-}" == "true" ]]; then
  FORCE_RECREATE="--force-recreate"
else
  FORCE_RECREATE=""
fi
CLEANUP=false

# Apply the command line options to FORCE_RECREATE and CLEANUP.
# Globals:   FORCE_RECREATE (written), CLEANUP (written)
# Arguments: the script's command line arguments
# Outputs:   for an unknown option, an error line and the help text on stderr
# Returns:   0 after consuming all arguments; exits 0 for --help and 1 for an
#            unknown option
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --force-recreate)
        FORCE_RECREATE="--force-recreate"
        shift
        ;;
      --cleanup)
        CLEANUP=true
        shift
        ;;
      --help)
        show_help
        ;;
      *)
        # Report directly on stderr: the log directory has not been created
        # at this point, so the file-backed logger cannot be relied on.
        printf 'Error: Unknown option %s\n' "$1" >&2
        # show_help ends with `exit 0`; run it in a subshell so that a usage
        # error still terminates the script with a failing status.
        (show_help) >&2
        exit 1
        ;;
    esac
  done
}

parse_args "$@"
# Create output directory and log file
mkdir -p "$OUTPUT_DIR"
touch "$LOG_FILE"

# Check requirements
check_requirements

# Validate data directory
if [[ ! -d "$DATA_DIR" ]]; then
  log "Error: Data directory '$DATA_DIR' does not exist"
  exit 1
fi

# Names shared by the pipeline, zip and cleanup steps below.
vector_store_name="vector_store_d365stuff"
vector_store_dir="$OUTPUT_DIR/$vector_store_name"
zip_name="vector_store_${TIMESTAMP}.zip"

log "Starting vector store build"
log "Output directory: $OUTPUT_DIR"
log "Force recreate: ${FORCE_RECREATE:-false}"
log "Cleanup after build: $CLEANUP"

# Run pipeline in CI mode.
# Arguments are collected in an array so every value reaches the pipeline as
# exactly one word; the optional force flag is added only when it is set.
pipeline_args=()
if [[ -n "$FORCE_RECREATE" ]]; then
  pipeline_args+=("$FORCE_RECREATE")
fi
pipeline_args+=(
  --ci
  --data-dir "$DATA_DIR"
  --data-dir-pattern "*.md"
  --base-url "https://www.d365stuff.co/"
  --blog-base-url "https://www.d365stuff.co/"
  --output-dir "$OUTPUT_DIR"
  --vector-storage-path "$vector_store_dir"
  --collection-name d365stuff_documents
)

log "Running pipeline..."
if ! uv run -m lets_talk.pipeline "${pipeline_args[@]}"; then
  log "Error: Pipeline execution failed"
  exit 1
fi

# Check if vector store directory exists and create zip
if [[ -d "$vector_store_dir" ]]; then
  log "Creating vector store zip file..."
  # Run zip from inside OUTPUT_DIR in a subshell. The script's own working
  # directory must not change: OUTPUT_DIR and LOG_FILE may be relative paths
  # (the default is ./artifacts) and are used again below. The archive holds
  # the store under a top-level "vector_store_d365stuff/" directory.
  if ! (cd -- "$OUTPUT_DIR" && zip -r "$zip_name" "$vector_store_name"); then
    log "Error: Failed to create vector store zip file"
    exit 1
  fi
  log "Vector store zip created at $OUTPUT_DIR/$zip_name"
else
  log "Error: Vector store directory not found at $vector_store_dir"
  exit 1
fi

# Cleanup if requested
if [[ "$CLEANUP" == true ]]; then
  log "Cleaning up temporary files..."
  # ${var:?} aborts instead of running rm -rf on an empty path.
  rm -rf -- "${vector_store_dir:?}"
  log "Cleanup completed"
fi

log "Build completed successfully"
log "Artifacts available in $OUTPUT_DIR:"
ls -la "$OUTPUT_DIR" | tee -a "$LOG_FILE"
exit 0
|