d365stuff-chat / scripts /build-vector-store.sh
mafzaal's picture
Update .dockerignore and .gitignore to include new directories and log files; enhance app.py with Dropbox download functionality and logging; modify build-vector-store.sh for improved error handling and options; remove obsolete posts and pipeline.py.
4542bf4
#!/bin/bash
# Build the d365stuff vector store locally and package it as a zip archive.
# Usage: ./scripts/build-vector-store.sh [--force-recreate] [--cleanup] [--help]
#
# Environment overrides read here:
#   OUTPUT_DIR  Directory for the build log and artifacts (default: ./artifacts)
#   DATA_DIR    Directory containing the source posts
#               (default: /home/mafzaal/source/d365stuff/posts/)
set -e # Exit on error

# Default values. Each one can be overridden from the environment so the
# script is not tied to a single checkout location.
OUTPUT_DIR=${OUTPUT_DIR:-"./artifacts"}
DATA_DIR=${DATA_DIR:-"/home/mafzaal/source/d365stuff/posts/"}
# One timestamp per run; it names the log file and the zip archive.
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
LOG_FILE="$OUTPUT_DIR/build_${TIMESTAMP}.log"
# Print the usage summary (options and recognised environment variables) on
# stdout, then terminate the script with exit status 0.
show_help() {
  printf '%s\n' \
    'Usage: ./scripts/build-vector-store.sh [OPTIONS]' \
    'Options:' \
    '--force-recreate Force recreation of the vector store' \
    '--cleanup Clean up temporary files after build' \
    '--help Show this help message' \
    'Environment variables:' \
    'FORCE_RECREATE Set to "true" to force recreation of the vector store' \
    'OUTPUT_DIR Directory to save stats and artifacts (default: ./artifacts)' \
    'USE_CHUNKING Set to "false" to disable document chunking' \
    'SHOULD_SAVE_STATS Set to "false" to disable saving document statistics'
  exit 0
}
# Print a timestamped message on stdout and append it to the build log.
# Globals:   LOG_FILE (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout
# Returns:   0; a logging problem never aborts the build
#
# The script calls this before the output directory is created (unknown-option
# handling) and after changing the working directory. In both cases LOG_FILE
# may point into a directory that does not exist. Under `set -e` a failing
# append would terminate the whole script, so the file write is skipped when
# the directory is missing and the message is still shown on stdout.
log() {
  local entry
  entry="[$(date '+%Y-%m-%d %H:%M:%S')] ${1-}"
  printf '%s\n' "$entry"
  if [[ -n "${LOG_FILE:-}" && -d "$(dirname -- "$LOG_FILE")" ]]; then
    # Bash reports a failed redirection on stderr itself; `|| true` only keeps
    # that failure from killing the build.
    printf '%s\n' "$entry" >> "$LOG_FILE" || true
  fi
}
# Verify that the external tools the build depends on are on PATH.
# Arguments: optional list of tool names; defaults to "uv zip" when none given
# Outputs:   via log(), one error line naming every missing tool
# Returns:   0 when all tools are found; otherwise exits the script with 1
check_requirements() {
  local -a required=("$@")
  local -a missing_tools=()
  local tool # keep the loop variable out of the global scope

  if (( ${#required[@]} == 0 )); then
    required=(uv zip)
  fi

  for tool in "${required[@]}"; do
    if ! command -v "$tool" > /dev/null 2>&1; then
      missing_tools+=("$tool")
    fi
  done

  if (( ${#missing_tools[@]} > 0 )); then
    log "Error: The following required tools are missing: ${missing_tools[*]}"
    exit 1
  fi
}
# Parse command line arguments.
# Results:
#   FORCE_RECREATE  "--force-recreate" when recreation is requested, else ""
#   CLEANUP         "true" when --cleanup was given, else "false"
#
# The help text advertises a FORCE_RECREATE environment variable, so a value
# of "true" inherited from the environment turns the flag on instead of being
# discarded.
if [[ "${FORCE_RECREATE:-}" == "true" ]]; then
  FORCE_RECREATE="--force-recreate"
else
  FORCE_RECREATE=""
fi
CLEANUP=false
while [[ $# -gt 0 ]]; do
  case "$1" in
    --force-recreate)
      FORCE_RECREATE="--force-recreate"
      shift
      ;;
    --cleanup)
      CLEANUP=true
      shift
      ;;
    --help)
      show_help
      ;;
    *)
      # Written straight to stderr: log() appends to a file under OUTPUT_DIR,
      # which has not been created yet at this point.
      printf 'Error: Unknown option %s\n' "$1" >&2
      # show_help ends with `exit 0`; the subshell confines that exit so a
      # usage error can still be reported with a failing status.
      (show_help) >&2
      exit 1
      ;;
  esac
done
# Main build flow: prepare the output directory, run the pipeline, zip the
# resulting vector store, optionally remove the unzipped copy, list artifacts.

# Create output directory and log file
mkdir -p "$OUTPUT_DIR"
touch "$LOG_FILE"

# Check requirements
check_requirements

# Validate data directory
if [[ ! -d "$DATA_DIR" ]]; then
  log "Error: Data directory '$DATA_DIR' does not exist"
  exit 1
fi

vector_store_dir="$OUTPUT_DIR/vector_store_d365stuff"
zip_file="$OUTPUT_DIR/vector_store_${TIMESTAMP}.zip"

log "Starting vector store build"
log "Output directory: $OUTPUT_DIR"
log "Force recreate: ${FORCE_RECREATE:-false}"
log "Cleanup after build: $CLEANUP"

# Run pipeline in CI mode
log "Running pipeline..."
# ${FORCE_RECREATE:+...} passes the flag as one word when it is set and
# expands to nothing (not an empty argument) when it is not.
if ! uv run -m lets_talk.pipeline ${FORCE_RECREATE:+"$FORCE_RECREATE"} \
  --ci \
  --data-dir "$DATA_DIR" \
  --data-dir-pattern "*.md" \
  --base-url "https://www.d365stuff.co/" \
  --blog-base-url "https://www.d365stuff.co/" \
  --output-dir "$OUTPUT_DIR" \
  --vector-storage-path "$vector_store_dir" \
  --collection-name d365stuff_documents; then
  log "Error: Pipeline execution failed"
  exit 1
fi

# Check if vector store directory exists and create zip
if [[ -d "$vector_store_dir" ]]; then
  log "Creating vector store zip file..."
  # zip runs from the original working directory. Changing into the store
  # directory first made every relative path used afterwards (the zip target,
  # LOG_FILE, the cleanup and listing paths) resolve to the wrong place when
  # OUTPUT_DIR is relative, which is the default.
  # NOTE(review): archive entries carry the OUTPUT_DIR-prefixed path, as they
  # did for absolute OUTPUT_DIR values before; confirm this is the layout the
  # consumer of the zip expects.
  if ! zip -r "$zip_file" "$vector_store_dir/"; then
    log "Error: Failed to create zip file at $zip_file"
    exit 1
  fi
  log "Vector store zip created at $zip_file"
else
  log "Error: Vector store directory not found at $vector_store_dir"
  exit 1
fi

# Cleanup if requested
if [[ "$CLEANUP" == true ]]; then
  log "Cleaning up temporary files..."
  # :? aborts instead of running rm -rf on an empty path.
  rm -rf -- "${vector_store_dir:?}"
  log "Cleanup completed"
fi

log "Build completed successfully"
log "Artifacts available in $OUTPUT_DIR:"
ls -la "$OUTPUT_DIR" | tee -a "$LOG_FILE"
exit 0