"""Package processed data (document chunks + Qdrant store) into one tar.gz.

Run as a script to build ``processed_data.tar.gz`` in the current working
directory from the contents of ``processed_data/``.
"""

import contextlib
import os
import pickle
import shutil
import tarfile
from pathlib import Path

# Define paths (all relative to the current working directory)
PROCESSED_DATA_DIR = Path("processed_data")
CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
TARGET_PACKAGE = "processed_data.tar.gz"


def _count_chunks(chunks_path: Path) -> int:
    """Unpickle *chunks_path* and return the number of chunks it holds.

    Raises:
        ValueError: If the file cannot be unpickled or the loaded object has
            no length. The underlying error is chained as ``__cause__``.
    """
    try:
        # NOTE(security): pickle.load executes arbitrary code embedded in the
        # file. Only run this on chunk files produced by a trusted pipeline.
        with chunks_path.open("rb") as f:
            return len(pickle.load(f))
    except Exception as e:
        raise ValueError(f"Invalid document chunks file: {e}") from e


def _add_directory_tree(
    tar: tarfile.TarFile, source_dir: Path, archive_root: str
) -> None:
    """Add everything under *source_dir* to *tar* beneath *archive_root*.

    Directory entries are written explicitly so that empty directories (and
    an entirely empty *source_dir*) survive a round trip through the archive.
    Names are visited in sorted order to keep the member order deterministic.
    """
    for root, dirs, files in os.walk(source_dir):
        dirs.sort()  # in-place: also fixes the order os.walk descends in
        rel_root = os.path.relpath(root, source_dir)
        if rel_root == os.curdir:
            arc_dir = archive_root
        else:
            arc_dir = os.path.join(archive_root, rel_root)

        # recursive=False: add only the directory entry itself; its contents
        # are added one by one below / on later iterations of the walk.
        tar.add(root, arcname=arc_dir, recursive=False)

        for file_name in sorted(files):
            file_path = os.path.join(root, file_name)
            arcname = os.path.join(arc_dir, file_name)
            print(f"Adding: {file_path} -> {arcname}")
            tar.add(file_path, arcname=arcname)


def package_data() -> None:
    """
    Package the processed data into a single compressed file for deployment.

    This creates a tar.gz file that contains the document chunks and Qdrant
    vector database, which can be uploaded to Hugging Face. The archive holds
    ``document_chunks.pkl`` at its root and the vector store under
    ``qdrant_vectorstore/``.

    Raises:
        FileNotFoundError: If the chunks file is missing (or is not a regular
            file), or the Qdrant path is missing (or is not a directory).
        ValueError: If the chunks file cannot be loaded with pickle.
    """
    print("Starting data packaging process...")

    # Verify source files exist *and* are of the expected kind: a plain
    # existence check would let a file at the Qdrant path slip through and
    # produce an archive that silently lacks the vector store.
    if not CHUNKS_FILE.is_file():
        raise FileNotFoundError(f"Document chunks file not found: {CHUNKS_FILE}")

    if not QDRANT_DIR.is_dir():
        raise FileNotFoundError(f"Qdrant directory not found: {QDRANT_DIR}")

    # Verify chunks file is valid
    chunk_count = _count_chunks(CHUNKS_FILE)
    print(f"Verified document chunks file. Contains {chunk_count} chunks.")

    # Create tar.gz file
    print(f"Creating package file: {TARGET_PACKAGE}")
    try:
        with tarfile.open(TARGET_PACKAGE, "w:gz") as tar:
            # Add chunks file
            tar.add(CHUNKS_FILE, arcname=CHUNKS_FILE.name)

            # Add Qdrant directory
            _add_directory_tree(tar, QDRANT_DIR, "qdrant_vectorstore")
    except BaseException:
        # Do not leave a truncated archive behind that could be uploaded by
        # mistake; the original error is re-raised unchanged.
        with contextlib.suppress(OSError):
            os.remove(TARGET_PACKAGE)
        raise

    # Verify the tarfile was created
    if os.path.exists(TARGET_PACKAGE):
        size_mb = os.path.getsize(TARGET_PACKAGE) / (1024 * 1024)
        print(f"Package created successfully: {TARGET_PACKAGE} ({size_mb:.2f} MB)")
        print("\nInstructions:")
        print(f"1. Upload {TARGET_PACKAGE} to your Hugging Face Space")
        print("2. The app will automatically extract it on startup")
    else:
        print("Failed to create package file")


if __name__ == "__main__":
    package_data()