# Scraped from a Hugging Face Space file view
# (Space status: Sleeping; file size: 2,371 bytes; ref ece2d3a).
import os
import shutil
import pickle
import tarfile
from pathlib import Path
# Filesystem layout: the processed-data directory holds the pickled document
# chunks and the Qdrant vector store; both are bundled into TARGET_PACKAGE.
PROCESSED_DATA_DIR = Path("processed_data")
CHUNKS_FILE = PROCESSED_DATA_DIR.joinpath("document_chunks.pkl")
QDRANT_DIR = PROCESSED_DATA_DIR.joinpath("qdrant_vectorstore")
# Name of the tar.gz archive written by package_data().
TARGET_PACKAGE = "processed_data.tar.gz"
def package_data(chunks_file=None, qdrant_dir=None, target_package=None) -> None:
    """
    Package the processed data into a single compressed file for deployment.

    This creates a tar.gz file that contains the document chunks and
    Qdrant vector database, which can be uploaded to Hugging Face.

    Archive layout: the chunks file is stored under its base name, and every
    file below the Qdrant directory is stored under ``qdrant_vectorstore/``
    with its path relative to that directory preserved.

    Args:
        chunks_file: Path (str or Path) of the pickled document chunks.
            Defaults to the module-level ``CHUNKS_FILE``.
        qdrant_dir: Path (str or Path) of the Qdrant storage directory.
            Defaults to the module-level ``QDRANT_DIR``.
        target_package: Path (str or Path) of the archive to create.
            Defaults to the module-level ``TARGET_PACKAGE``.

    Raises:
        FileNotFoundError: If the chunks file is not an existing file or the
            Qdrant path is not an existing directory.
        ValueError: If the chunks file cannot be unpickled or the loaded
            object has no length; the underlying error is chained as
            ``__cause__``.
        OSError, tarfile.TarError: If writing the archive fails. In that case
            no file is left at ``target_package`` by this call, and any
            archive that was already there is kept.
    """
    # Resolve defaults at call time so the module constants stay the single
    # source of truth for the standard layout.
    chunks_file = Path(CHUNKS_FILE if chunks_file is None else chunks_file)
    qdrant_dir = Path(QDRANT_DIR if qdrant_dir is None else qdrant_dir)
    target_package = Path(
        TARGET_PACKAGE if target_package is None else target_package
    )

    print("Starting data packaging process...")

    # Verify source files exist. A plain existence check is not enough: a
    # regular file at the Qdrant path would make os.walk yield nothing and
    # silently produce an archive without the vector store.
    if not chunks_file.is_file():
        raise FileNotFoundError(f"Document chunks file not found: {chunks_file}")
    if not qdrant_dir.is_dir():
        raise FileNotFoundError(f"Qdrant directory not found: {qdrant_dir}")

    # Verify chunks file is valid.
    # NOTE(security): pickle.load can execute arbitrary code while loading.
    # Only run this against a chunks file produced by your own pipeline.
    try:
        with open(chunks_file, "rb") as f:
            chunks = pickle.load(f)
        chunk_count = len(chunks)
    except Exception as e:
        raise ValueError(f"Invalid document chunks file: {e}") from e
    print(f"Verified document chunks file. Contains {chunk_count} chunks.")

    # Build the archive under a temporary name and move it into place only
    # once it is complete, so a failure never leaves a truncated archive at
    # the final path.
    partial_package = target_package.with_name(target_package.name + ".part")
    print(f"Creating package file: {target_package}")
    try:
        with tarfile.open(partial_package, "w:gz") as tar:
            # Add chunks file
            tar.add(chunks_file, arcname=chunks_file.name)

            # Add Qdrant directory, file by file, under a fixed top-level
            # folder name inside the archive.
            for root, _dirs, files in os.walk(qdrant_dir):
                for file in files:
                    file_path = os.path.join(root, file)
                    arcname = os.path.join(
                        "qdrant_vectorstore",
                        os.path.relpath(file_path, qdrant_dir),
                    )
                    print(f"Adding: {file_path} -> {arcname}")
                    tar.add(file_path, arcname=arcname)
        os.replace(partial_package, target_package)
    finally:
        # After a successful replace the temporary file is already gone;
        # this only removes leftovers from a failed run.
        if partial_package.exists():
            os.remove(partial_package)

    # Reaching this point means the archive exists; any failure above raised.
    size_mb = os.path.getsize(target_package) / (1024 * 1024)
    print(f"Package created successfully: {target_package} ({size_mb:.2f} MB)")
    print("\nInstructions:")
    print(f"1. Upload {target_package} to your Hugging Face Space")
    print("2. The app will automatically extract it on startup")
# Run the packaging step only when this file is executed as a script.
if __name__ == "__main__":
    package_data()