File size: 2,371 Bytes
ece2d3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import os
import shutil
import pickle
import tarfile
from pathlib import Path

# Locations used by the packaging step: the processed-data directory, the
# two artifacts inside it, and the name of the archive that gets written.
PROCESSED_DATA_DIR = Path("processed_data")
CHUNKS_FILE = PROCESSED_DATA_DIR.joinpath("document_chunks.pkl")
QDRANT_DIR = PROCESSED_DATA_DIR.joinpath("qdrant_vectorstore")
TARGET_PACKAGE = "processed_data.tar.gz"

def package_data(chunks_file=None, qdrant_dir=None, target_package=None) -> None:
    """
    Package the processed data into a single compressed file for deployment.

    This creates a tar.gz file that contains the document chunks and
    Qdrant vector database, which can be uploaded to Hugging Face.

    The archive is first written to a temporary sibling file and only moved
    to its final name once it has been written completely, so a failure
    never leaves a truncated package behind or overwrites an earlier one.

    Args:
        chunks_file: Path of the pickled document chunks. Defaults to the
            module-level ``CHUNKS_FILE``.
        qdrant_dir: Directory holding the Qdrant vector store. Defaults to
            the module-level ``QDRANT_DIR``.
        target_package: Path of the tar.gz file to create. Defaults to the
            module-level ``TARGET_PACKAGE``.

    Raises:
        FileNotFoundError: If the chunks file does not exist, or the Qdrant
            path does not exist or is not a directory.
        ValueError: If the chunks file cannot be unpickled or the loaded
            object has no length; the original error is attached as the
            cause.
    """
    chunks_path = Path(CHUNKS_FILE if chunks_file is None else chunks_file)
    qdrant_path = Path(QDRANT_DIR if qdrant_dir is None else qdrant_dir)
    package_path = Path(
        TARGET_PACKAGE if target_package is None else target_package
    )

    print("Starting data packaging process...")

    # Verify source files exist
    if not chunks_path.exists():
        raise FileNotFoundError(f"Document chunks file not found: {chunks_path}")

    # A plain file at this path would make os.walk yield nothing and produce
    # a package without any vector data, so require an actual directory.
    if not qdrant_path.is_dir():
        raise FileNotFoundError(f"Qdrant directory not found: {qdrant_path}")

    # Verify chunks file is valid.
    # NOTE(security): pickle.load can execute arbitrary code. Only run this
    # on a chunks file produced locally by a trusted pipeline.
    try:
        with open(chunks_path, 'rb') as f:
            chunks = pickle.load(f)
        chunk_count = len(chunks)
    except Exception as e:
        # Unpickling can raise almost any exception type, hence the broad
        # catch; chain it so the root cause stays visible in the traceback.
        raise ValueError(f"Invalid document chunks file: {str(e)}") from e
    print(f"Verified document chunks file. Contains {chunk_count} chunks.")

    # Create tar.gz file (written to a temp name in the same directory so the
    # final os.replace is a same-filesystem rename).
    print(f"Creating package file: {package_path}")
    tmp_path = package_path.with_name(package_path.name + ".tmp")
    try:
        with tarfile.open(tmp_path, "w:gz") as tar:
            # Add chunks file
            tar.add(chunks_path, arcname=chunks_path.name)

            # Add Qdrant directory
            for root, dirs, files in os.walk(qdrant_path):
                # Sort in place so traversal (and archive order) is
                # deterministic regardless of filesystem listing order.
                dirs.sort()
                for file in sorted(files):
                    file_path = os.path.join(root, file)
                    arcname = os.path.join(
                        "qdrant_vectorstore",
                        os.path.relpath(file_path, qdrant_path),
                    )
                    print(f"Adding: {file_path} -> {arcname}")
                    tar.add(file_path, arcname=arcname)
        os.replace(tmp_path, package_path)
    finally:
        # After a successful replace the temp file is gone; this only
        # removes a partial archive left by a failure.
        if tmp_path.exists():
            tmp_path.unlink()

    # Reaching this point means the archive was written and moved into place
    # (any failure above raises), so report success directly.
    size_mb = package_path.stat().st_size / (1024 * 1024)
    print(f"Package created successfully: {package_path} ({size_mb:.2f} MB)")
    print("\nInstructions:")
    print(f"1. Upload {package_path} to your Hugging Face Space")
    print("2. The app will automatically extract it on startup")

# Run the packaging step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    package_data()