kamkol committed on
Commit
ece2d3a
·
1 Parent(s): db89085

Add simplified data packaging functionality

Browse files
Files changed (4) hide show
  1. .DS_Store +0 -0
  2. package_data.py +65 -0
  3. simple_package.py +38 -0
  4. streamlit_app.py +34 -63
.DS_Store CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
 
package_data.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ import pickle
4
+ import tarfile
5
+ from pathlib import Path
6
+
7
+ # Define paths
8
+ PROCESSED_DATA_DIR = Path("processed_data")
9
+ CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
10
+ QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
11
+ TARGET_PACKAGE = "processed_data.tar.gz"
12
+
13
+ def package_data():
14
+ """
15
+ Package the processed data into a single compressed file for deployment.
16
+
17
+ This creates a tar.gz file that contains the document chunks and
18
+ Qdrant vector database, which can be uploaded to Hugging Face.
19
+ """
20
+ print("Starting data packaging process...")
21
+
22
+ # Verify source files exist
23
+ if not os.path.exists(CHUNKS_FILE):
24
+ raise FileNotFoundError(f"Document chunks file not found: {CHUNKS_FILE}")
25
+
26
+ if not os.path.exists(QDRANT_DIR):
27
+ raise FileNotFoundError(f"Qdrant directory not found: {QDRANT_DIR}")
28
+
29
+ # Verify chunks file is valid
30
+ try:
31
+ with open(CHUNKS_FILE, 'rb') as f:
32
+ chunks = pickle.load(f)
33
+ print(f"Verified document chunks file. Contains {len(chunks)} chunks.")
34
+ except Exception as e:
35
+ raise ValueError(f"Invalid document chunks file: {str(e)}")
36
+
37
+ # Create tar.gz file
38
+ print(f"Creating package file: {TARGET_PACKAGE}")
39
+ with tarfile.open(TARGET_PACKAGE, "w:gz") as tar:
40
+ # Add chunks file
41
+ tar.add(CHUNKS_FILE, arcname=os.path.basename(CHUNKS_FILE))
42
+
43
+ # Add Qdrant directory
44
+ for root, dirs, files in os.walk(QDRANT_DIR):
45
+ for file in files:
46
+ file_path = os.path.join(root, file)
47
+ arcname = os.path.join(
48
+ "qdrant_vectorstore",
49
+ os.path.relpath(file_path, QDRANT_DIR)
50
+ )
51
+ print(f"Adding: {file_path} -> {arcname}")
52
+ tar.add(file_path, arcname=arcname)
53
+
54
+ # Verify the tarfile was created
55
+ if os.path.exists(TARGET_PACKAGE):
56
+ size_mb = os.path.getsize(TARGET_PACKAGE) / (1024 * 1024)
57
+ print(f"Package created successfully: {TARGET_PACKAGE} ({size_mb:.2f} MB)")
58
+ print("\nInstructions:")
59
+ print(f"1. Upload {TARGET_PACKAGE} to your Hugging Face Space")
60
+ print("2. The app will automatically extract it on startup")
61
+ else:
62
+ print("Failed to create package file")
63
+
64
+ if __name__ == "__main__":
65
+ package_data()
simple_package.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tarfile
3
+ from pathlib import Path
4
+
5
+ # Define paths
6
+ PROCESSED_DATA_DIR = Path("processed_data")
7
+ TARGET_PACKAGE = "processed_data.tar.gz"
8
+
9
+ def simple_package():
10
+ """
11
+ Package the processed data into a single compressed file without validation.
12
+ This creates a tar.gz file containing the entire processed_data directory.
13
+ """
14
+ print("Starting simple packaging process...")
15
+
16
+ # Check if folder exists
17
+ if not os.path.exists(PROCESSED_DATA_DIR):
18
+ print(f"ERROR: Directory {PROCESSED_DATA_DIR} not found!")
19
+ return
20
+
21
+ # Create tar.gz file
22
+ print(f"Creating package file: {TARGET_PACKAGE}")
23
+ with tarfile.open(TARGET_PACKAGE, "w:gz") as tar:
24
+ # Add the entire directory
25
+ tar.add(PROCESSED_DATA_DIR, arcname=PROCESSED_DATA_DIR.name)
26
+
27
+ # Verify the tarfile was created
28
+ if os.path.exists(TARGET_PACKAGE):
29
+ size_mb = os.path.getsize(TARGET_PACKAGE) / (1024 * 1024)
30
+ print(f"Package created successfully: {TARGET_PACKAGE} ({size_mb:.2f} MB)")
31
+ print("\nInstructions:")
32
+ print(f"1. Upload {TARGET_PACKAGE} to your Hugging Face Space")
33
+ print("2. The app will automatically extract it on startup")
34
+ else:
35
+ print("Failed to create package file")
36
+
37
+ if __name__ == "__main__":
38
+ simple_package()
streamlit_app.py CHANGED
@@ -2,6 +2,7 @@ import os
2
  import pickle
3
  import streamlit as st
4
  from pathlib import Path
 
5
  from dotenv import load_dotenv
6
  from langchain_openai.chat_models import ChatOpenAI
7
  from langchain_openai.embeddings import OpenAIEmbeddings
@@ -44,73 +45,43 @@ if os.path.exists(HF_SPACES_DIR):
44
  else:
45
  print(f"No Hugging Face Spaces data directory found at {HF_SPACES_DIR}")
46
 
47
- # Find the processed data directory
48
- # Try multiple possible paths, including Hugging Face Spaces paths
49
- possible_paths = [
50
- "/data/processed_data", # HF Spaces UI uploaded files
51
- "/data/files/processed_data", # Another possible HF path
52
- "/data/projects/processed_data", # Another possible HF path
53
- "/home/user/processed_data", # User home directory
54
- "/home/user/app/processed_data", # App directory in user home
55
- "/app/processed_data", # Container app directory
56
- "processed_data", # Relative to working directory
57
- "../processed_data", # Parent directory
58
- "./processed_data", # Explicit current directory
59
- ]
60
-
61
- # Find the first path that exists or has pickle files
62
- found_pickle_files = False
63
- for path in possible_paths:
64
- print(f"Checking path: {path}")
65
- if os.path.exists(path):
66
- PROCESSED_DATA_DIR = Path(path)
67
- print(f"Found processed data at: {path}")
68
- print(f"Contents: {os.listdir(path)}")
69
-
70
- # Check if it has pickle files or a qdrant directory
71
- has_pickle = any(f.endswith('.pkl') for f in os.listdir(path))
72
- has_qdrant = os.path.exists(os.path.join(path, 'qdrant_vectorstore'))
73
-
74
- if has_pickle or has_qdrant:
75
- print(f"Found data files in {path}")
76
- found_pickle_files = True
77
- break
78
- elif os.path.exists(Path(path).parent):
79
- print(f"Parent directory exists: {Path(path).parent}")
80
- print(f"Contents: {os.listdir(Path(path).parent)}")
81
-
82
- # If we didn't find a path with data files, try a more exhaustive search
83
- if not found_pickle_files:
84
- print("No processed data found in standard locations, searching the file system...")
85
- # Try to find any directory with document_chunks.pkl
86
- for root, dirs, files in os.walk('/data', topdown=True, followlinks=False):
87
- if 'document_chunks.pkl' in files:
88
- PROCESSED_DATA_DIR = Path(root)
89
- print(f"Found document_chunks.pkl in: {root}")
90
- found_pickle_files = True
91
- break
92
- # Avoid going too deep
93
- if root.count(os.sep) >= 5:
94
- dirs[:] = []
95
-
96
- # If still not found, use default and create it
97
- if not found_pickle_files:
98
- PROCESSED_DATA_DIR = Path("processed_data")
99
- print(f"Using default processed data path: {PROCESSED_DATA_DIR}")
100
-
101
- # Create directory if it doesn't exist (for logging)
102
- if not os.path.exists(PROCESSED_DATA_DIR):
103
- os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
104
- print(f"Created directory: {PROCESSED_DATA_DIR}")
105
-
106
- # Paths to pre-processed data
107
  CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
108
  QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
 
109
 
110
- # Print paths for debugging
111
- print(f"CHUNKS_FILE path: {CHUNKS_FILE}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  print(f"CHUNKS_FILE exists: {os.path.exists(CHUNKS_FILE)}")
113
- print(f"QDRANT_DIR path: {QDRANT_DIR}")
114
  print(f"QDRANT_DIR exists: {os.path.exists(QDRANT_DIR)}")
115
  if os.path.exists(QDRANT_DIR):
116
  print(f"QDRANT_DIR contents: {os.listdir(QDRANT_DIR)}")
 
2
  import pickle
3
  import streamlit as st
4
  from pathlib import Path
5
+ import tarfile
6
  from dotenv import load_dotenv
7
  from langchain_openai.chat_models import ChatOpenAI
8
  from langchain_openai.embeddings import OpenAIEmbeddings
 
45
  else:
46
  print(f"No Hugging Face Spaces data directory found at {HF_SPACES_DIR}")
47
 
48
+ # Paths to pre-processed data and package
49
+ PROCESSED_DATA_DIR = Path("processed_data")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  CHUNKS_FILE = PROCESSED_DATA_DIR / "document_chunks.pkl"
51
  QDRANT_DIR = PROCESSED_DATA_DIR / "qdrant_vectorstore"
52
+ PACKAGE_FILE = "processed_data.tar.gz"
53
 
54
+ # Extract packaged data if available
55
+ def extract_packaged_data():
56
+ """Extract the packaged data if it exists."""
57
+ if os.path.exists(PACKAGE_FILE):
58
+ print(f"Found packaged data: {PACKAGE_FILE}")
59
+
60
+ # Create processed_data directory if it doesn't exist
61
+ if not os.path.exists(PROCESSED_DATA_DIR):
62
+ os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
63
+ print(f"Created directory: {PROCESSED_DATA_DIR}")
64
+
65
+ # Extract the package
66
+ try:
67
+ with tarfile.open(PACKAGE_FILE, "r:gz") as tar:
68
+ print("Extracting package...")
69
+ tar.extractall(path=PROCESSED_DATA_DIR)
70
+ print("Extraction complete")
71
+ return True
72
+ except Exception as e:
73
+ print(f"Error extracting package: {str(e)}")
74
+ return False
75
+ else:
76
+ print(f"No packaged data found: {PACKAGE_FILE}")
77
+ return False
78
+
79
+ # Extract packaged data on startup
80
+ extract_packaged_data()
81
+
82
+ # Check if processed data exists
83
+ print(f"Checking for processed data...")
84
  print(f"CHUNKS_FILE exists: {os.path.exists(CHUNKS_FILE)}")
 
85
  print(f"QDRANT_DIR exists: {os.path.exists(QDRANT_DIR)}")
86
  if os.path.exists(QDRANT_DIR):
87
  print(f"QDRANT_DIR contents: {os.listdir(QDRANT_DIR)}")