kamkol committed on
Commit
06d2b05
·
1 Parent(s): ece2d3a

Fix tar extraction to handle nested directories

Browse files
Files changed (1) hide show
  1. streamlit_app.py +57 -1
streamlit_app.py CHANGED
@@ -65,12 +65,68 @@ def extract_packaged_data():
65
  # Extract the package
66
  try:
67
  with tarfile.open(PACKAGE_FILE, "r:gz") as tar:
 
 
 
 
 
68
  print("Extracting package...")
69
- tar.extractall(path=PROCESSED_DATA_DIR)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  print("Extraction complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
  return True
72
  except Exception as e:
73
  print(f"Error extracting package: {str(e)}")
 
 
74
  return False
75
  else:
76
  print(f"No packaged data found: {PACKAGE_FILE}")
 
65
  # Extract the package
66
  try:
67
  with tarfile.open(PACKAGE_FILE, "r:gz") as tar:
68
+ print("Examining tar file contents before extraction:")
69
+ for member in tar.getmembers():
70
+ print(f" File in archive: {member.name}")
71
+
72
+ # Extract files, handling potential nested directories
73
  print("Extracting package...")
74
+ for member in tar.getmembers():
75
+ # Skip directories
76
+ if member.isdir():
77
+ continue
78
+
79
+ # Get the basename and handle nested paths
80
+ # If file is in processed_data/something, extract just "something"
81
+ # If file is just something, extract as is
82
+ basename = os.path.basename(member.name)
83
+
84
+ # Determine target path
85
+ if basename == "document_chunks.pkl":
86
+ target_path = CHUNKS_FILE
87
+ elif "qdrant_vectorstore" in member.name:
88
+ # For Qdrant files, preserve the subdirectory structure
89
+ if member.name.startswith("processed_data/"):
90
+ # Remove 'processed_data/' prefix if it exists
91
+ relative_path = member.name[len("processed_data/"):]
92
+ else:
93
+ relative_path = member.name
94
+
95
+ target_path = PROCESSED_DATA_DIR / relative_path
96
+ else:
97
+ # Other files go directly in processed_data
98
+ target_path = PROCESSED_DATA_DIR / basename
99
+
100
+ # Create directories if needed
101
+ os.makedirs(os.path.dirname(target_path), exist_ok=True)
102
+
103
+ # Extract the file
104
+ print(f" Extracting {member.name} to {target_path}")
105
+ f = tar.extractfile(member)
106
+ if f is not None:
107
+ with open(target_path, "wb") as out_file:
108
+ out_file.write(f.read())
109
+
110
  print("Extraction complete")
111
+
112
+ # Verify extraction worked
113
+ print("Checking extracted files:")
114
+ if os.path.exists(CHUNKS_FILE):
115
+ print(f" {CHUNKS_FILE} exists: ✓")
116
+ else:
117
+ print(f" {CHUNKS_FILE} exists: ✗")
118
+
119
+ if os.path.exists(QDRANT_DIR):
120
+ print(f" {QDRANT_DIR} exists: ✓")
121
+ print(f" Contents: {os.listdir(QDRANT_DIR)}")
122
+ else:
123
+ print(f" {QDRANT_DIR} exists: ✗")
124
+
125
  return True
126
  except Exception as e:
127
  print(f"Error extracting package: {str(e)}")
128
+ import traceback
129
+ traceback.print_exc()
130
  return False
131
  else:
132
  print(f"No packaged data found: {PACKAGE_FILE}")