kamkol committed on
Commit
db89085
·
1 Parent(s): 525d5c5

Add file system search and update path finding

Browse files
Files changed (2) hide show
  1. find_data.py +91 -0
  2. streamlit_app.py +49 -10
find_data.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+
4
def search_for_directory(target_dir, max_depth=5, start_paths=None):
    """
    Find directories named ``target_dir`` beneath a set of starting paths.

    Each starting path is scanned recursively down to ``max_depth`` levels.
    Symlinked directories can match by name but are never descended into.
    Progress and results are printed to stdout as a diagnostic aid.

    Args:
        target_dir: Exact directory name to look for (not a path or pattern).
        max_depth: Deepest level to scan; a starting path itself is level 0.
        start_paths: A single path or a list of paths to start from. If None,
            the roots '.', '..', '/home', '/app', '/mnt' and '/tmp' are used.

    Returns:
        List of matching directory paths in discovery order. A directory that
        is reachable from several starting paths is reported only once.
    """
    if start_paths is None:
        start_paths = ['.', '..', '/home', '/app', '/mnt', '/tmp']
    elif isinstance(start_paths, (str, os.PathLike)):
        # A lone path would otherwise be iterated character by character.
        start_paths = [start_paths]

    print(f"Searching for directory: {target_dir}")
    print(f"Starting search from: {start_paths}")

    found_paths = []
    # Resolved locations of matches already reported; the default roots
    # overlap ('.' lies inside '..'), so the same directory can be met twice.
    seen_matches = set()

    # Directories to skip for efficiency and to avoid permission errors
    skip_dirs = {'/proc', '/sys', '/dev', '/run', '/snap'}

    def record_match(full_path):
        """Report a matching directory once and print what it contains."""
        resolved = os.path.realpath(full_path)
        if resolved in seen_matches:
            return
        seen_matches.add(resolved)

        print(f"FOUND: {full_path}")
        found_paths.append(full_path)

        # Print contents to verify it's what we're looking for
        try:
            contents = os.listdir(full_path)
            print(f"Contents: {contents}")
        except OSError as e:
            print(f"Could not list contents: {e}")

    def search_dir(path, depth=0):
        if depth > max_depth:
            return

        try:
            # Compare the absolute form so that relative walks (for example
            # '../proc' when started one level below '/') are skipped too.
            if os.path.abspath(path) in skip_dirs:
                return

            for item in os.listdir(path):
                full_path = os.path.join(path, item)
                if not os.path.isdir(full_path):
                    continue

                # Check if this is our target
                if item == target_dir:
                    record_match(full_path)

                # Recurse into real subdirectories only; following symlinks
                # could loop forever or leave the intended search area.
                if not os.path.islink(full_path):
                    search_dir(full_path, depth + 1)
        except (PermissionError, FileNotFoundError):
            # Skip directories we can't access (or that vanished mid-scan)
            pass
        except Exception as e:
            # Best-effort diagnostic: report other errors but keep searching
            print(f"Error accessing {path}: {e}")

    start_time = time.time()

    # Start the search from each starting path that is an existing directory
    for start_path in start_paths:
        if os.path.isdir(start_path):
            search_dir(start_path)

    elapsed_time = time.time() - start_time
    print(f"Search completed in {elapsed_time:.2f} seconds")
    print(f"Found {len(found_paths)} matching directories:")
    for path in found_paths:
        print(f"  - {path}")

    return found_paths
74
+
75
if __name__ == "__main__":
    # Locate both the data directory and the Streamlit config directory to
    # understand how directories are organized on this machine.
    for wanted in ('processed_data', '.streamlit'):
        search_for_directory(wanted)

    # Look for the document_chunks.pkl file itself under the user's home.
    print("\nSearching for document_chunks.pkl file...")
    pickle_name = 'document_chunks.pkl'
    for folder, _subdirs, filenames in os.walk('/home/user'):
        if pickle_name not in filenames:
            continue
        print(f"FOUND FILE: {os.path.join(folder, pickle_name)}")

    # Dump path-like environment variables; they may hint at the layout.
    print("\nEnvironment variables:")
    interesting = ('PATH', 'DIR', 'HOME')
    for name, setting in os.environ.items():
        if any(token in name for token in interesting):
            print(f"{name}: {setting}")
streamlit_app.py CHANGED
@@ -36,26 +36,65 @@ if not os.environ.get("OPENAI_API_KEY"):
36
  print(f"Current directory: {os.getcwd()}")
37
  print(f"Directory contents: {os.listdir('.')}")
38
 
 
 
 
 
 
 
 
 
39
  # Find the processed data directory
40
- # Try multiple possible paths
41
  possible_paths = [
42
- "processed_data",
43
- "/app/processed_data",
44
- "../processed_data",
45
- "./processed_data",
46
- "/home/user/app/processed_data"
 
 
 
 
47
  ]
48
 
49
- # Find the first path that exists
 
50
  for path in possible_paths:
51
  print(f"Checking path: {path}")
52
  if os.path.exists(path):
53
  PROCESSED_DATA_DIR = Path(path)
54
  print(f"Found processed data at: {path}")
55
  print(f"Contents: {os.listdir(path)}")
56
- break
57
- else:
58
- # Default if none found
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  PROCESSED_DATA_DIR = Path("processed_data")
60
  print(f"Using default processed data path: {PROCESSED_DATA_DIR}")
61
 
 
36
  print(f"Current directory: {os.getcwd()}")
37
  print(f"Directory contents: {os.listdir('.')}")
38
 
39
def _list_dir(path):
    """Return the entries of ``path``, or None (after reporting) if it cannot be listed."""
    try:
        return os.listdir(path)
    except OSError as exc:
        # Start-up diagnostics must never crash the app over an unreadable path.
        print(f"Could not list {path}: {exc}")
        return None


# Check for Hugging Face Spaces path - this is where files uploaded through
# the UI are expected to be (assumption carried over from the original code).
HF_SPACES_DIR = "/data"
if os.path.isdir(HF_SPACES_DIR):
    print(f"Found Hugging Face Spaces data directory at {HF_SPACES_DIR}")
    hf_contents = _list_dir(HF_SPACES_DIR)
    if hf_contents is not None:
        print(f"Contents: {hf_contents}")
else:
    print(f"No Hugging Face Spaces data directory found at {HF_SPACES_DIR}")

# Find the processed data directory
# Try multiple possible paths, including Hugging Face Spaces paths.
# Order matters: the first candidate that actually holds data wins.
possible_paths = [
    "/data/processed_data",           # HF Spaces UI uploaded files
    "/data/files/processed_data",     # Another possible HF path
    "/data/projects/processed_data",  # Another possible HF path
    "/home/user/processed_data",      # User home directory
    "/home/user/app/processed_data",  # App directory in user home
    "/app/processed_data",            # Container app directory
    "processed_data",                 # Relative to working directory
    "../processed_data",              # Parent directory
    "./processed_data",               # Explicit current directory
]

# Pick the first candidate directory that contains pickle files or a
# qdrant_vectorstore entry; directories without data are only reported.
found_pickle_files = False
for path in possible_paths:
    print(f"Checking path: {path}")
    if os.path.isdir(path):
        print(f"Found processed data at: {path}")
        contents = _list_dir(path)
        if contents is None:
            continue
        print(f"Contents: {contents}")

        # Check if it has pickle files or a qdrant directory
        has_pickle = any(name.endswith('.pkl') for name in contents)
        has_qdrant = os.path.exists(os.path.join(path, 'qdrant_vectorstore'))

        if has_pickle or has_qdrant:
            PROCESSED_DATA_DIR = Path(path)
            print(f"Found data files in {path}")
            found_pickle_files = True
            break
    else:
        # Candidate is missing: show its parent to help diagnose the layout.
        parent = Path(path).parent
        if os.path.isdir(parent):
            print(f"Parent directory exists: {parent}")
            parent_contents = _list_dir(parent)
            if parent_contents is not None:
                print(f"Contents: {parent_contents}")

# If we didn't find a path with data files, try a more exhaustive search
if not found_pickle_files:
    print("No processed data found in standard locations, searching the file system...")
    # Try to find any directory with document_chunks.pkl
    for root, dirs, files in os.walk('/data', topdown=True, followlinks=False):
        if 'document_chunks.pkl' in files:
            PROCESSED_DATA_DIR = Path(root)
            print(f"Found document_chunks.pkl in: {root}")
            found_pickle_files = True
            break
        # Avoid going too deep: emptying dirs in place stops os.walk
        # (topdown mode) from descending below this directory.
        if root.count(os.sep) >= 5:
            dirs[:] = []

# If still not found, fall back to the default relative path. Nothing here
# creates the directory; NOTE(review): confirm it is created further down.
if not found_pickle_files:
    PROCESSED_DATA_DIR = Path("processed_data")
    print(f"Using default processed data path: {PROCESSED_DATA_DIR}")
100