AB_Testing_RAG_Agent / find_data.py
kamkol's picture
Add file system search and update path finding
db89085
import os
import time
def search_for_directory(target_dir, max_depth=5, start_paths=None, skip_dirs=None):
    """Find directories named ``target_dir`` below a set of starting paths.

    The search is a depth-limited recursive walk that does not descend
    through symbolic links. Progress and results are printed to stdout.

    Args:
        target_dir: The directory name (a single path component) to find.
        max_depth: Deepest level, relative to each start path (level 0),
            whose contents are still listed.
        start_paths: Paths to start searching from. If None, a default list
            is used: the current and parent directory plus '/home', '/app',
            '/mnt' and '/tmp'. Start paths that are not existing
            directories are ignored.
        skip_dirs: Directories that must not be listed. If None, the
            pseudo/system file systems '/proc', '/sys', '/dev', '/run' and
            '/snap' are skipped.

    Returns:
        A list with one path per distinct matching directory, spelled the
        way it was first reached from its start path.
    """
    if start_paths is None:
        start_paths = ['.', '..', '/home', '/app', '/mnt', '/tmp']
    if skip_dirs is None:
        # Skipped for efficiency and to avoid permission errors.
        skip_dirs = ('/proc', '/sys', '/dev', '/run', '/snap')
    # Normalise once so that relative spellings such as '../proc' still
    # match an entry like '/proc' during the walk.
    skip_set = {os.path.abspath(entry) for entry in skip_dirs}

    print(f"Searching for directory: {target_dir}")
    print(f"Starting search from: {start_paths}")

    found_paths = []
    # Real paths of matches already reported. Overlapping start paths
    # (e.g. '.' and '..') reach the same directory more than once.
    seen_matches = set()

    def search_dir(path, depth=0):
        if depth > max_depth:
            return
        try:
            # Compare the absolute form: ``path`` is built from the start
            # path and may be relative.
            if os.path.abspath(path) in skip_set:
                return

            for item in os.listdir(path):
                full_path = os.path.join(path, item)
                if not os.path.isdir(full_path):
                    continue

                if item == target_dir:
                    real_path = os.path.realpath(full_path)
                    if real_path not in seen_matches:
                        seen_matches.add(real_path)
                        print(f"FOUND: {full_path}")
                        found_paths.append(full_path)
                        # Print contents to verify it's what we're looking for
                        try:
                            contents = os.listdir(full_path)
                            print(f"Contents: {contents}")
                        except Exception as e:
                            print(f"Could not list contents: {e}")

                # Never follow symlinks: they can form cycles or leave the
                # intended search area.
                if not os.path.islink(full_path):
                    search_dir(full_path, depth + 1)
        except (PermissionError, FileNotFoundError):
            # Unreadable or vanished directories are expected; skip quietly.
            pass
        except Exception as e:
            # Report anything else, but keep the overall search going.
            print(f"Error accessing {path}: {e}")

    start_time = time.perf_counter()

    # Start the search from each starting path
    for start_path in start_paths:
        if os.path.isdir(start_path):
            search_dir(start_path)

    elapsed_time = time.perf_counter() - start_time
    print(f"Search completed in {elapsed_time:.2f} seconds")
    print(f"Found {len(found_paths)} matching directories:")
    for path in found_paths:
        print(f" - {path}")

    return found_paths
if __name__ == "__main__":
    # Locate both processed_data and .streamlit to see how the directories
    # are laid out relative to each other.
    for wanted_dir in ('processed_data', '.streamlit'):
        search_for_directory(wanted_dir)

    # Look for the document_chunks.pkl file itself as well.
    print("\nSearching for document_chunks.pkl file...")
    pickle_name = 'document_chunks.pkl'
    for folder, _subdirs, filenames in os.walk('/home/user'):
        if pickle_name not in filenames:
            continue
        print(f"FOUND FILE: {os.path.join(folder, pickle_name)}")

    # Dump the environment variables whose names hint at locations.
    print("\nEnvironment variables:")
    name_markers = ('PATH', 'DIR', 'HOME')
    for name, setting in os.environ.items():
        if any(marker in name for marker in name_markers):
            print(f"{name}: {setting}")