Spaces:

kamkol
/

AB_Testing_RAG_Agent

Sleeping

App Files Community

kamkol committed on Apr 30, 2025

Commit

db89085

1 Parent(s): 525d5c5

Add file system search and update path finding

Browse files

Files changed (2) hide show

find_data.py +91 -0
streamlit_app.py +49 -10

find_data.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import os
+import time
def search_for_directory(target_dir, max_depth=5, start_paths=None):
    """
    Search for directories named ``target_dir`` below a set of starting paths.

    Progress and results are printed while the search runs. Symbolic links
    are never descended into, and a fixed set of system directories is
    skipped.

    Args:
        target_dir: Exact directory name to look for (one path component).
        max_depth: How many levels below each start path may be listed;
            0 lists only the start path itself.
        start_paths: A path, or a list of paths, to start searching from.
            If None, the default roots are used:
            '.', '..', '/home', '/app', '/mnt', '/tmp'.

    Returns:
        List of matching directory paths. Each physical directory is
        reported once, spelled via the start path through which it was
        first reached.
    """
    if start_paths is None:
        start_paths = ['.', '..', '/home', '/app', '/mnt', '/tmp']
    elif isinstance(start_paths, (str, os.PathLike)):
        # A bare path would otherwise be iterated character by character.
        start_paths = [start_paths]
    print(f"Searching for directory: {target_dir}")
    print(f"Starting search from: {start_paths}")
    found_paths = []
    # Directories to skip for efficiency and to avoid permission errors
    skip_dirs = {'/proc', '/sys', '/dev', '/run', '/snap'}
    # Resolved paths of matches already reported; start paths may overlap.
    reported = set()
    # Resolved directory -> shallowest depth at which it has been listed.
    visited_depth = {}

    def search_dir(path, real_path, depth=0):
        # ``path`` is the spelling shown to the user; ``real_path`` is the
        # resolved absolute path used for the skip list and de-duplication.
        if depth > max_depth or real_path in skip_dirs:
            return
        # An earlier visit at the same or a shallower depth already covered
        # everything reachable from here within the depth budget.
        if visited_depth.get(real_path, depth + 1) <= depth:
            return
        visited_depth[real_path] = depth
        try:
            names = sorted(os.listdir(path))
        except (PermissionError, FileNotFoundError):
            # Skip directories we can't access
            return
        except OSError as e:
            # Print other errors but continue searching
            print(f"Error accessing {path}: {e}")
            return
        for item in names:
            full_path = os.path.join(path, item)
            if not os.path.isdir(full_path):
                continue
            is_link = os.path.islink(full_path)
            child_real = (
                os.path.realpath(full_path)
                if is_link
                else os.path.join(real_path, item)
            )
            if item == target_dir and child_real not in reported:
                reported.add(child_real)
                print(f"FOUND: {full_path}")
                found_paths.append(full_path)
                # Print contents to verify it's what we're looking for
                try:
                    contents = os.listdir(full_path)
                    print(f"Contents: {contents}")
                except OSError as e:
                    print(f"Could not list contents: {e}")
            # Recurse into real subdirectories only; following links could
            # loop or escape the intended search area.
            if not is_link:
                search_dir(full_path, child_real, depth + 1)

    start_time = time.time()
    # Start the search from each starting path
    for start_path in start_paths:
        start_path = os.fspath(start_path)
        if os.path.isdir(start_path):
            search_dir(start_path, os.path.realpath(start_path))
    elapsed_time = time.time() - start_time
    print(f"Search completed in {elapsed_time:.2f} seconds")
    print(f"Found {len(found_paths)} matching directories:")
    for path in found_paths:
        print(f"  - {path}")
    return found_paths
if __name__ == "__main__":
    # Probe for both directory names to learn how the deployment is laid out.
    for wanted in ('processed_data', '.streamlit'):
        search_for_directory(wanted)

    # Independently look for the chunk pickle itself under the user home.
    print("\nSearching for document_chunks.pkl file...")
    pickle_name = 'document_chunks.pkl'
    for root, _dirs, files in os.walk('/home/user'):
        if pickle_name not in files:
            continue
        full_path = os.path.join(root, pickle_name)
        print(f"FOUND FILE: {full_path}")

    # Dump only the environment variables whose names hint at locations.
    print("\nEnvironment variables:")
    name_markers = ('PATH', 'DIR', 'HOME')
    for key, value in os.environ.items():
        if any(marker in key for marker in name_markers):
            print(f"{key}: {value}")

streamlit_app.py CHANGED Viewed

@@ -36,26 +36,65 @@ if not os.environ.get("OPENAI_API_KEY"):
 print(f"Current directory: {os.getcwd()}")
 print(f"Directory contents: {os.listdir('.')}")
 # Find the processed data directory
-# Try multiple possible paths
 possible_paths = [
-    "processed_data",
-    "/app/processed_data",
-    "../processed_data",
-    "./processed_data",
-    "/home/user/app/processed_data"
 ]
-# Find the first path that exists
 for path in possible_paths:
     print(f"Checking path: {path}")
     if os.path.exists(path):
         PROCESSED_DATA_DIR = Path(path)
         print(f"Found processed data at: {path}")
         print(f"Contents: {os.listdir(path)}")
-        break
-else:
-    # Default if none found
     PROCESSED_DATA_DIR = Path("processed_data")
     print(f"Using default processed data path: {PROCESSED_DATA_DIR}")

 print(f"Current directory: {os.getcwd()}")
 print(f"Directory contents: {os.listdir('.')}")
# Check for Hugging Face Spaces path - this is where uploaded files through UI should be
HF_SPACES_DIR = "/data"


def _list_dir_for_log(directory):
    """Return the entries of *directory*, or None if it cannot be listed.

    The listings printed below are purely diagnostic, so a path that is a
    regular file or is unreadable must not abort start-up.
    """
    try:
        return os.listdir(directory)
    except OSError:
        return None


if os.path.exists(HF_SPACES_DIR):
    print(f"Found Hugging Face Spaces data directory at {HF_SPACES_DIR}")
    print(f"Contents: {_list_dir_for_log(HF_SPACES_DIR)}")
else:
    print(f"No Hugging Face Spaces data directory found at {HF_SPACES_DIR}")
# Find the processed data directory
# Try multiple possible paths, including Hugging Face Spaces paths
possible_paths = [
    "/data/processed_data",                # HF Spaces UI uploaded files
    "/data/files/processed_data",          # Another possible HF path
    "/data/projects/processed_data",       # Another possible HF path
    "/home/user/processed_data",           # User home directory
    "/home/user/app/processed_data",       # App directory in user home
    "/app/processed_data",                 # Container app directory
    "processed_data",                      # Relative to working directory
    "../processed_data",                   # Parent directory
    "./processed_data",                    # Explicit current directory
]
# Stop at the first candidate that actually holds data (pickle files or a
# qdrant_vectorstore entry); candidates that merely exist are only logged.
found_pickle_files = False
for path in possible_paths:
    print(f"Checking path: {path}")
    if os.path.exists(path):
        PROCESSED_DATA_DIR = Path(path)
        print(f"Found processed data at: {path}")
        # List once and reuse the result for both the log line and the check.
        contents = _list_dir_for_log(path)
        print(f"Contents: {contents}")
        # Check if it has pickle files or a qdrant directory
        has_pickle = any(f.endswith('.pkl') for f in (contents or ()))
        has_qdrant = os.path.exists(os.path.join(path, 'qdrant_vectorstore'))
        if has_pickle or has_qdrant:
            print(f"Found data files in {path}")
            found_pickle_files = True
            break
    elif os.path.exists(Path(path).parent):
        parent_dir = Path(path).parent
        print(f"Parent directory exists: {parent_dir}")
        print(f"Contents: {_list_dir_for_log(parent_dir)}")
# If we didn't find a path with data files, try a more exhaustive search
if not found_pickle_files:
    print("No processed data found in standard locations, searching the file system...")
    # Try to find any directory with document_chunks.pkl
    for root, dirs, files in os.walk(HF_SPACES_DIR, topdown=True, followlinks=False):
        if 'document_chunks.pkl' in files:
            PROCESSED_DATA_DIR = Path(root)
            print(f"Found document_chunks.pkl in: {root}")
            found_pickle_files = True
            break
        # Avoid going too deep: clearing dirs in place stops os.walk from
        # descending below this directory.
        if root.count(os.sep) >= 5:
            dirs[:] = []
# If still not found, fall back to the default relative path (nothing is
# created here).
if not found_pickle_files:
    PROCESSED_DATA_DIR = Path("processed_data")
    print(f"Using default processed data path: {PROCESSED_DATA_DIR}")