Spaces:

AtPeak
/

deepwiki-open

Sleeping

App Files Community

bhavinmatariya committed on Sep 29, 2025

Commit

e00c5eb

1 Parent(s): 3701b8b

remove debugging points and fix the errors

Browse files

Files changed (1) hide show

api/data_pipeline.py +20 -45

api/data_pipeline.py CHANGED Viewed

@@ -204,21 +204,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
         logger.info(f"Excluded files: {excluded_files}")
     logger.info(f"Reading documents from {path}")
-    # Debug: Check if path exists and list some files
-    if os.path.exists(path):
-        try:
-            all_files = []
-            for root, dirs, files in os.walk(path):
-                for file in files[:5]:  # Limit to first 5 files for logging
-                    all_files.append(os.path.join(root, file))
-                if len(all_files) >= 10:  # Stop after 10 files total
-                    break
-            logger.info(f"Debug: Found {len(all_files)} sample files in {path}: {all_files[:5]}")
-        except Exception as e:
-            logger.error(f"Debug: Error listing files in {path}: {e}")
-    else:
-        logger.error(f"Debug: Path does not exist: {path}")
     def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
                            excluded_dirs: List[str], excluded_files: List[str]) -> bool:
@@ -246,11 +231,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
             # Check if file is in an included directory
             if included_dirs:
                 for included in included_dirs:
                     clean_included = included.strip("./").rstrip("/")
-                    # Check if the directory appears in the path - handle both .git and git patterns
-                    path_parts = normalized_path.split('/')
-                    if clean_included in path_parts or f".{clean_included}" in path_parts:
                         is_included = True
                         break
@@ -277,11 +269,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
             is_excluded = False
             # Check if file is in an excluded directory
             for excluded in excluded_dirs:
                 clean_excluded = excluded.strip("./").rstrip("/")
-                # Check if the directory appears in the path - handle both .git and git patterns
-                path_parts = normalized_path.split('/')
-                if clean_excluded in path_parts or f".{clean_excluded}" in path_parts:
                     is_excluded = True
                     break
@@ -304,20 +303,10 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
     # Process code files first
     for ext in code_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
-        logger.info(f"Debug: Found {len(files)} files with extension {ext}")
-        if files and len(files) <= 3:  # Log specific files if not too many
-            logger.info(f"Debug: Files for {ext}: {files}")
-        processed_count = 0
-        excluded_count = 0
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
-                excluded_count += 1
-                if excluded_count <= 3:  # Log first few excluded files for debugging
-                    logger.info(f"Debug: Excluding file: {file_path}")
                 continue
-            processed_count += 1
             try:
                 with open(file_path, "r", encoding="utf-8") as f:
@@ -351,26 +340,14 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
                     documents.append(doc)
             except Exception as e:
                 logger.error(f"Error reading {file_path}: {e}")
-        logger.info(f"Debug: For extension {ext}: processed {processed_count}, excluded {excluded_count} files")
     # Then process documentation files
     for ext in doc_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
-        logger.info(f"Debug: Found {len(files)} documentation files with extension {ext}")
-        if files and len(files) <= 3:  # Log specific files if not too many
-            logger.info(f"Debug: Doc files for {ext}: {files}")
-        processed_count = 0
-        excluded_count = 0
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
-                excluded_count += 1
-                if excluded_count <= 3:  # Log first few excluded files for debugging
-                    logger.info(f"Debug: Excluding doc file: {file_path}")
                 continue
-            processed_count += 1
             try:
                 with open(file_path, "r", encoding="utf-8") as f:
@@ -397,8 +374,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
                     documents.append(doc)
             except Exception as e:
                 logger.error(f"Error reading {file_path}: {e}")
-        logger.info(f"Debug: For doc extension {ext}: processed {processed_count}, excluded {excluded_count} files")
     logger.info(f"Found {len(documents)} documents")
     return documents

         logger.info(f"Excluded files: {excluded_files}")
     logger.info(f"Reading documents from {path}")
     def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
                            excluded_dirs: List[str], excluded_files: List[str]) -> bool:
             # Check if file is in an included directory
             if included_dirs:
+                # We need to check relative to the repository root, not absolute paths
+                relative_path = os.path.relpath(file_path, path)
+                relative_normalized = relative_path.replace(os.sep, '/')
                 for included in included_dirs:
                     clean_included = included.strip("./").rstrip("/")
+                    # Check if the directory appears in the relative path
+                    if f"/{clean_included}/" in f"/{relative_normalized}" or f"/.{clean_included}/" in f"/{relative_normalized}":
+                        is_included = True
+                        break
+                    # Also check if the relative path starts with the included directory
+                    if relative_normalized.startswith(f"{clean_included}/") or relative_normalized.startswith(f".{clean_included}/"):
                         is_included = True
                         break
             is_excluded = False
             # Check if file is in an excluded directory
+            # We need to check relative to the repository root, not absolute paths
+            relative_path = os.path.relpath(file_path, path)
+            relative_normalized = relative_path.replace(os.sep, '/')
             for excluded in excluded_dirs:
                 clean_excluded = excluded.strip("./").rstrip("/")
+                # Check if the directory appears in the relative path
+                if f"/{clean_excluded}/" in f"/{relative_normalized}" or f"/.{clean_excluded}/" in f"/{relative_normalized}":
+                    is_excluded = True
+                    break
+                # Also check if the relative path starts with the excluded directory
+                if relative_normalized.startswith(f"{clean_excluded}/") or relative_normalized.startswith(f".{clean_excluded}/"):
                     is_excluded = True
                     break
     # Process code files first
     for ext in code_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
                 continue
             try:
                 with open(file_path, "r", encoding="utf-8") as f:
                     documents.append(doc)
             except Exception as e:
                 logger.error(f"Error reading {file_path}: {e}")
     # Then process documentation files
     for ext in doc_extensions:
         files = glob.glob(f"{path}/**/*{ext}", recursive=True)
         for file_path in files:
             # Check if file should be processed based on inclusion/exclusion rules
             if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
                 continue
             try:
                 with open(file_path, "r", encoding="utf-8") as f:
                     documents.append(doc)
             except Exception as e:
                 logger.error(f"Error reading {file_path}: {e}")
     logger.info(f"Found {len(documents)} documents")
     return documents