Spaces:
Sleeping
Sleeping
Commit ·
4ea0a84
1
Parent(s): fffe8f4
Change path setting
Browse files- api/data_pipeline.py +17 -5
api/data_pipeline.py
CHANGED
|
@@ -221,7 +221,8 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 221 |
Returns:
|
| 222 |
bool: True if the file should be processed, False otherwise
|
| 223 |
"""
|
| 224 |
-
|
|
|
|
| 225 |
file_name = os.path.basename(file_path)
|
| 226 |
|
| 227 |
if use_inclusion:
|
|
@@ -232,7 +233,9 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 232 |
if included_dirs:
|
| 233 |
for included in included_dirs:
|
| 234 |
clean_included = included.strip("./").rstrip("/")
|
| 235 |
-
if
|
|
|
|
|
|
|
| 236 |
is_included = True
|
| 237 |
break
|
| 238 |
|
|
@@ -261,14 +264,23 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 261 |
# Check if file is in an excluded directory
|
| 262 |
for excluded in excluded_dirs:
|
| 263 |
clean_excluded = excluded.strip("./").rstrip("/")
|
| 264 |
-
if
|
|
|
|
|
|
|
| 265 |
is_excluded = True
|
| 266 |
break
|
| 267 |
|
| 268 |
-
# Check if file matches excluded file patterns
|
| 269 |
if not is_excluded:
|
| 270 |
for excluded_file in excluded_files:
|
| 271 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 272 |
is_excluded = True
|
| 273 |
break
|
| 274 |
|
|
|
|
| 221 |
Returns:
|
| 222 |
bool: True if the file should be processed, False otherwise
|
| 223 |
"""
|
| 224 |
+
# Normalize the file path for cross-platform compatibility
|
| 225 |
+
normalized_path = os.path.normpath(file_path).replace(os.sep, '/')
|
| 226 |
file_name = os.path.basename(file_path)
|
| 227 |
|
| 228 |
if use_inclusion:
|
|
|
|
| 233 |
if included_dirs:
|
| 234 |
for included in included_dirs:
|
| 235 |
clean_included = included.strip("./").rstrip("/")
|
| 236 |
+
# Match the directory name against whole path components, with or without a leading dot (e.g. "git" matches both "git" and ".git")
|
| 237 |
+
path_parts = normalized_path.split('/')
|
| 238 |
+
if clean_included in path_parts or f".{clean_included}" in path_parts:
|
| 239 |
is_included = True
|
| 240 |
break
|
| 241 |
|
|
|
|
| 264 |
# Check if file is in an excluded directory
|
| 265 |
for excluded in excluded_dirs:
|
| 266 |
clean_excluded = excluded.strip("./").rstrip("/")
|
| 267 |
+
# Match the directory name against whole path components, with or without a leading dot (e.g. "git" matches both "git" and ".git")
|
| 268 |
+
path_parts = normalized_path.split('/')
|
| 269 |
+
if clean_excluded in path_parts or f".{clean_excluded}" in path_parts:
|
| 270 |
is_excluded = True
|
| 271 |
break
|
| 272 |
|
| 273 |
+
# Check if file matches excluded file patterns
|
| 274 |
if not is_excluded:
|
| 275 |
for excluded_file in excluded_files:
|
| 276 |
+
# Match "*.ext" wildcard patterns by file-name suffix; match any other pattern by exact file name
|
| 277 |
+
if excluded_file.startswith("*."):
|
| 278 |
+
# Handle wildcard patterns like "*.pyc"
|
| 279 |
+
extension = excluded_file[1:] # Remove the *
|
| 280 |
+
if file_name.endswith(extension):
|
| 281 |
+
is_excluded = True
|
| 282 |
+
break
|
| 283 |
+
elif file_name == excluded_file:
|
| 284 |
is_excluded = True
|
| 285 |
break
|
| 286 |
|