Spaces:
Sleeping
Sleeping
Commit ·
e00c5eb
1
Parent(s): 3701b8b
remove debugging points and fix the errors
Browse files - api/data_pipeline.py +20 -45
api/data_pipeline.py
CHANGED
|
@@ -204,21 +204,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 204 |
logger.info(f"Excluded files: {excluded_files}")
|
| 205 |
|
| 206 |
logger.info(f"Reading documents from {path}")
|
| 207 |
-
|
| 208 |
-
# Debug: Check if path exists and list some files
|
| 209 |
-
if os.path.exists(path):
|
| 210 |
-
try:
|
| 211 |
-
all_files = []
|
| 212 |
-
for root, dirs, files in os.walk(path):
|
| 213 |
-
for file in files[:5]: # Limit to first 5 files for logging
|
| 214 |
-
all_files.append(os.path.join(root, file))
|
| 215 |
-
if len(all_files) >= 10: # Stop after 10 files total
|
| 216 |
-
break
|
| 217 |
-
logger.info(f"Debug: Found {len(all_files)} sample files in {path}: {all_files[:5]}")
|
| 218 |
-
except Exception as e:
|
| 219 |
-
logger.error(f"Debug: Error listing files in {path}: {e}")
|
| 220 |
-
else:
|
| 221 |
-
logger.error(f"Debug: Path does not exist: {path}")
|
| 222 |
|
| 223 |
def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
|
| 224 |
excluded_dirs: List[str], excluded_files: List[str]) -> bool:
|
|
@@ -246,11 +231,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 246 |
|
| 247 |
# Check if file is in an included directory
|
| 248 |
if included_dirs:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
for included in included_dirs:
|
| 250 |
clean_included = included.strip("./").rstrip("/")
|
| 251 |
-
# Check if the directory appears in the
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
| 254 |
is_included = True
|
| 255 |
break
|
| 256 |
|
|
@@ -277,11 +269,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 277 |
is_excluded = False
|
| 278 |
|
| 279 |
# Check if file is in an excluded directory
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
for excluded in excluded_dirs:
|
| 281 |
clean_excluded = excluded.strip("./").rstrip("/")
|
| 282 |
-
# Check if the directory appears in the
|
| 283 |
-
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
| 285 |
is_excluded = True
|
| 286 |
break
|
| 287 |
|
|
@@ -304,20 +303,10 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 304 |
# Process code files first
|
| 305 |
for ext in code_extensions:
|
| 306 |
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
|
| 307 |
-
logger.info(f"Debug: Found {len(files)} files with extension {ext}")
|
| 308 |
-
if files and len(files) <= 3: # Log specific files if not too many
|
| 309 |
-
logger.info(f"Debug: Files for {ext}: {files}")
|
| 310 |
-
|
| 311 |
-
processed_count = 0
|
| 312 |
-
excluded_count = 0
|
| 313 |
for file_path in files:
|
| 314 |
# Check if file should be processed based on inclusion/exclusion rules
|
| 315 |
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
|
| 316 |
-
excluded_count += 1
|
| 317 |
-
if excluded_count <= 3: # Log first few excluded files for debugging
|
| 318 |
-
logger.info(f"Debug: Excluding file: {file_path}")
|
| 319 |
continue
|
| 320 |
-
processed_count += 1
|
| 321 |
|
| 322 |
try:
|
| 323 |
with open(file_path, "r", encoding="utf-8") as f:
|
|
@@ -351,26 +340,14 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 351 |
documents.append(doc)
|
| 352 |
except Exception as e:
|
| 353 |
logger.error(f"Error reading {file_path}: {e}")
|
| 354 |
-
|
| 355 |
-
logger.info(f"Debug: For extension {ext}: processed {processed_count}, excluded {excluded_count} files")
|
| 356 |
|
| 357 |
# Then process documentation files
|
| 358 |
for ext in doc_extensions:
|
| 359 |
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
|
| 360 |
-
logger.info(f"Debug: Found {len(files)} documentation files with extension {ext}")
|
| 361 |
-
if files and len(files) <= 3: # Log specific files if not too many
|
| 362 |
-
logger.info(f"Debug: Doc files for {ext}: {files}")
|
| 363 |
-
|
| 364 |
-
processed_count = 0
|
| 365 |
-
excluded_count = 0
|
| 366 |
for file_path in files:
|
| 367 |
# Check if file should be processed based on inclusion/exclusion rules
|
| 368 |
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
|
| 369 |
-
excluded_count += 1
|
| 370 |
-
if excluded_count <= 3: # Log first few excluded files for debugging
|
| 371 |
-
logger.info(f"Debug: Excluding doc file: {file_path}")
|
| 372 |
continue
|
| 373 |
-
processed_count += 1
|
| 374 |
|
| 375 |
try:
|
| 376 |
with open(file_path, "r", encoding="utf-8") as f:
|
|
@@ -397,8 +374,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
|
|
| 397 |
documents.append(doc)
|
| 398 |
except Exception as e:
|
| 399 |
logger.error(f"Error reading {file_path}: {e}")
|
| 400 |
-
|
| 401 |
-
logger.info(f"Debug: For doc extension {ext}: processed {processed_count}, excluded {excluded_count} files")
|
| 402 |
|
| 403 |
logger.info(f"Found {len(documents)} documents")
|
| 404 |
return documents
|
|
|
|
| 204 |
logger.info(f"Excluded files: {excluded_files}")
|
| 205 |
|
| 206 |
logger.info(f"Reading documents from {path}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
|
| 208 |
def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
|
| 209 |
excluded_dirs: List[str], excluded_files: List[str]) -> bool:
|
|
|
|
| 231 |
|
| 232 |
# Check if file is in an included directory
|
| 233 |
if included_dirs:
|
| 234 |
+
# We need to check relative to the repository root, not absolute paths
|
| 235 |
+
relative_path = os.path.relpath(file_path, path)
|
| 236 |
+
relative_normalized = relative_path.replace(os.sep, '/')
|
| 237 |
+
|
| 238 |
for included in included_dirs:
|
| 239 |
clean_included = included.strip("./").rstrip("/")
|
| 240 |
+
# Check if the directory appears in the relative path
|
| 241 |
+
if f"/{clean_included}/" in f"/{relative_normalized}" or f"/.{clean_included}/" in f"/{relative_normalized}":
|
| 242 |
+
is_included = True
|
| 243 |
+
break
|
| 244 |
+
# Also check if the relative path starts with the included directory
|
| 245 |
+
if relative_normalized.startswith(f"{clean_included}/") or relative_normalized.startswith(f".{clean_included}/"):
|
| 246 |
is_included = True
|
| 247 |
break
|
| 248 |
|
|
|
|
| 269 |
is_excluded = False
|
| 270 |
|
| 271 |
# Check if file is in an excluded directory
|
| 272 |
+
# We need to check relative to the repository root, not absolute paths
|
| 273 |
+
relative_path = os.path.relpath(file_path, path)
|
| 274 |
+
relative_normalized = relative_path.replace(os.sep, '/')
|
| 275 |
+
|
| 276 |
for excluded in excluded_dirs:
|
| 277 |
clean_excluded = excluded.strip("./").rstrip("/")
|
| 278 |
+
# Check if the directory appears in the relative path
|
| 279 |
+
if f"/{clean_excluded}/" in f"/{relative_normalized}" or f"/.{clean_excluded}/" in f"/{relative_normalized}":
|
| 280 |
+
is_excluded = True
|
| 281 |
+
break
|
| 282 |
+
# Also check if the relative path starts with the excluded directory
|
| 283 |
+
if relative_normalized.startswith(f"{clean_excluded}/") or relative_normalized.startswith(f".{clean_excluded}/"):
|
| 284 |
is_excluded = True
|
| 285 |
break
|
| 286 |
|
|
|
|
| 303 |
# Process code files first
|
| 304 |
for ext in code_extensions:
|
| 305 |
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
for file_path in files:
|
| 307 |
# Check if file should be processed based on inclusion/exclusion rules
|
| 308 |
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
|
|
|
|
|
|
|
|
|
|
| 309 |
continue
|
|
|
|
| 310 |
|
| 311 |
try:
|
| 312 |
with open(file_path, "r", encoding="utf-8") as f:
|
|
|
|
| 340 |
documents.append(doc)
|
| 341 |
except Exception as e:
|
| 342 |
logger.error(f"Error reading {file_path}: {e}")
|
|
|
|
|
|
|
| 343 |
|
| 344 |
# Then process documentation files
|
| 345 |
for ext in doc_extensions:
|
| 346 |
files = glob.glob(f"{path}/**/*{ext}", recursive=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
for file_path in files:
|
| 348 |
# Check if file should be processed based on inclusion/exclusion rules
|
| 349 |
if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
|
|
|
|
|
|
|
|
|
|
| 350 |
continue
|
|
|
|
| 351 |
|
| 352 |
try:
|
| 353 |
with open(file_path, "r", encoding="utf-8") as f:
|
|
|
|
| 374 |
documents.append(doc)
|
| 375 |
except Exception as e:
|
| 376 |
logger.error(f"Error reading {file_path}: {e}")
|
|
|
|
|
|
|
| 377 |
|
| 378 |
logger.info(f"Found {len(documents)} documents")
|
| 379 |
return documents
|