bhavinmatariya committed on
Commit
e00c5eb
·
1 Parent(s): 3701b8b

remove debugging points and fix the errors

Browse files
Files changed (1) hide show
  1. api/data_pipeline.py +20 -45
api/data_pipeline.py CHANGED
@@ -204,21 +204,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
204
  logger.info(f"Excluded files: {excluded_files}")
205
 
206
  logger.info(f"Reading documents from {path}")
207
-
208
- # Debug: Check if path exists and list some files
209
- if os.path.exists(path):
210
- try:
211
- all_files = []
212
- for root, dirs, files in os.walk(path):
213
- for file in files[:5]: # Limit to first 5 files for logging
214
- all_files.append(os.path.join(root, file))
215
- if len(all_files) >= 10: # Stop after 10 files total
216
- break
217
- logger.info(f"Debug: Found {len(all_files)} sample files in {path}: {all_files[:5]}")
218
- except Exception as e:
219
- logger.error(f"Debug: Error listing files in {path}: {e}")
220
- else:
221
- logger.error(f"Debug: Path does not exist: {path}")
222
 
223
  def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
224
  excluded_dirs: List[str], excluded_files: List[str]) -> bool:
@@ -246,11 +231,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
246
 
247
  # Check if file is in an included directory
248
  if included_dirs:
 
 
 
 
249
  for included in included_dirs:
250
  clean_included = included.strip("./").rstrip("/")
251
- # Check if the directory appears in the path - handle both .git and git patterns
252
- path_parts = normalized_path.split('/')
253
- if clean_included in path_parts or f".{clean_included}" in path_parts:
 
 
 
254
  is_included = True
255
  break
256
 
@@ -277,11 +269,18 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
277
  is_excluded = False
278
 
279
  # Check if file is in an excluded directory
 
 
 
 
280
  for excluded in excluded_dirs:
281
  clean_excluded = excluded.strip("./").rstrip("/")
282
- # Check if the directory appears in the path - handle both .git and git patterns
283
- path_parts = normalized_path.split('/')
284
- if clean_excluded in path_parts or f".{clean_excluded}" in path_parts:
 
 
 
285
  is_excluded = True
286
  break
287
 
@@ -304,20 +303,10 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
304
  # Process code files first
305
  for ext in code_extensions:
306
  files = glob.glob(f"{path}/**/*{ext}", recursive=True)
307
- logger.info(f"Debug: Found {len(files)} files with extension {ext}")
308
- if files and len(files) <= 3: # Log specific files if not too many
309
- logger.info(f"Debug: Files for {ext}: {files}")
310
-
311
- processed_count = 0
312
- excluded_count = 0
313
  for file_path in files:
314
  # Check if file should be processed based on inclusion/exclusion rules
315
  if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
316
- excluded_count += 1
317
- if excluded_count <= 3: # Log first few excluded files for debugging
318
- logger.info(f"Debug: Excluding file: {file_path}")
319
  continue
320
- processed_count += 1
321
 
322
  try:
323
  with open(file_path, "r", encoding="utf-8") as f:
@@ -351,26 +340,14 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
351
  documents.append(doc)
352
  except Exception as e:
353
  logger.error(f"Error reading {file_path}: {e}")
354
-
355
- logger.info(f"Debug: For extension {ext}: processed {processed_count}, excluded {excluded_count} files")
356
 
357
  # Then process documentation files
358
  for ext in doc_extensions:
359
  files = glob.glob(f"{path}/**/*{ext}", recursive=True)
360
- logger.info(f"Debug: Found {len(files)} documentation files with extension {ext}")
361
- if files and len(files) <= 3: # Log specific files if not too many
362
- logger.info(f"Debug: Doc files for {ext}: {files}")
363
-
364
- processed_count = 0
365
- excluded_count = 0
366
  for file_path in files:
367
  # Check if file should be processed based on inclusion/exclusion rules
368
  if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
369
- excluded_count += 1
370
- if excluded_count <= 3: # Log first few excluded files for debugging
371
- logger.info(f"Debug: Excluding doc file: {file_path}")
372
  continue
373
- processed_count += 1
374
 
375
  try:
376
  with open(file_path, "r", encoding="utf-8") as f:
@@ -397,8 +374,6 @@ def read_all_documents(path: str, is_ollama_embedder: bool = None, excluded_dirs
397
  documents.append(doc)
398
  except Exception as e:
399
  logger.error(f"Error reading {file_path}: {e}")
400
-
401
- logger.info(f"Debug: For doc extension {ext}: processed {processed_count}, excluded {excluded_count} files")
402
 
403
  logger.info(f"Found {len(documents)} documents")
404
  return documents
 
204
  logger.info(f"Excluded files: {excluded_files}")
205
 
206
  logger.info(f"Reading documents from {path}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
  def should_process_file(file_path: str, use_inclusion: bool, included_dirs: List[str], included_files: List[str],
209
  excluded_dirs: List[str], excluded_files: List[str]) -> bool:
 
231
 
232
  # Check if file is in an included directory
233
  if included_dirs:
234
+ # We need to check relative to the repository root, not absolute paths
235
+ relative_path = os.path.relpath(file_path, path)
236
+ relative_normalized = relative_path.replace(os.sep, '/')
237
+
238
  for included in included_dirs:
239
  clean_included = included.strip("./").rstrip("/")
240
+ # Check if the directory appears in the relative path
241
+ if f"/{clean_included}/" in f"/{relative_normalized}" or f"/.{clean_included}/" in f"/{relative_normalized}":
242
+ is_included = True
243
+ break
244
+ # Also check if the relative path starts with the included directory
245
+ if relative_normalized.startswith(f"{clean_included}/") or relative_normalized.startswith(f".{clean_included}/"):
246
  is_included = True
247
  break
248
 
 
269
  is_excluded = False
270
 
271
  # Check if file is in an excluded directory
272
+ # We need to check relative to the repository root, not absolute paths
273
+ relative_path = os.path.relpath(file_path, path)
274
+ relative_normalized = relative_path.replace(os.sep, '/')
275
+
276
  for excluded in excluded_dirs:
277
  clean_excluded = excluded.strip("./").rstrip("/")
278
+ # Check if the directory appears in the relative path
279
+ if f"/{clean_excluded}/" in f"/{relative_normalized}" or f"/.{clean_excluded}/" in f"/{relative_normalized}":
280
+ is_excluded = True
281
+ break
282
+ # Also check if the relative path starts with the excluded directory
283
+ if relative_normalized.startswith(f"{clean_excluded}/") or relative_normalized.startswith(f".{clean_excluded}/"):
284
  is_excluded = True
285
  break
286
 
 
303
  # Process code files first
304
  for ext in code_extensions:
305
  files = glob.glob(f"{path}/**/*{ext}", recursive=True)
 
 
 
 
 
 
306
  for file_path in files:
307
  # Check if file should be processed based on inclusion/exclusion rules
308
  if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
 
 
 
309
  continue
 
310
 
311
  try:
312
  with open(file_path, "r", encoding="utf-8") as f:
 
340
  documents.append(doc)
341
  except Exception as e:
342
  logger.error(f"Error reading {file_path}: {e}")
 
 
343
 
344
  # Then process documentation files
345
  for ext in doc_extensions:
346
  files = glob.glob(f"{path}/**/*{ext}", recursive=True)
 
 
 
 
 
 
347
  for file_path in files:
348
  # Check if file should be processed based on inclusion/exclusion rules
349
  if not should_process_file(file_path, use_inclusion_mode, included_dirs, included_files, excluded_dirs, excluded_files):
 
 
 
350
  continue
 
351
 
352
  try:
353
  with open(file_path, "r", encoding="utf-8") as f:
 
374
  documents.append(doc)
375
  except Exception as e:
376
  logger.error(f"Error reading {file_path}: {e}")
 
 
377
 
378
  logger.info(f"Found {len(documents)} documents")
379
  return documents