github-actions[bot] committed on
Commit
2056715
·
1 Parent(s): 7cf4785

Auto-sync from demo at Mon Dec 29 08:44:44 UTC 2025

Browse files
Files changed (1) hide show
  1. graphgen/operators/read/read.py +5 -0
graphgen/operators/read/read.py CHANGED
@@ -53,6 +53,7 @@ def read(
53
  working_dir: Optional[str] = "cache",
54
  parallelism: int = 4,
55
  recursive: bool = True,
 
56
  **reader_kwargs: Any,
57
  ) -> ray.data.Dataset:
58
  """
@@ -63,6 +64,7 @@ def read(
63
  :param working_dir: Directory to cache intermediate files (PDF processing)
64
  :param parallelism: Number of parallel workers
65
  :param recursive: Whether to scan directories recursively
 
66
  :param reader_kwargs: Additional kwargs passed to readers
67
  :return: Ray Dataset containing all documents
68
  """
@@ -120,6 +122,9 @@ def read(
120
  }
121
  )
122
 
 
 
 
123
  logger.info("[READ] Successfully read files from %s", input_path)
124
  return combined_ds
125
 
 
53
  working_dir: Optional[str] = "cache",
54
  parallelism: int = 4,
55
  recursive: bool = True,
56
+ read_nums: Optional[int] = None,
57
  **reader_kwargs: Any,
58
  ) -> ray.data.Dataset:
59
  """
 
64
  :param working_dir: Directory to cache intermediate files (PDF processing)
65
  :param parallelism: Number of parallel workers
66
  :param recursive: Whether to scan directories recursively
67
+ :param read_nums: Limit the number of documents to read
68
  :param reader_kwargs: Additional kwargs passed to readers
69
  :return: Ray Dataset containing all documents
70
  """
 
122
  }
123
  )
124
 
125
+ if read_nums is not None:
126
+ combined_ds = combined_ds.limit(read_nums)
127
+
128
  logger.info("[READ] Successfully read files from %s", input_path)
129
  return combined_ds
130