Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
2056715
1
Parent(s):
7cf4785
Auto-sync from demo at Mon Dec 29 08:44:44 UTC 2025
Browse files
graphgen/operators/read/read.py
CHANGED
|
@@ -53,6 +53,7 @@ def read(
|
|
| 53 |
working_dir: Optional[str] = "cache",
|
| 54 |
parallelism: int = 4,
|
| 55 |
recursive: bool = True,
|
|
|
|
| 56 |
**reader_kwargs: Any,
|
| 57 |
) -> ray.data.Dataset:
|
| 58 |
"""
|
|
@@ -63,6 +64,7 @@ def read(
|
|
| 63 |
:param working_dir: Directory to cache intermediate files (PDF processing)
|
| 64 |
:param parallelism: Number of parallel workers
|
| 65 |
:param recursive: Whether to scan directories recursively
|
|
|
|
| 66 |
:param reader_kwargs: Additional kwargs passed to readers
|
| 67 |
:return: Ray Dataset containing all documents
|
| 68 |
"""
|
|
@@ -120,6 +122,9 @@ def read(
|
|
| 120 |
}
|
| 121 |
)
|
| 122 |
|
|
|
|
|
|
|
|
|
|
| 123 |
logger.info("[READ] Successfully read files from %s", input_path)
|
| 124 |
return combined_ds
|
| 125 |
|
|
|
|
| 53 |
working_dir: Optional[str] = "cache",
|
| 54 |
parallelism: int = 4,
|
| 55 |
recursive: bool = True,
|
| 56 |
+
read_nums: Optional[int] = None,
|
| 57 |
**reader_kwargs: Any,
|
| 58 |
) -> ray.data.Dataset:
|
| 59 |
"""
|
|
|
|
| 64 |
:param working_dir: Directory to cache intermediate files (PDF processing)
|
| 65 |
:param parallelism: Number of parallel workers
|
| 66 |
:param recursive: Whether to scan directories recursively
|
| 67 |
+
:param read_nums: Limit the number of documents to read
|
| 68 |
:param reader_kwargs: Additional kwargs passed to readers
|
| 69 |
:return: Ray Dataset containing all documents
|
| 70 |
"""
|
|
|
|
| 122 |
}
|
| 123 |
)
|
| 124 |
|
| 125 |
+
if read_nums is not None:
|
| 126 |
+
combined_ds = combined_ds.limit(read_nums)
|
| 127 |
+
|
| 128 |
logger.info("[READ] Successfully read files from %s", input_path)
|
| 129 |
return combined_ds
|
| 130 |
|