Spaces:
Sleeping
Sleeping
github-actions[bot]
committed on
Commit
·
bf63ef4
1
Parent(s):
61172af
Auto-sync from demo at Tue Dec 23 07:51:53 UTC 2025
Browse files
graphgen/operators/read/read.py
CHANGED
|
@@ -50,7 +50,7 @@ def _build_reader(suffix: str, cache_dir: str | None, **reader_kwargs):
|
|
| 50 |
def read(
|
| 51 |
input_path: Union[str, List[str]],
|
| 52 |
allowed_suffix: Optional[List[str]] = None,
|
| 53 |
-
|
| 54 |
parallelism: int = 4,
|
| 55 |
recursive: bool = True,
|
| 56 |
**reader_kwargs: Any,
|
|
@@ -60,7 +60,7 @@ def read(
|
|
| 60 |
|
| 61 |
:param input_path: File or directory path(s) to read from
|
| 62 |
:param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
|
| 63 |
-
:param
|
| 64 |
:param parallelism: Number of parallel workers
|
| 65 |
:param recursive: Whether to scan directories recursively
|
| 66 |
:param reader_kwargs: Additional kwargs passed to readers
|
|
@@ -70,7 +70,7 @@ def read(
|
|
| 70 |
# 1. Scan all paths to discover files
|
| 71 |
logger.info("[READ] Scanning paths: %s", input_path)
|
| 72 |
scanner = ParallelFileScanner(
|
| 73 |
-
cache_dir=
|
| 74 |
allowed_suffix=allowed_suffix,
|
| 75 |
rescan=False,
|
| 76 |
max_workers=parallelism if parallelism > 0 else 1,
|
|
@@ -100,7 +100,7 @@ def read(
|
|
| 100 |
# 3. Create read tasks
|
| 101 |
read_tasks = []
|
| 102 |
for suffix, file_paths in files_by_suffix.items():
|
| 103 |
-
reader = _build_reader(suffix,
|
| 104 |
ds = reader.read(file_paths)
|
| 105 |
read_tasks.append(ds)
|
| 106 |
|
|
|
|
| 50 |
def read(
|
| 51 |
input_path: Union[str, List[str]],
|
| 52 |
allowed_suffix: Optional[List[str]] = None,
|
| 53 |
+
working_dir: Optional[str] = "cache",
|
| 54 |
parallelism: int = 4,
|
| 55 |
recursive: bool = True,
|
| 56 |
**reader_kwargs: Any,
|
|
|
|
| 60 |
|
| 61 |
:param input_path: File or directory path(s) to read from
|
| 62 |
:param allowed_suffix: List of allowed file suffixes (e.g., ['pdf', 'txt'])
|
| 63 |
+
:param working_dir: Directory to cache intermediate files (PDF processing)
|
| 64 |
:param parallelism: Number of parallel workers
|
| 65 |
:param recursive: Whether to scan directories recursively
|
| 66 |
:param reader_kwargs: Additional kwargs passed to readers
|
|
|
|
| 70 |
# 1. Scan all paths to discover files
|
| 71 |
logger.info("[READ] Scanning paths: %s", input_path)
|
| 72 |
scanner = ParallelFileScanner(
|
| 73 |
+
cache_dir=working_dir,
|
| 74 |
allowed_suffix=allowed_suffix,
|
| 75 |
rescan=False,
|
| 76 |
max_workers=parallelism if parallelism > 0 else 1,
|
|
|
|
| 100 |
# 3. Create read tasks
|
| 101 |
read_tasks = []
|
| 102 |
for suffix, file_paths in files_by_suffix.items():
|
| 103 |
+
reader = _build_reader(suffix, working_dir, **reader_kwargs)
|
| 104 |
ds = reader.read(file_paths)
|
| 105 |
read_tasks.append(ds)
|
| 106 |
|