Spaces:
Running
Running
github-actions[bot]
committed on
Commit
·
3f86ed0
1
Parent(s):
283e483
Auto-sync from demo at Fri Nov 7 11:01:47 UTC 2025
Browse files
graphgen/operators/read/read_files.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from graphgen.models import (
|
| 2 |
CSVReader,
|
| 3 |
JSONLReader,
|
|
@@ -8,6 +11,7 @@ from graphgen.models import (
|
|
| 8 |
RDFReader,
|
| 9 |
TXTReader,
|
| 10 |
)
|
|
|
|
| 11 |
|
| 12 |
_MAPPING = {
|
| 13 |
"jsonl": JSONLReader,
|
|
@@ -23,17 +27,36 @@ _MAPPING = {
|
|
| 23 |
}
|
| 24 |
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]:
|
| 27 |
-
|
| 28 |
-
if
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
reader
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
)
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Any, Dict, List
|
| 3 |
+
|
| 4 |
from graphgen.models import (
|
| 5 |
CSVReader,
|
| 6 |
JSONLReader,
|
|
|
|
| 11 |
RDFReader,
|
| 12 |
TXTReader,
|
| 13 |
)
|
| 14 |
+
from graphgen.utils import logger
|
| 15 |
|
| 16 |
_MAPPING = {
|
| 17 |
"jsonl": JSONLReader,
|
|
|
|
| 27 |
}
|
| 28 |
|
| 29 |
|
| 30 |
+
def _build_reader(suffix: str, cache_dir: str | None):
    """Instantiate the reader class registered for a file suffix.

    Args:
        suffix: File extension without the leading dot. It is lower-cased
            before being looked up in ``_MAPPING``.
        cache_dir: When not ``None`` and the suffix is ``"pdf"``, passed to
            the reader constructor as ``output_dir``; ignored for every
            other suffix.

    Returns:
        A new instance of the reader class mapped to ``suffix``.

    Raises:
        KeyError: If ``_MAPPING`` has no entry for ``suffix``. The message
            lists the supported suffixes.
    """
    suffix = suffix.lower()
    try:
        reader_cls = _MAPPING[suffix]
    except KeyError:
        # Keep the exception type callers already see, but say what went
        # wrong instead of surfacing a bare "KeyError: 'xyz'".
        raise KeyError(
            f"Unsupported file format: {suffix!r}. "
            f"Supported formats are: {sorted(_MAPPING)}"
        ) from None

    # Only the PDF reader is given a cache location.
    if suffix == "pdf" and cache_dir is not None:
        return reader_cls(output_dir=cache_dir)
    return reader_cls()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]:
|
| 38 |
+
path = Path(file_path).expanduser()
|
| 39 |
+
if not path.exists():
|
| 40 |
+
raise FileNotFoundError(f"input_path not found: {file_path}")
|
| 41 |
+
|
| 42 |
+
if path.is_file():
|
| 43 |
+
suffix = path.suffix.lstrip(".")
|
| 44 |
+
reader = _build_reader(suffix, cache_dir)
|
| 45 |
+
return reader.read(str(path))
|
| 46 |
+
|
| 47 |
+
support_suffix = set(_MAPPING.keys())
|
| 48 |
+
files_to_read = [
|
| 49 |
+
p for p in path.rglob("*") if p.suffix.lstrip(".").lower() in support_suffix
|
| 50 |
+
]
|
| 51 |
+
logger.info("Found %d file(s) under folder %s", len(files_to_read), file_path)
|
| 52 |
+
|
| 53 |
+
all_docs: List[Dict[str, Any]] = []
|
| 54 |
+
for p in files_to_read:
|
| 55 |
+
try:
|
| 56 |
+
suffix = p.suffix.lstrip(".")
|
| 57 |
+
reader = _build_reader(suffix, cache_dir)
|
| 58 |
+
all_docs.extend(reader.read(str(p)))
|
| 59 |
+
except Exception as e: # pylint: disable=broad-except
|
| 60 |
+
logger.exception("Error reading %s: %s", p, e)
|
| 61 |
+
|
| 62 |
+
return all_docs
|