github-actions[bot] committed on
Commit
3f86ed0
·
1 Parent(s): 283e483

Auto-sync from demo at Fri Nov 7 11:01:47 UTC 2025

Browse files
Files changed (1) hide show
  1. graphgen/operators/read/read_files.py +36 -13
graphgen/operators/read/read_files.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  from graphgen.models import (
2
  CSVReader,
3
  JSONLReader,
@@ -8,6 +11,7 @@ from graphgen.models import (
8
  RDFReader,
9
  TXTReader,
10
  )
 
11
 
12
  _MAPPING = {
13
  "jsonl": JSONLReader,
@@ -23,17 +27,36 @@ _MAPPING = {
23
  }
24
 
25
 
 
 
 
 
 
 
 
26
  def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]:
27
- suffix = file_path.split(".")[-1].lower()
28
- if suffix == "pdf":
29
- if cache_dir is not None:
30
- reader = _MAPPING[suffix](output_dir=cache_dir)
31
- else:
32
- reader = _MAPPING[suffix]()
33
- elif suffix in _MAPPING:
34
- reader = _MAPPING[suffix]()
35
- else:
36
- raise ValueError(
37
- f"Unsupported file format: {suffix}. Supported formats are: {list(_MAPPING.keys())}"
38
- )
39
- return reader.read(file_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Any, Dict, List
3
+
4
  from graphgen.models import (
5
  CSVReader,
6
  JSONLReader,
 
11
  RDFReader,
12
  TXTReader,
13
  )
14
+ from graphgen.utils import logger
15
 
16
  _MAPPING = {
17
  "jsonl": JSONLReader,
 
27
  }
28
 
29
 
30
def _build_reader(suffix: str, cache_dir: str | None):
    """Instantiate the reader registered in ``_MAPPING`` for a file suffix.

    Args:
        suffix: File extension without the leading dot. It is lowercased
            before being looked up in ``_MAPPING``.
        cache_dir: Forwarded as ``output_dir`` to the ``"pdf"`` reader when
            it is not ``None``; ignored for every other suffix.

    Returns:
        A newly constructed reader instance for ``suffix``.

    Raises:
        ValueError: If ``suffix`` has no entry in ``_MAPPING``.
    """
    suffix = suffix.lower()
    reader_cls = _MAPPING.get(suffix)
    if reader_cls is None:
        # Report the unsupported extension explicitly (with the list of
        # supported ones) rather than leaking a bare KeyError from the
        # mapping lookup.
        raise ValueError(
            f"Unsupported file format: {suffix}. Supported formats are: {list(_MAPPING.keys())}"
        )
    if suffix == "pdf" and cache_dir is not None:
        return reader_cls(output_dir=cache_dir)
    return reader_cls()
35
+
36
+
37
  def read_files(file_path: str, cache_dir: str | None = None) -> list[dict]:
38
+ path = Path(file_path).expanduser()
39
+ if not path.exists():
40
+ raise FileNotFoundError(f"input_path not found: {file_path}")
41
+
42
+ if path.is_file():
43
+ suffix = path.suffix.lstrip(".")
44
+ reader = _build_reader(suffix, cache_dir)
45
+ return reader.read(str(path))
46
+
47
+ support_suffix = set(_MAPPING.keys())
48
+ files_to_read = [
49
+ p for p in path.rglob("*") if p.suffix.lstrip(".").lower() in support_suffix
50
+ ]
51
+ logger.info("Found %d file(s) under folder %s", len(files_to_read), file_path)
52
+
53
+ all_docs: List[Dict[str, Any]] = []
54
+ for p in files_to_read:
55
+ try:
56
+ suffix = p.suffix.lstrip(".")
57
+ reader = _build_reader(suffix, cache_dir)
58
+ all_docs.extend(reader.read(str(p)))
59
+ except Exception as e: # pylint: disable=broad-except
60
+ logger.exception("Error reading %s: %s", p, e)
61
+
62
+ return all_docs