Corin1998 committed on
Commit
508739d
·
verified ·
1 Parent(s): 6689da3

Update rag/ingest.py

Browse files
Files changed (1) hide show
  1. rag/ingest.py +39 -19
rag/ingest.py CHANGED
@@ -4,7 +4,9 @@ import csv
4
  from glob import glob
5
  from dataclasses import dataclass, asdict
6
  from typing import List, Dict, Any
7
- from .util_text import clean_text
 
 
8
 
9
  @dataclass
10
  class Doc:
@@ -13,43 +15,61 @@ class Doc:
13
  meta: Dict[str, Any]
14
 
15
  def read_blogs(path: str) -> List[Doc]:
16
- docs = []
 
 
17
  for p in sorted(glob(os.path.join(path, "*.md"))):
18
  with open(p, "r", encoding="utf-8") as f:
19
  text = f.read()
20
- docs.append(Doc(id=f"blog::{os.path.basename(p)}",text=clean_text(text),meta={"type": "blog", "source":p}))
 
 
 
 
 
 
21
  return docs
22
 
23
  def read_reviews(path: str) -> List[Doc]:
24
- docs = []
25
- for p in sorted(glob(os.path.join(path,"*.jsol"))):
 
 
26
  with open(p, "r", encoding="utf-8") as f:
27
  for i, line in enumerate(f):
28
  try:
29
- j = json.load(line)
30
- text = clean_text(j.get("text", " "))
31
- meta = {k: j.get(k) for k in ["title", "rating", "lat", "lon", "url", "tags"]}
32
- meta.upddate({"type": "review", "source":p})
33
- docs.append(Doc(id=f"review:{os.path.basename(p)}::{i}", text=text, meta=meta))
34
  except Exception:
35
  continue
 
 
 
 
 
 
 
36
  return docs
37
 
38
  def read_events(path: str) -> List[Doc]:
39
- docs = []
 
 
40
  for p in sorted(glob(os.path.join(path, "*.csv"))):
41
  with open(p, "r", encoding="utf-8") as f:
42
  reader = csv.DictReader(f)
43
  for i, row in enumerate(reader):
44
- text = clean_text(" ".join([row.get("title", ""),row.get("description", "")]))
45
- meta = {k: row.get(k)for k in ["title", "start", "end", "lat", "low", "url", "tags", "city"]}
46
- meta.update({"type": "event", "source":p})
 
 
 
47
  docs.append(Doc(id=f"event::{os.path.basename(p)}::{i}", text=text, meta=meta))
48
  return docs
49
 
50
  def build_corpus(paths: Dict[str, str]) -> List[Dict[str, Any]]:
51
- docs = []
52
- docs += read_blogs(paths["blogs"])if os.path.isdir(paths["blogs"])else[]
53
- docs += read_reviews(paths["review"])if os.path.isdir(paths["review"])else[]
54
- docs += read_events(paths["events"])if os.path.isdir(paths["events"])else[]
55
- return[asdict(d) for d in docs]
 
4
  from glob import glob
5
  from dataclasses import dataclass, asdict
6
  from typing import List, Dict, Any
7
+
8
+ # 絶対インポートに変更(パッケージ解決を安定化)
9
+ from rag.util_text import clean_text
10
 
11
  @dataclass
12
  class Doc:
 
15
  meta: Dict[str, Any]
16
 
17
def read_blogs(path: str) -> List[Doc]:
    """Load every Markdown file under *path* as one blog Doc.

    Args:
        path: Directory expected to contain ``*.md`` files.

    Returns:
        One Doc per Markdown file, in sorted filename order; an empty
        list when *path* is not an existing directory.
    """
    if not os.path.isdir(path):
        return []
    out: List[Doc] = []
    for filepath in sorted(glob(os.path.join(path, "*.md"))):
        with open(filepath, "r", encoding="utf-8") as fh:
            raw = fh.read()
        base = os.path.basename(filepath)
        meta = {
            "type": "blog",
            "source": filepath,
            # Title falls back to the filename without its extension.
            "title": os.path.splitext(base)[0],
        }
        out.append(Doc(id=f"blog::{base}", text=clean_text(raw), meta=meta))
    return out
32
 
33
def read_reviews(path: str) -> List[Doc]:
    """Load review Docs from every ``*.jsonl`` file under *path*.

    Each line is parsed as one JSON object; lines that fail to parse
    are skipped (best-effort ingestion).

    Args:
        path: Directory expected to contain ``*.jsonl`` files.

    Returns:
        One Doc per parseable JSON line; an empty list when *path* is
        not an existing directory.
    """
    if not os.path.isdir(path):
        return []
    fields = ("title", "rating", "lat", "lon", "url", "tags")
    out: List[Doc] = []
    for filepath in sorted(glob(os.path.join(path, "*.jsonl"))):
        base = os.path.basename(filepath)
        with open(filepath, "r", encoding="utf-8") as fh:
            for lineno, raw in enumerate(fh):
                try:
                    record = json.loads(raw)
                except Exception:
                    # Malformed line: skip it rather than abort the file.
                    continue
                meta = {key: record.get(key) for key in fields}
                meta.update({"type": "review", "source": filepath})
                out.append(
                    Doc(
                        id=f"review::{base}::{lineno}",
                        text=clean_text(record.get("text", "")),
                        meta=meta,
                    )
                )
    return out
52
 
53
def read_events(path: str) -> List[Doc]:
    """Load event Docs from every ``*.csv`` file under *path*.

    Each CSV row becomes one Doc whose text is the ``title`` and
    ``description`` columns joined; selected columns are copied into
    the Doc metadata.

    Args:
        path: Directory expected to contain ``*.csv`` files.

    Returns:
        One Doc per CSV row; an empty list when *path* is not an
        existing directory.
    """
    docs: List[Doc] = []
    if not os.path.isdir(path):
        return docs
    for p in sorted(glob(os.path.join(path, "*.csv"))):
        # newline="" is required by the csv module so that quoted fields
        # containing embedded line breaks are parsed correctly.
        with open(p, "r", encoding="utf-8", newline="") as f:
            reader = csv.DictReader(f)
            for i, row in enumerate(reader):
                # DictReader fills missing trailing columns with None,
                # so guard with `or ""` before joining.
                text = clean_text(
                    " ".join([row.get("title") or "", row.get("description") or ""])
                )
                meta = {
                    k: row.get(k)
                    for k in ["title", "start", "end", "lat", "lon", "url", "tags", "city"]
                }
                meta.update({"type": "event", "source": p})
                docs.append(Doc(id=f"event::{os.path.basename(p)}::{i}", text=text, meta=meta))
    return docs
69
 
70
def build_corpus(paths: Dict[str, str]) -> List[Dict[str, Any]]:
    """Collect blog, review and event Docs and return them as plain dicts.

    Args:
        paths: Mapping with optional "blogs", "reviews" and "events"
            directory paths. Missing keys (or paths that are not
            directories) simply contribute no documents.

    Returns:
        All collected Docs, serialized via ``asdict``, in blog/review/
        event order.
    """
    readers = (
        (read_blogs, "blogs"),
        (read_reviews, "reviews"),
        (read_events, "events"),
    )
    corpus: List[Doc] = []
    for reader, key in readers:
        corpus.extend(reader(paths.get(key, "")))
    return [asdict(doc) for doc in corpus]