Spaces:

Corin1998
/

RAGWeatherStaytimeTripPlanner

Sleeping

App Files Community

Corin1998 committed on Sep 9, 2025

Commit

508739d

verified ·

1 Parent(s): 6689da3

Update rag/ingest.py

Browse files

Files changed (1) hide show

rag/ingest.py +39 -19

rag/ingest.py CHANGED Viewed

@@ -4,7 +4,9 @@ import csv
 from glob import glob
 from dataclasses import dataclass, asdict
 from typing import List, Dict, Any
-from .util_text import clean_text
 @dataclass
 class Doc:
@@ -13,43 +15,61 @@ class Doc:
     meta: Dict[str, Any]
 def read_blogs(path: str) -> List[Doc]:
-    docs = []
     for p in sorted(glob(os.path.join(path, "*.md"))):
         with open(p, "r", encoding="utf-8") as f:
             text = f.read()
-        docs.append(Doc(id=f"blog::{os.path.basename(p)}",text=clean_text(text),meta={"type": "blog", "source":p}))
     return docs
 def read_reviews(path: str) -> List[Doc]:
-    docs = []
-    for p in sorted(glob(os.path.join(path,"*.jsol"))):
         with open(p, "r", encoding="utf-8") as f:
             for i, line in enumerate(f):
                 try:
-                    j = json.load(line)
-                    text = clean_text(j.get("text", " "))
-                    meta = {k: j.get(k) for k in ["title", "rating", "lat", "lon", "url", "tags"]}
-                    meta.upddate({"type": "review", "source":p})
-                    docs.append(Doc(id=f"review:{os.path.basename(p)}::{i}", text=text, meta=meta))
                 except Exception:
                     continue
     return docs
 def read_events(path: str) -> List[Doc]:
-    docs = []
     for p in sorted(glob(os.path.join(path, "*.csv"))):
         with open(p, "r", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for i, row in enumerate(reader):
-                text = clean_text(" ".join([row.get("title", ""),row.get("description", "")]))
-                meta = {k: row.get(k)for k in ["title", "start", "end", "lat", "low", "url", "tags", "city"]}
-                meta.update({"type": "event",  "source":p})
                 docs.append(Doc(id=f"event::{os.path.basename(p)}::{i}", text=text, meta=meta))
     return docs
 def build_corpus(paths: Dict[str, str]) -> List[Dict[str, Any]]:
-    docs = []
-    docs += read_blogs(paths["blogs"])if os.path.isdir(paths["blogs"])else[]
-    docs += read_reviews(paths["review"])if os.path.isdir(paths["review"])else[]
-    docs += read_events(paths["events"])if os.path.isdir(paths["events"])else[]
-    return[asdict(d) for d in docs]

 from glob import glob
 from dataclasses import dataclass, asdict
 from typing import List, Dict, Any
+# 絶対インポートに変更（パッケージ解決を安定化）
+from rag.util_text import clean_text
 @dataclass
 class Doc:
     meta: Dict[str, Any]
 def read_blogs(path: str) -> List[Doc]:
+    docs: List[Doc] = []
+    if not os.path.isdir(path):
+        return docs
     for p in sorted(glob(os.path.join(path, "*.md"))):
         with open(p, "r", encoding="utf-8") as f:
             text = f.read()
+        docs.append(
+            Doc(
+                id=f"blog::{os.path.basename(p)}",
+                text=clean_text(text),
+                meta={"type": "blog", "source": p, "title": os.path.splitext(os.path.basename(p))[0]},
+            )
+        )
     return docs
 def read_reviews(path: str) -> List[Doc]:
+    docs: List[Doc] = []
+    if not os.path.isdir(path):
+        return docs
+    for p in sorted(glob(os.path.join(path, "*.jsonl"))):
         with open(p, "r", encoding="utf-8") as f:
             for i, line in enumerate(f):
                 try:
+                    j = json.loads(line)
                 except Exception:
                     continue
+                text = clean_text(j.get("text", ""))
+                meta = {
+                    k: j.get(k)
+                    for k in ["title", "rating", "lat", "lon", "url", "tags"]
+                }
+                meta.update({"type": "review", "source": p})
+                docs.append(Doc(id=f"review::{os.path.basename(p)}::{i}", text=text, meta=meta))
     return docs
 def read_events(path: str) -> List[Doc]:
+    docs: List[Doc] = []
+    if not os.path.isdir(path):
+        return docs
     for p in sorted(glob(os.path.join(path, "*.csv"))):
         with open(p, "r", encoding="utf-8") as f:
             reader = csv.DictReader(f)
             for i, row in enumerate(reader):
+                text = clean_text(" ".join([row.get("title", ""), row.get("description", "")]))
+                meta = {
+                    k: row.get(k)
+                    for k in ["title", "start", "end", "lat", "lon", "url", "tags", "city"]
+                }
+                meta.update({"type": "event", "source": p})
                 docs.append(Doc(id=f"event::{os.path.basename(p)}::{i}", text=text, meta=meta))
     return docs
 def build_corpus(paths: Dict[str, str]) -> List[Dict[str, Any]]:
+    docs: List[Doc] = []
+    docs += read_blogs(paths.get("blogs", ""))
+    docs += read_reviews(paths.get("reviews", ""))
+    docs += read_events(paths.get("events", ""))
+    return [asdict(d) for d in docs]