Spaces:

Corin1998
/

RAGWeatherStaytimeTripPlannerV2

Sleeping

App Files Files Community

Corin1998 committed on Sep 10, 2025

Commit

5373d47

verified ·

1 Parent(s): 1fd8f5d

Create retriever.py

Browse files

Files changed (1) hide show

rag/retriever.py +54 -0

rag/retriever.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from __future__ import annotations
+from typing import List, Optional
+import os, pickle
+import numpy as np
+import faiss
+from .embeddings import embed_texts
+from .util_text import clean_text
class RagStore:
    """Minimal in-memory retrieval store backed by a FAISS inner-product index.

    - ``build(docs)``: embed the documents and add them to a ``faiss.IndexFlatIP``.
    - ``search(query, k)``: embed the query and return the top-``k`` documents
      ranked by inner product.
    - ``available()``: report whether an index and at least one document are loaded.
    """

    # Index dimension used when no embeddings are available to infer it from.
    _FALLBACK_DIM = 384

    def __init__(self, index_dir: str = "data/index"):
        """Create an empty store and make sure ``index_dir`` exists on disk."""
        self.index_dir = index_dir
        self.index: Optional[faiss.Index] = None
        self.docs: List[dict] = []
        os.makedirs(index_dir, exist_ok=True)

    def available(self) -> bool:
        """Return True when an index is loaded and at least one document is stored."""
        return self.index is not None and len(self.docs) > 0

    def build(self, docs: List[dict]) -> None:
        """Embed ``docs`` and replace the current index and document list.

        Each element of ``docs`` is read through the keys ``"text"``, ``"meta"``
        and ``"id"``. Entries whose ``"text"`` is missing or empty are skipped.
        The kept ids, metadata and cleaned texts are also pickled to
        ``<index_dir>/docs.pkl``.

        Raises:
            ValueError: if the embeddings are not a 2-D array with exactly one
                row per kept text (positions in the index would no longer line
                up with ``self.docs``).
        """
        # Filter once so texts, metas and ids are guaranteed to stay aligned.
        kept = [d for d in docs if d.get("text")]
        texts = [clean_text(d.get("text", "")) for d in kept]
        metas = [d.get("meta") or {} for d in kept]
        ids = [d.get("id") for d in kept]

        # FAISS expects float32, C-contiguous input.
        # NOTE(review): assumes embed_texts returns something numpy can convert
        # to an (n, dim) array — confirm against rag/embeddings.py.
        embs = np.ascontiguousarray(embed_texts(texts), dtype=np.float32)

        if embs.size == 0:
            # Nothing to index: keep the reported width if there is one,
            # otherwise fall back to the default dimension.
            if embs.ndim == 2 and embs.shape[1] > 0:
                dim = int(embs.shape[1])
            else:
                dim = self._FALLBACK_DIM
            index = faiss.IndexFlatIP(dim)
        else:
            if embs.ndim != 2 or embs.shape[0] != len(texts):
                raise ValueError(
                    "embed_texts returned shape %r for %d texts; expected (n, dim)"
                    % (embs.shape, len(texts))
                )
            index = faiss.IndexFlatIP(int(embs.shape[1]))
            index.add(embs)

        # Swap index and docs together so a failure while persisting below
        # cannot leave a new index paired with the previous document list.
        self.index = index
        self.docs = [{"id": i, "text": t, "meta": m} for i, t, m in zip(ids, texts, metas)]

        # Optional persistence of the document payload; nothing in this class
        # reads the file back.
        with open(os.path.join(self.index_dir, "docs.pkl"), "wb") as f:
            pickle.dump({"ids": ids, "metas": metas, "texts": texts}, f)

    def search(self, query: str, k: int = 10) -> List[dict]:
        """Return up to ``k`` stored documents ranked by inner product with ``query``.

        Each hit is a shallow copy of the stored document dict with an added
        ``"score"`` key. Returns an empty list when ``k`` is not positive, when
        the store is not available, or when the query yields no embedding.
        """
        if k <= 0 or not self.available():
            return []

        qv = np.ascontiguousarray(embed_texts([clean_text(query)]), dtype=np.float32)
        if qv.size == 0:
            return []
        if qv.ndim == 1:
            # A single flat vector: present it to the index as one query row.
            qv = qv.reshape(1, -1)

        n_docs = len(self.docs)
        scores, indices = self.index.search(qv, min(k, n_docs))

        hits = []
        # Walk scores and indices in lockstep so each score stays attached to
        # the row it was computed for, even when some indices are discarded.
        for score, raw_idx in zip(scores[0], indices[0]):
            idx = int(raw_idx)
            if 0 <= idx < n_docs:
                hit = dict(self.docs[idx])  # copy so callers cannot mutate the store
                hit["score"] = float(score)
                hits.append(hit)
        return hits