Spaces:
Sleeping
Sleeping
Update modules/rag_retiever.py
Browse files- modules/rag_retiever.py +10 -53
modules/rag_retiever.py
CHANGED
|
@@ -1,42 +1,32 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import time
|
| 4 |
from pathlib import Path
|
| 5 |
-
from typing import List, Tuple, Dict, Any
|
| 6 |
|
| 7 |
import numpy as np
|
| 8 |
|
| 9 |
-
# 依存は遅延ロード(Space 起動を速く&環境差での ImportError を回避)
|
| 10 |
def _lazy_imports():
|
| 11 |
from sentence_transformers import SentenceTransformer
|
| 12 |
-
import numpy as _np # noqa
|
| 13 |
return SentenceTransformer
|
| 14 |
|
| 15 |
-
# 内部ユーティリティ
|
| 16 |
def _now() -> int:
|
| 17 |
return int(time.time())
|
| 18 |
|
| 19 |
-
# === ストレージ場所は utils の auto-pick に従う ===
|
| 20 |
try:
|
| 21 |
from modules.utils import ensure_dirs, data_dir
|
| 22 |
except Exception:
|
| 23 |
-
# 非常時フォールバック
|
| 24 |
def ensure_dirs() -> None:
|
| 25 |
Path("/tmp/agent_studio").mkdir(parents=True, exist_ok=True)
|
| 26 |
def data_dir() -> Path:
|
| 27 |
ensure_dirs()
|
| 28 |
return Path("/tmp/agent_studio")
|
| 29 |
|
| 30 |
-
# ========= チャンク読み込み =========
|
| 31 |
def _chunks_path() -> Path:
|
| 32 |
return data_dir() / "chunks.jsonl"
|
| 33 |
|
| 34 |
def _load_chunks() -> List[Dict[str, Any]]:
|
| 35 |
-
"""
|
| 36 |
-
rag_indexer が書き出した想定の簡易フォーマット:
|
| 37 |
-
each line: {"text": "...", "source": "path_or_url", "meta": {...}}
|
| 38 |
-
無ければ空リスト。
|
| 39 |
-
"""
|
| 40 |
p = _chunks_path()
|
| 41 |
if not p.exists():
|
| 42 |
return []
|
|
@@ -54,9 +44,7 @@ def _load_chunks() -> List[Dict[str, Any]]:
|
|
| 54 |
continue
|
| 55 |
return rows
|
| 56 |
|
| 57 |
-
# ========= Embedding モデルとキャッシュ =========
|
| 58 |
def _emb_model_name() -> str:
|
| 59 |
-
# indexer と揃える前提。未指定なら軽量モデル
|
| 60 |
return os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 61 |
|
| 62 |
def _emb_cache_dir() -> Path:
|
|
@@ -67,12 +55,6 @@ def _emb_cache_paths() -> Tuple[Path, Path]:
|
|
| 67 |
return d / "embeddings.npy", d / "meta.json"
|
| 68 |
|
| 69 |
def _load_or_build_embeddings(chunks: List[Dict[str, Any]]) -> Tuple[np.ndarray, List[int]]:
|
| 70 |
-
"""
|
| 71 |
-
- 既存キャッシュ(embeddings.npy, meta.json)が chunks と一致すればそれをロード
|
| 72 |
-
- 不一致または欠損なら再計算して保存
|
| 73 |
-
Returns:
|
| 74 |
-
(emb_matrix: np.ndarray [N, D], indices: List[int] mapping to chunks)
|
| 75 |
-
"""
|
| 76 |
ensure_dirs()
|
| 77 |
_emb_cache_dir().mkdir(parents=True, exist_ok=True)
|
| 78 |
npy_path, meta_path = _emb_cache_paths()
|
|
@@ -81,55 +63,36 @@ def _load_or_build_embeddings(chunks: List[Dict[str, Any]]) -> Tuple[np.ndarray,
|
|
| 81 |
try:
|
| 82 |
with open(meta_path, "r", encoding="utf-8") as f:
|
| 83 |
meta = json.load(f)
|
| 84 |
-
|
| 85 |
-
if N_meta == len(chunks) and meta.get("model") == _emb_model_name():
|
| 86 |
emb = np.load(npy_path)
|
| 87 |
-
idx = list(range(len(chunks)))
|
| 88 |
-
# 次元不整合チェック
|
| 89 |
if emb.shape[0] == len(chunks):
|
| 90 |
-
return emb,
|
| 91 |
except Exception:
|
| 92 |
-
pass
|
| 93 |
|
| 94 |
-
# 再計算
|
| 95 |
SentenceTransformer = _lazy_imports()
|
| 96 |
model = SentenceTransformer(_emb_model_name())
|
| 97 |
texts = [str(c.get("text", "")) for c in chunks]
|
| 98 |
if not texts:
|
| 99 |
return np.zeros((0, 384), dtype="float32"), []
|
| 100 |
-
emb = model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
|
| 101 |
np.save(npy_path, emb)
|
| 102 |
with open(meta_path, "w", encoding="utf-8") as f:
|
| 103 |
json.dump({"n": len(chunks), "model": _emb_model_name(), "ts": _now()}, f)
|
| 104 |
return emb, list(range(len(chunks)))
|
| 105 |
|
| 106 |
-
# ========= 類似度計算 =========
|
| 107 |
def _cosine_topk(matrix: np.ndarray, query_vec: np.ndarray, top_k: int) -> List[int]:
|
| 108 |
-
"""
|
| 109 |
-
行列: [N, D](正規化済み想定), query: [D]
|
| 110 |
-
返り値: 上位インデックス
|
| 111 |
-
"""
|
| 112 |
if matrix.size == 0:
|
| 113 |
return []
|
| 114 |
-
# dot がそのまま cos 類似度(normalize_embeddings=True を前提)
|
| 115 |
sims = matrix @ query_vec
|
| 116 |
-
# np.argpartition で高速 topk
|
| 117 |
k = min(top_k, matrix.shape[0])
|
| 118 |
part = np.argpartition(-sims, k - 1)[:k]
|
| 119 |
-
# 類似度で並べ替え
|
| 120 |
part_sorted = part[np.argsort(-sims[part])]
|
| 121 |
return part_sorted.tolist()
|
| 122 |
|
| 123 |
-
# ========= 公開 API =========
|
| 124 |
def retrieve_contexts(query: str, top_k: int = 5) -> List[str]:
|
| 125 |
-
"""
|
| 126 |
-
クエリに対して、保存済みのチャンク(chunks.jsonl)から上位コンテキストを返す。
|
| 127 |
-
- 埋め込みは emb_cache にキャッシュ
|
| 128 |
-
- モデル: EMBEDDING_MODEL(未指定時 all-MiniLM-L6-v2)
|
| 129 |
-
"""
|
| 130 |
chunks = _load_chunks()
|
| 131 |
if not chunks:
|
| 132 |
-
# インデックス未構築
|
| 133 |
return []
|
| 134 |
|
| 135 |
SentenceTransformer = _lazy_imports()
|
|
@@ -143,14 +106,8 @@ def retrieve_contexts(query: str, top_k: int = 5) -> List[str]:
|
|
| 143 |
top_idx = _cosine_topk(emb_matrix, q_vec, top_k)
|
| 144 |
results: List[str] = []
|
| 145 |
for i in top_idx:
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
if src:
|
| 151 |
-
results.append(f"{txt}\n[source] {src}")
|
| 152 |
-
else:
|
| 153 |
-
results.append(txt)
|
| 154 |
-
except Exception:
|
| 155 |
-
continue
|
| 156 |
return results
|
|
|
|
| 1 |
+
# modules/rag_retriever.py
|
| 2 |
import os
|
| 3 |
import json
|
| 4 |
import time
|
| 5 |
from pathlib import Path
|
| 6 |
+
from typing import List, Tuple, Dict, Any
|
| 7 |
|
| 8 |
import numpy as np
|
| 9 |
|
|
|
|
| 10 |
def _lazy_imports():
    """Defer the sentence-transformers import until first use.

    Keeps Space startup fast and avoids a module-load-time ImportError
    in environments where the package is missing.
    """
    from sentence_transformers import SentenceTransformer as _ST
    return _ST
|
| 13 |
|
|
|
|
| 14 |
def _now() -> int:
|
| 15 |
return int(time.time())
|
| 16 |
|
|
|
|
| 17 |
# Storage location follows the auto-pick logic in modules.utils; when that
# module is unavailable, fall back to a fixed /tmp directory.
try:
    from modules.utils import ensure_dirs, data_dir
except Exception:
    _FALLBACK_DIR = Path("/tmp/agent_studio")

    def ensure_dirs() -> None:
        """Create the fallback storage directory if it does not exist."""
        _FALLBACK_DIR.mkdir(parents=True, exist_ok=True)

    def data_dir() -> Path:
        """Return the fallback storage directory, creating it first."""
        ensure_dirs()
        return _FALLBACK_DIR
|
| 25 |
|
|
|
|
| 26 |
def _chunks_path() -> Path:
    """Location of the chunk store written by the indexer."""
    base = data_dir()
    return base / "chunks.jsonl"
|
| 28 |
|
| 29 |
def _load_chunks() -> List[Dict[str, Any]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
p = _chunks_path()
|
| 31 |
if not p.exists():
|
| 32 |
return []
|
|
|
|
| 44 |
continue
|
| 45 |
return rows
|
| 46 |
|
|
|
|
| 47 |
def _emb_model_name() -> str:
|
|
|
|
| 48 |
return os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
|
| 49 |
|
| 50 |
def _emb_cache_dir() -> Path:
|
|
|
|
| 55 |
return d / "embeddings.npy", d / "meta.json"
|
| 56 |
|
| 57 |
def _load_or_build_embeddings(chunks: List[Dict[str, Any]]) -> Tuple[np.ndarray, List[int]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
ensure_dirs()
|
| 59 |
_emb_cache_dir().mkdir(parents=True, exist_ok=True)
|
| 60 |
npy_path, meta_path = _emb_cache_paths()
|
|
|
|
| 63 |
try:
|
| 64 |
with open(meta_path, "r", encoding="utf-8") as f:
|
| 65 |
meta = json.load(f)
|
| 66 |
+
if int(meta.get("n", -1)) == len(chunks) and meta.get("model") == _emb_model_name():
|
|
|
|
| 67 |
emb = np.load(npy_path)
|
|
|
|
|
|
|
| 68 |
if emb.shape[0] == len(chunks):
|
| 69 |
+
return emb, list(range(len(chunks)))
|
| 70 |
except Exception:
|
| 71 |
+
pass
|
| 72 |
|
|
|
|
| 73 |
SentenceTransformer = _lazy_imports()
|
| 74 |
model = SentenceTransformer(_emb_model_name())
|
| 75 |
texts = [str(c.get("text", "")) for c in chunks]
|
| 76 |
if not texts:
|
| 77 |
return np.zeros((0, 384), dtype="float32"), []
|
| 78 |
+
emb = model.encode(texts, normalize_embeddings=True, convert_to_numpy=True)
|
| 79 |
np.save(npy_path, emb)
|
| 80 |
with open(meta_path, "w", encoding="utf-8") as f:
|
| 81 |
json.dump({"n": len(chunks), "model": _emb_model_name(), "ts": _now()}, f)
|
| 82 |
return emb, list(range(len(chunks)))
|
| 83 |
|
|
|
|
| 84 |
def _cosine_topk(matrix: np.ndarray, query_vec: np.ndarray, top_k: int) -> List[int]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
if matrix.size == 0:
|
| 86 |
return []
|
|
|
|
| 87 |
sims = matrix @ query_vec
|
|
|
|
| 88 |
k = min(top_k, matrix.shape[0])
|
| 89 |
part = np.argpartition(-sims, k - 1)[:k]
|
|
|
|
| 90 |
part_sorted = part[np.argsort(-sims[part])]
|
| 91 |
return part_sorted.tolist()
|
| 92 |
|
|
|
|
| 93 |
def retrieve_contexts(query: str, top_k: int = 5) -> List[str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
chunks = _load_chunks()
|
| 95 |
if not chunks:
|
|
|
|
| 96 |
return []
|
| 97 |
|
| 98 |
SentenceTransformer = _lazy_imports()
|
|
|
|
| 106 |
top_idx = _cosine_topk(emb_matrix, q_vec, top_k)
|
| 107 |
results: List[str] = []
|
| 108 |
for i in top_idx:
|
| 109 |
+
ch = chunks[idx_map[i]]
|
| 110 |
+
txt = str(ch.get("text", "")).strip()
|
| 111 |
+
src = ch.get("source")
|
| 112 |
+
results.append(f"{txt}\n[source] {src}" if src else txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
return results
|