Create rag_indexer.py
modules/rag_indexer.py
ADDED
@@ -0,0 +1,86 @@
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text

DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"
META_PATH = DATA_DIR / "vector_store_meta.pkl"

_model = None

def _embedder():
    global _model
    if _model is None:
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model

def _load_index():
    if INDEX_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    index = faiss.IndexFlatIP(d)  # inner product over normalized vectors = cosine similarity
    meta = []
    return index, meta

def _save_index(index, meta):
    faiss.write_index(index, str(INDEX_PATH))
    with open(META_PATH, "wb") as f:
        pickle.dump(meta, f)

def _extract_text_from_url(url: str) -> str:
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        doc = Document(r.text)
        html = doc.summary()  # readability: main-content HTML only
        soup = BeautifulSoup(html, "lxml")
        text = soup.get_text("\n")
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception as e:
        return f"[ERROR] failed to fetch {url}: {e}"

def _extract_text_from_file(path: str) -> str:
    p = Path(path)
    if not p.exists():
        return ""
    if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
        return p.read_text(errors="ignore")
    # Simplified: for other formats, store only the file name
    return f"[FILE]{p.name}"

def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    added = 0
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            vec = emb.encode([chunk], normalize_embeddings=True)
            index.add(vec)
            meta.append({"source": src, "text": chunk})
            added += 1

    _save_index(index, meta)
    return f"Indexed {added} chunks from {len(docs)} sources."
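
This commit only covers the indexing side; ensure_dirs and chunk_text come from modules/utils.py, which is not part of this diff, and no retrieval function is shown. A minimal query-side sketch, assuming the helpers above are importable from modules.rag_indexer (the search function name and its k parameter are hypothetical, not from this commit):

# Hypothetical query-side sketch, not part of this commit. Because the index
# is IndexFlatIP and vectors are added with normalize_embeddings=True, the
# inner-product scores returned by FAISS are cosine similarities.
from modules.rag_indexer import _embedder, _load_index, index_files_and_urls

def search(query: str, k: int = 5):
    index, meta = _load_index()
    if index.ntotal == 0:
        return []  # nothing indexed yet
    vec = _embedder().encode([query], normalize_embeddings=True)
    scores, ids = index.search(vec, min(k, index.ntotal))
    # Map FAISS row ids back to the parallel metadata list kept by the indexer.
    return [{"score": float(s), **meta[i]} for s, i in zip(scores[0], ids[0]) if i != -1]

# Example: index a page, then query it.
# print(index_files_and_urls([], ["https://example.com"]))
# print(search("What is this page about?")[:3])

Note the design choice this relies on: the indexer appends to meta in the same order it calls index.add, so FAISS row ids stay aligned with the metadata list as long as both files are saved and loaded together.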