# Agent_studio/modules/rag_indexer.py
# Origin: Corin1998 — "Create rag_indexer.py" (commit 7bfce56, verified)
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"
META_PATH = DATA_DIR / "vector_store_meta.pkl"
_model = None
def _embedder():
    """Lazily construct and cache the shared sentence-embedding model.

    Returns:
        The module-wide SentenceTransformer instance (all-MiniLM-L6-v2),
        created on first call and reused afterwards.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
def _load_index():
    """Load the persisted FAISS index and metadata, or create an empty pair.

    Returns:
        (index, meta): a FAISS inner-product index and a parallel list of
        ``{"source": ..., "text": ...}`` dicts, one entry per vector.

    Both files must exist to be loaded: a lone index file without its
    metadata (or vice versa) previously crashed on ``open(META_PATH)`` and
    could leave vectors and metadata out of sync, so such a state is now
    treated as "no store yet".
    """
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE(review): pickle is only safe because we wrote this cache file
        # ourselves — never point META_PATH at untrusted data.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    return faiss.IndexFlatIP(d), []
def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_bytes(pickle.dumps(meta))
def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text, or "" on failure.

    The page is reduced to its main article body via readability, then
    stripped to plain text and collapsed to single newlines.

    Returning "" (falsy) on failure lets callers skip the source; the
    previous behavior returned a truthy "[ERROR] ..." string, which the
    indexer then embedded and stored as if it were document content.
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = Document(r.text).summary()
        text = BeautifulSoup(html, "lxml").get_text("\n")
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception:
        # Broad catch is deliberate: network, HTTP, and parse failures
        # should all degrade to "skip this URL", never crash indexing.
        return ""
def _extract_text_from_file(path: str) -> str:
p = Path(path)
if not p.exists():
return ""
if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
return p.read_text(errors="ignore")
# 簡易:他形式は素のバイナリ名のみ
return f"[FILE]{p.name}"
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Chunk, embed, and add the given files and URLs to the vector store.

    Args:
        file_paths: local file paths to ingest (None or empty is fine).
        urls: web pages to fetch and ingest (None or empty is fine).

    Returns:
        A human-readable summary: "Indexed N chunks from M sources."

    Sources whose extraction yields falsy text are skipped. All chunks are
    embedded in a single ``encode`` call — batching is far faster than the
    previous one-call-per-chunk loop and produces the same vectors
    (``normalize_embeddings=True`` normalizes each row independently).
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    # Flatten every source into (source, chunk) pairs before embedding.
    chunks: List[Tuple[str, str]] = []
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            chunks.append((src, chunk))

    if chunks:
        vecs = emb.encode([c for _, c in chunks], normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": chunk} for src, chunk in chunks)

    _save_index(index, meta)
    return f"Indexed {len(chunks)} chunks from {len(docs)} sources."