"""Vector-store indexing: fetch/read documents, embed them, persist to FAISS."""

import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path

import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer

from modules.utils import ensure_dirs, chunk_text

# On-disk locations of the FAISS index and its parallel metadata list.
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"
META_PATH = DATA_DIR / "vector_store_meta.pkl"

# Embedding dimensionality of sentence-transformers/all-MiniLM-L6-v2.
EMBED_DIM = 384

# Plain-text formats whose content is read directly.
_TEXT_SUFFIXES = {".txt", ".md", ".csv", ".json", ".py"}

# Lazily-created process-wide singleton so the model loads at most once.
_model = None


def _embedder() -> SentenceTransformer:
    """Return the shared sentence-embedding model, loading it on first use."""
    global _model
    if _model is None:
        _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model


def _load_index():
    """Load the persisted FAISS index and metadata, or create empty ones.

    Returns:
        (index, meta): a ``faiss.IndexFlatIP`` and a parallel list of
        ``{"source": ..., "text": ...}`` dicts, one per stored vector.
    """
    # Require BOTH files: previously only INDEX_PATH was checked, so a
    # missing/deleted metadata pickle crashed instead of rebuilding, and the
    # two files would otherwise desynchronize vector ids from their chunks.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE(review): unpickling a file an attacker could replace executes
        # arbitrary code; acceptable only for a local, trusted data dir.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        return index, meta
    # Inner product over normalized embeddings == cosine similarity.
    return faiss.IndexFlatIP(EMBED_DIM), []


def _save_index(index, meta) -> None:
    """Persist the FAISS index and its metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    with open(META_PATH, "wb") as f:
        pickle.dump(meta, f)


def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text.

    Returns "" on any failure so callers simply skip the document.
    (The previous behavior of returning an "[ERROR] ..." string caused the
    error message itself to be embedded and indexed as document text.)
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        # readability isolates the main article body; strip the HTML after.
        html = Document(r.text).summary()
        text = BeautifulSoup(html, "lxml").get_text("\n")
        # Collapse runs of blank lines left over from block elements.
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception:
        return ""


def _extract_text_from_file(path: str) -> str:
    """Return the text content of *path*, or "" if it does not exist.

    Only plain-text formats are read; other formats contribute just the
    file name as a placeholder token.
    """
    p = Path(path)
    if not p.exists():
        return ""
    if p.suffix.lower() in _TEXT_SUFFIXES:
        # Explicit utf-8 instead of the platform locale default; keep
        # errors="ignore" so one bad byte can't fail a whole indexing run.
        return p.read_text(encoding="utf-8", errors="ignore")
    # Simplified handling: other formats are indexed by file name only.
    return f"[FILE]{p.name}"


def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Embed and index the contents of the given files and URLs.

    Each document is split into ~600-character chunks, embedded with
    all-MiniLM-L6-v2 (L2-normalized, so the IP index gives cosine
    similarity), and appended to the persistent FAISS store.

    Args:
        file_paths: local file paths to index (may be None/empty).
        urls: web page URLs to index (may be None/empty).

    Returns:
        Human-readable summary of how many chunks were indexed.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    added = 0
    for src, text in docs:
        chunks = list(chunk_text(text, 600))
        if not chunks:
            continue
        # Encode all of a document's chunks in one batch — one model call
        # per document instead of one per 600-char chunk.
        vecs = emb.encode(chunks, normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": c} for c in chunks)
        added += len(chunks)

    _save_index(index, meta)
    return f"Indexed {added} chunks from {len(docs)} sources."