# NOTE(review): the three lines below were Hugging Face Spaces page residue
# ("Spaces:" / "Runtime error" x2), not Python code — commented out so the
# file parses; safe to delete.
| import os | |
| import re | |
| import faiss | |
| import pickle | |
| from typing import List, Tuple | |
| from pathlib import Path | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from readability import Document | |
| from sentence_transformers import SentenceTransformer | |
| from modules.utils import ensure_dirs, chunk_text | |
# On-disk locations for the persisted vector store.
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"    # FAISS index file
META_PATH = DATA_DIR / "vector_store_meta.pkl"  # pickled list of {"source", "text"} dicts, parallel to index rows

# Lazily-initialized SentenceTransformer singleton (see _embedder()).
_model = None
def _embedder():
    """Return the shared SentenceTransformer model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
def _load_index():
    """Load the persisted FAISS index and its metadata, or create empty ones.

    Returns:
        (index, meta): a ``faiss.IndexFlatIP`` and a list of
        ``{"source": str, "text": str}`` dicts, one per index row.
    """
    # FIX: require BOTH files. The original only checked INDEX_PATH, so a
    # half-written state (index present, meta pickle missing) crashed with an
    # unhandled FileNotFoundError; now it falls back to a fresh empty store.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "rb") as f:
            # NOTE(review): pickle.load on a local cache file — acceptable only
            # because the data dir is produced by this app; never unpickle
            # untrusted input.
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    index = faiss.IndexFlatIP(d)
    return index, []
def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    # Path.write_bytes(pickle.dumps(...)) produces the same bytes as
    # pickle.dump into an opened file handle.
    META_PATH.write_bytes(pickle.dumps(meta))
def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text, or "" on failure.

    Uses readability's Document to isolate the article body, strips the HTML
    with BeautifulSoup, then collapses runs of blank lines.
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = Document(r.text).summary()
        text = BeautifulSoup(html, "lxml").get_text("\n")
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception as e:
        # BUG FIX: the original returned a truthy "[ERROR] ..." string here,
        # which the caller's `if text:` check then embedded and indexed as if
        # it were document content. Return "" so failed fetches are skipped;
        # log the failure instead of swallowing it silently.
        print(f"[WARN] failed to fetch {url}: {e}")
        return ""
| def _extract_text_from_file(path: str) -> str: | |
| p = Path(path) | |
| if not p.exists(): | |
| return "" | |
| if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]: | |
| return p.read_text(errors="ignore") | |
| # 簡易:他形式は素のバイナリ名のみ | |
| return f"[FILE]{p.name}" | |
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Fetch, chunk, embed, and index the given local files and URLs.

    Args:
        file_paths: local paths to index (may be None or empty).
        urls: web pages to fetch and index (may be None or empty).

    Returns:
        A human-readable summary of how many chunks were added.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    # Gather (source, text) pairs; falsy extraction results are skipped.
    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    # PERF FIX: the original called emb.encode([chunk]) once per chunk — one
    # model forward pass each. Collect every chunk first and encode them as a
    # single batch, then add to the index in one call.
    chunks: List[Tuple[str, str]] = []
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            chunks.append((src, chunk))

    if chunks:
        vecs = emb.encode([c for _, c in chunks], normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": chunk} for src, chunk in chunks)

    _save_index(index, meta)
    return f"Indexed {len(chunks)} chunks from {len(docs)} sources."