# Vector-store indexing module: fetches URLs / reads local files, embeds
# text with sentence-transformers, and persists a FAISS index under data/.
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text
# On-disk layout of the persisted vector store.
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"  # FAISS index file (the vectors)
META_PATH = DATA_DIR / "vector_store_meta.pkl"  # pickled list parallel to the index rows
# Process-wide SentenceTransformer instance, populated lazily by _embedder().
_model = None
def _embedder():
    """Return the shared SentenceTransformer, creating it on first use."""
    global _model
    if _model is not None:
        return _model
    # First call pays the model-load cost; every later call reuses the instance.
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
def _load_index():
    """Load the persisted FAISS index and its metadata list from disk.

    Returns:
        (index, meta): a ``faiss.IndexFlatIP`` and a parallel list of
        ``{"source", "text"}`` dicts, one entry per stored vector.
        A fresh empty pair is returned when no complete store exists.
    """
    # BUG FIX: require BOTH files. Previously only INDEX_PATH was checked,
    # so a missing/deleted META_PATH crashed with FileNotFoundError and an
    # index without its metadata would desynchronize vector ids anyway.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "rb") as f:
            # NOTE(review): pickle.load is only safe because data/ is a
            # local cache this module itself writes; never point META_PATH
            # at untrusted input.
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    return faiss.IndexFlatIP(d), []
def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    with META_PATH.open("wb") as fh:
        pickle.dump(meta, fh)
def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text, or "" on failure.

    Readability strips boilerplate from the page; the remaining HTML is
    flattened to plain text with runs of blank lines collapsed.
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = Document(r.text).summary()
        text = BeautifulSoup(html, "lxml").get_text("\n")
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception as e:
        # BUG FIX: the old code returned a truthy "[ERROR] ..." string on
        # failure, which the caller's `if text:` guard accepted — so error
        # messages got embedded and indexed as if they were page content.
        # Report the failure and return "" so the URL is skipped instead.
        print(f"[WARN] failed to fetch {url}: {e}")
        return ""
def _extract_text_from_file(path: str) -> str:
p = Path(path)
if not p.exists():
return ""
if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
return p.read_text(errors="ignore")
# 簡易:他形式は素のバイナリ名のみ
return f"[FILE]{p.name}"
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Embed the given files and URLs and add them to the FAISS store.

    Args:
        file_paths: local paths to read and index (may be None or empty).
        urls: web pages to fetch and index (may be None or empty).

    Returns:
        A human-readable summary of how many chunks were added.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    # Collect (source, text) pairs — URLs first, then files — dropping any
    # source that produced no text.
    candidates = [(u, _extract_text_from_url(u)) for u in (urls or [])]
    candidates += [(fp, _extract_text_from_file(fp)) for fp in (file_paths or [])]
    docs = [(src, text) for src, text in candidates if text]

    added = 0
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            # normalize_embeddings=True makes inner product on IndexFlatIP
            # behave as cosine similarity.
            vec = emb.encode([chunk], normalize_embeddings=True)
            index.add(vec)
            meta.append({"source": src, "text": chunk})
            added += 1

    _save_index(index, meta)
    return f"Indexed {added} chunks from {len(docs)} sources."