# Agent_studio/modules/rag_indexer.py
# Origin: Corin1998 — "Create rag_indexer.py" (commit 7bfce56, verified)
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"
META_PATH = DATA_DIR / "vector_store_meta.pkl"
_model = None
def _embedder():
    """Lazily construct and cache the shared sentence-embedding model.

    Returns:
        The module-wide SentenceTransformer instance (all-MiniLM-L6-v2),
        created on first call and reused afterwards.
    """
    global _model
    if _model is not None:
        return _model
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
def _load_index():
    """Load the persisted FAISS index and metadata, or create an empty pair.

    Returns:
        (index, meta): a FAISS inner-product index and a parallel list of
        ``{"source": ..., "text": ...}`` dicts, one entry per vector.

    Both files must exist to be loaded: a lone index file without its
    metadata (or vice versa) previously crashed on ``open(META_PATH)`` and
    could leave vectors and metadata out of sync, so such a state is now
    treated as "no store yet".
    """
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE(review): pickle is only safe because we wrote this cache file
        # ourselves — never point META_PATH at untrusted data.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    return faiss.IndexFlatIP(d), []
def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    META_PATH.write_bytes(pickle.dumps(meta))
def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text, or "" on failure.

    The page is reduced to its main article body via readability, then
    stripped to plain text and collapsed to single newlines.

    Returning "" (falsy) on failure lets callers skip the source; the
    previous behavior returned a truthy "[ERROR] ..." string, which the
    indexer then embedded and stored as if it were document content.
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        html = Document(r.text).summary()
        text = BeautifulSoup(html, "lxml").get_text("\n")
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception:
        # Broad catch is deliberate: network, HTTP, and parse failures
        # should all degrade to "skip this URL", never crash indexing.
        return ""
def _extract_text_from_file(path: str) -> str:
p = Path(path)
if not p.exists():
return ""
if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
return p.read_text(errors="ignore")
# 簡易:他形式は素のバイナリ名のみ
return f"[FILE]{p.name}"
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Chunk, embed, and add the given files and URLs to the vector store.

    Args:
        file_paths: local file paths to ingest (None or empty is fine).
        urls: web pages to fetch and ingest (None or empty is fine).

    Returns:
        A human-readable summary: "Indexed N chunks from M sources."

    Sources whose extraction yields falsy text are skipped. All chunks are
    embedded in a single ``encode`` call — batching is far faster than the
    previous one-call-per-chunk loop and produces the same vectors
    (``normalize_embeddings=True`` normalizes each row independently).
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    # Flatten every source into (source, chunk) pairs before embedding.
    chunks: List[Tuple[str, str]] = []
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            chunks.append((src, chunk))

    if chunks:
        vecs = emb.encode([c for _, c in chunks], normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": chunk} for src, chunk in chunks)

    _save_index(index, meta)
    return f"Indexed {len(chunks)} chunks from {len(docs)} sources."