File size: 2,515 Bytes
7bfce56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text

# On-disk locations for the persisted vector store.
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"  # serialized FAISS index
META_PATH = DATA_DIR / "vector_store_meta.pkl"  # pickled list parallel to the index rows

# Lazily-initialized SentenceTransformer singleton; populated by _embedder().
_model = None

def _embedder():
    """Return the process-wide sentence-embedding model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    # First call pays the model-load cost; subsequent calls reuse the instance.
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model

def _load_index():
    """Load the persisted FAISS index and its metadata list.

    Returns:
        A ``(index, meta)`` tuple. When no complete persisted store exists,
        a fresh empty inner-product index and an empty metadata list are
        returned instead.
    """
    # Fix: require BOTH files. The original only checked INDEX_PATH and then
    # unconditionally opened META_PATH, raising FileNotFoundError if the
    # metadata sidecar was missing or deleted.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "rb") as f:
            # NOTE(review): pickle is acceptable only because this file is
            # produced by _save_index below; never load untrusted data this way.
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    # Inner-product index; vectors are normalized at encode time, so this
    # behaves as cosine similarity.
    index = faiss.IndexFlatIP(d)
    meta = []
    return index, meta

def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    with META_PATH.open("wb") as fh:
        pickle.dump(meta, fh)

def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text.

    On any failure a best-effort ``"[ERROR] ..."`` marker string is returned
    instead of raising.
    """
    try:
        resp = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        # readability strips boilerplate down to the main article HTML.
        summary_html = Document(resp.text).summary()
        plain = BeautifulSoup(summary_html, "lxml").get_text("\n")
        # Collapse runs of blank lines into single newlines.
        return re.sub(r"\n{2,}", "\n", plain).strip()
    except Exception as e:
        return f"[ERROR] failed to fetch {url}: {e}"

def _extract_text_from_file(path: str) -> str:
    p = Path(path)
    if not p.exists():
        return ""
    if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
        return p.read_text(errors="ignore")
    # 簡易:他形式は素のバイナリ名のみ
    return f"[FILE]{p.name}"

def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Fetch/read the given sources, chunk, embed, and add them to the store.

    Args:
        file_paths: Local file paths to index (may be None or empty).
        urls: Web page URLs to index (may be None or empty).

    Returns:
        A human-readable summary of how many chunks and sources were indexed.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        # Fix: fetch failures come back as truthy "[ERROR] ..." marker
        # strings; previously these were embedded and indexed as if they
        # were real page content. Skip them.
        if text and not text.startswith("[ERROR]"):
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    added = 0
    for src, text in docs:
        chunks = list(chunk_text(text, 600))
        if not chunks:
            continue
        # Encode all chunks of a document in one batched call — one model
        # invocation per document instead of per chunk; vectors are identical.
        vecs = emb.encode(chunks, normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": c} for c in chunks)
        added += len(chunks)

    _save_index(index, meta)
    return f"Indexed {added} chunks from {len(docs)} sources."