File size: 2,515 Bytes
7bfce56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import os
import re
import faiss
import pickle
from typing import List, Tuple
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from readability import Document
from sentence_transformers import SentenceTransformer
from modules.utils import ensure_dirs, chunk_text

# On-disk locations for the persisted vector store.
DATA_DIR = Path("data")
INDEX_PATH = DATA_DIR / "vector_store.faiss"  # serialized FAISS index
META_PATH = DATA_DIR / "vector_store_meta.pkl"  # pickled list parallel to the index rows

# Lazily-initialized SentenceTransformer singleton; populated by _embedder().
_model = None

def _embedder():
    """Return the process-wide sentence-embedding model, loading it on first use."""
    global _model
    if _model is not None:
        return _model
    # First call pays the model-load cost; subsequent calls reuse the instance.
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model

def _load_index():
    """Load the persisted FAISS index and its metadata list.

    Returns:
        A ``(index, meta)`` tuple. When no complete persisted store exists,
        a fresh empty inner-product index and an empty metadata list are
        returned instead.
    """
    # Fix: require BOTH files. The original only checked INDEX_PATH and then
    # unconditionally opened META_PATH, raising FileNotFoundError if the
    # metadata sidecar was missing or deleted.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        with open(META_PATH, "rb") as f:
            # NOTE(review): pickle is acceptable only because this file is
            # produced by _save_index below; never load untrusted data this way.
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    # Inner-product index; vectors are normalized at encode time, so this
    # behaves as cosine similarity.
    index = faiss.IndexFlatIP(d)
    meta = []
    return index, meta

def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    with META_PATH.open("wb") as fh:
        pickle.dump(meta, fh)

def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main text.

    On any failure a best-effort ``"[ERROR] ..."`` marker string is returned
    instead of raising.
    """
    try:
        resp = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        resp.raise_for_status()
        # readability strips boilerplate down to the main article HTML.
        summary_html = Document(resp.text).summary()
        plain = BeautifulSoup(summary_html, "lxml").get_text("\n")
        # Collapse runs of blank lines into single newlines.
        return re.sub(r"\n{2,}", "\n", plain).strip()
    except Exception as e:
        return f"[ERROR] failed to fetch {url}: {e}"

def _extract_text_from_file(path: str) -> str:
    p = Path(path)
    if not p.exists():
        return ""
    if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
        return p.read_text(errors="ignore")
    # 簡易:他形式は素のバイナリ名のみ
    return f"[FILE]{p.name}"

def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Fetch/read the given sources, chunk, embed, and add them to the store.

    Args:
        file_paths: Local file paths to index (may be None or empty).
        urls: Web page URLs to index (may be None or empty).

    Returns:
        A human-readable summary of how many chunks and sources were indexed.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        # Fix: fetch failures come back as truthy "[ERROR] ..." marker
        # strings; previously these were embedded and indexed as if they
        # were real page content. Skip them.
        if text and not text.startswith("[ERROR]"):
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    added = 0
    for src, text in docs:
        chunks = list(chunk_text(text, 600))
        if not chunks:
            continue
        # Encode all chunks of a document in one batched call — one model
        # invocation per document instead of per chunk; vectors are identical.
        vecs = emb.encode(chunks, normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": c} for c in chunks)
        added += len(chunks)

    _save_index(index, meta)
    return f"Indexed {added} chunks from {len(docs)} sources."