Corin1998 committed on
Commit
7bfce56
·
verified ·
1 Parent(s): 43c4b04

Create rag_indexer.py

Browse files
Files changed (1) hide show
  1. modules/rag_indexer.py +86 -0
modules/rag_indexer.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import faiss
4
+ import pickle
5
+ from typing import List, Tuple
6
+ from pathlib import Path
7
+ import requests
8
+ from bs4 import BeautifulSoup
9
+ from readability import Document
10
+ from sentence_transformers import SentenceTransformer
11
+ from modules.utils import ensure_dirs, chunk_text
12
+
13
+ DATA_DIR = Path("data")
14
+ INDEX_PATH = DATA_DIR / "vector_store.faiss"
15
+ META_PATH = DATA_DIR / "vector_store_meta.pkl"
16
+
17
+ _model = None
18
+
19
def _embedder():
    """Return the process-wide SentenceTransformer, loading it lazily on first call."""
    global _model
    if _model is not None:
        return _model
    # Deferred construction: downloading/loading the model is expensive, so
    # importing this module stays cheap until an embedding is actually needed.
    _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    return _model
24
+
25
def _load_index():
    """Load the persisted FAISS index and its parallel metadata list.

    Returns:
        tuple: ``(index, meta)`` where ``index`` is a ``faiss.Index`` and
        ``meta`` is a list of ``{"source": ..., "text": ...}`` dicts, one per
        indexed chunk. When no usable store exists on disk, returns a fresh
        empty inner-product index of dimension 384 (all-MiniLM-L6-v2) and an
        empty list.
    """
    # Bug fix: require BOTH files. Previously only INDEX_PATH was checked, so
    # an index file without its meta pickle crashed on open(META_PATH) instead
    # of falling back to a fresh store.
    if INDEX_PATH.exists() and META_PATH.exists():
        index = faiss.read_index(str(INDEX_PATH))
        # NOTE(review): pickle.load is only safe because this file is written
        # by _save_index below; never point META_PATH at untrusted data.
        with open(META_PATH, "rb") as f:
            meta = pickle.load(f)
        return index, meta
    d = 384  # embedding dimension of all-MiniLM-L6-v2
    index = faiss.IndexFlatIP(d)
    meta = []
    return index, meta
35
+
36
def _save_index(index, meta):
    """Persist the FAISS index and its parallel metadata list to disk."""
    faiss.write_index(index, str(INDEX_PATH))
    # Serialize the metadata in one shot rather than streaming to a handle.
    META_PATH.write_bytes(pickle.dumps(meta))
40
+
41
def _extract_text_from_url(url: str) -> str:
    """Fetch *url* and return its readable main-article text.

    Returns:
        The extracted text with runs of blank lines collapsed, or "" on any
        failure (network error, bad status, parse error) so the caller's
        truthiness check skips the source.
    """
    try:
        r = requests.get(url, timeout=20, headers={"User-Agent": "Mozilla/5.0"})
        r.raise_for_status()
        # readability isolates the main article body, dropping nav/boilerplate.
        doc = Document(r.text)
        html = doc.summary()
        soup = BeautifulSoup(html, "lxml")
        text = soup.get_text("\n")
        # Collapse the blank-line runs left behind by tag stripping.
        return re.sub(r"\n{2,}", "\n", text).strip()
    except Exception:
        # Bug fix: previously returned a truthy "[ERROR] failed to fetch ..."
        # string, which index_files_and_urls then embedded into the vector
        # store as if it were document content. Return "" so the URL is
        # skipped instead.
        return ""
52
+
53
def _extract_text_from_file(path: str) -> str:
    """Return the indexable text content of the file at *path*.

    Returns:
        The file's text for known plain-text extensions, "" when the file
        does not exist (so the caller skips it), or a "[FILE]<name>"
        placeholder for any other format.
    """
    p = Path(path)
    if not p.exists():
        return ""
    if p.suffix.lower() in {".txt", ".md", ".csv", ".json", ".py"}:
        # Bug fix: pin UTF-8 instead of the platform locale encoding so
        # results are identical across OSes; undecodable bytes are dropped.
        return p.read_text(encoding="utf-8", errors="ignore")
    # Simplified: other formats contribute only their file name.
    return f"[FILE]{p.name}"
61
+
62
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
    """Embed the given files and URLs and add them to the FAISS store.

    Args:
        file_paths: Local paths to read; may be None or empty.
        urls: Web pages to fetch; may be None or empty.

    Returns:
        A human-readable summary of how many chunks/sources were indexed.
    """
    ensure_dirs()
    index, meta = _load_index()
    emb = _embedder()

    # Collect (source, text) pairs, skipping sources that yielded no text.
    docs: List[Tuple[str, str]] = []
    for u in urls or []:
        text = _extract_text_from_url(u)
        if text:
            docs.append((u, text))
    for fp in file_paths or []:
        text = _extract_text_from_file(fp)
        if text:
            docs.append((fp, text))

    # Perf fix: chunk everything first, then embed in ONE batched encode call
    # instead of one model forward pass per chunk.
    chunks: List[Tuple[str, str]] = []
    for src, text in docs:
        for chunk in chunk_text(text, 600):
            chunks.append((src, chunk))

    if chunks:
        vecs = emb.encode([c for _, c in chunks], normalize_embeddings=True)
        index.add(vecs)
        meta.extend({"source": src, "text": chunk} for src, chunk in chunks)

    _save_index(index, meta)
    return f"Indexed {len(chunks)} chunks from {len(docs)} sources."