Corin1998 committed on
Commit
24368f7
·
verified ·
1 Parent(s): 2517621

Update modules/rag_indexer.py

Browse files
Files changed (1) hide show
  1. modules/rag_indexer.py +96 -65
modules/rag_indexer.py CHANGED
@@ -1,83 +1,114 @@
1
- import re, pickle
2
- from typing import List
3
- from pathlib import Path
4
  import requests
 
 
 
5
  from bs4 import BeautifulSoup
6
- from readability import Document
7
- import faiss
8
- from sentence_transformers import SentenceTransformer
9
- from modules.utils import ensure_dirs, chunk_text
10
 
11
- DATA_DIR = Path("data")
12
- INDEX_PATH = DATA_DIR / "vector_store.faiss"
13
- META_PATH = DATA_DIR / "vector_store_meta.pkl"
14
 
 
15
  _model = None
16
 
17
  def _embedder():
 
 
 
 
 
18
  global _model
19
- if _model is None:
20
- _model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  return _model
22
 
23
- def _load_index():
24
- if INDEX_PATH.exists():
25
- index = faiss.read_index(str(INDEX_PATH))
26
- with open(META_PATH, "rb") as f:
27
- meta = pickle.load(f)
28
- return index, meta
29
- d = 384 # all-MiniLM-L6-v2
30
- index = faiss.IndexFlatIP(d)
31
- meta = []
32
- return index, meta
33
-
34
- def _save_index(index, meta):
35
- faiss.write_index(index, str(INDEX_PATH))
36
- with open(META_PATH, "wb") as f:
37
- pickle.dump(meta, f)
38
-
39
- def _extract_text_from_url(url: str) -> str:
40
  try:
41
- r = requests.get(url, timeout=20, headers={"User-Agent":"Mozilla/5.0"})
42
  r.raise_for_status()
43
- doc = Document(r.text)
44
- html = doc.summary()
45
- soup = BeautifulSoup(html, "lxml")
46
- text = soup.get_text("\n")
47
- return re.sub(r"\n{2,}", "\n", text).strip()
48
- except Exception as e:
49
- return f"[ERROR] failed to fetch {url}: {e}"
 
50
 
51
- def _extract_text_from_file(path: str) -> str:
52
- p = Path(path)
53
- if not p.exists():
 
 
 
54
  return ""
55
- if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
56
- return p.read_text(errors="ignore")
57
- return f"[FILE]{p.name}"
58
 
59
- def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
 
 
 
 
60
  ensure_dirs()
61
- index, meta = _load_index()
62
- emb = _embedder()
63
-
64
- docs = []
65
- for u in urls or []:
66
- text = _extract_text_from_url(u)
67
- if text:
68
- docs.append((u, text))
69
- for fp in file_paths or []:
70
- text = _extract_text_from_file(fp)
71
- if text:
72
- docs.append((fp, text))
73
 
74
  added = 0
75
- for src, text in docs:
76
- for chunk in chunk_text(text, 600):
77
- vec = emb.encode([chunk], normalize_embeddings=True)
78
- index.add(vec)
79
- meta.append({"source": src, "text": chunk})
80
- added += 1
81
-
82
- _save_index(index, meta)
83
- return f"Indexed {added} chunks from {len(docs)} sources."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
 
3
  import requests
4
+ from pathlib import Path
5
+ from typing import List, Dict, Any, Optional
6
+
7
  from bs4 import BeautifulSoup
 
 
 
 
8
 
9
+ # utils provides the output location and the text-chunking helper
10
+ from .utils import ensure_dirs, data_dir, chunk_text
 
11
 
12
+ # Heavy dependencies are loaded lazily (model initialization is expensive)
13
  _model = None
14
 
15
def _embedder():
    """Return the process-wide SentenceTransformer, creating it on first use.

    All caches are pinned to writable paths (utils.ensure_dirs() prepares
    them, e.g. under /tmp), and ``cache_folder`` is passed explicitly as
    well so nothing is ever written directly under /root.
    """
    global _model
    if _model is None:
        ensure_dirs()
        cache_dir = data_dir() / "hf_cache"

        # Defence in depth: re-assert the HF-related environment variables
        # here (only when unset) before the heavy import below.
        env_defaults = {
            "HF_HOME": str(data_dir() / "hf_home"),
            "HUGGINGFACE_HUB_CACHE": str(cache_dir),
            "TRANSFORMERS_CACHE": str(cache_dir),
            "SENTENCE_TRANSFORMERS_HOME": str(cache_dir),
            "HF_HUB_DISABLE_TELEMETRY": "1",
            "HF_TOKEN": "",
        }
        for key, value in env_defaults.items():
            os.environ.setdefault(key, value)

        # Imported lazily: pulling in sentence_transformers is slow.
        from sentence_transformers import SentenceTransformer

        chosen = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
        _model = SentenceTransformer(chosen, cache_folder=str(cache_dir))
    return _model
40
 
41
def _write_chunks(rows: List[Dict[str, Any]]) -> int:
    """Append *rows* to data_dir()/chunks.jsonl and return how many were written.

    Existing content is never discarded — rows are simply appended, one
    JSON object per line:
        {"text": "...", "source": "path_or_url", "meta": {...}}
    """
    ensure_dirs()
    target = data_dir() / "chunks.jsonl"
    with open(target, "a", encoding="utf-8") as sink:
        sink.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    return len(rows)
52
+
53
def _load_text_from_url(url: str) -> str:
    """Fetch *url* and return "title\\n\\ntext" extracted from its HTML.

    Best-effort: any network or parse failure yields "" so that indexing
    of the remaining sources can continue.
    """
    try:
        # Send a browser-like User-Agent: many sites answer 403 to the
        # default python-requests UA (the previous revision sent one too,
        # and dropping it silently turned fetchable pages into "").
        r = requests.get(
            url,
            timeout=15,
            headers={"User-Agent": "Mozilla/5.0"},
        )
        r.raise_for_status()
        soup = BeautifulSoup(r.text, "html.parser")
        # Title + body text (simple extraction, no readability heuristics).
        title = soup.title.string.strip() if soup.title and soup.title.string else ""
        body = soup.get_text("\n", strip=True)
        return (title + "\n\n" + body).strip()
    except Exception:
        # Deliberate swallow: one bad URL must not abort the whole batch.
        return ""
65
 
66
+ def _load_text_from_file(path: Path) -> str:
67
+ # テキスト/Markdown想定(PDF等は最小構成では未対応)
68
+ try:
69
+ with open(path, "r", encoding="utf-8", errors="ignore") as f:
70
+ return f.read()
71
+ except Exception:
72
  return ""
 
 
 
73
 
74
def index_files_and_urls(file_paths: Optional[List[str]] = None, urls: Optional[List[str]] = None) -> str:
    """Extract text from the given files and URLs, chunk it, and append
    the chunks to chunks.jsonl.

    Dependencies are kept minimal: plain-text files and HTML pages only
    (PDF/Office are out of scope for this build).

    Args:
        file_paths: local paths to read; unreadable entries are skipped.
        urls: HTTP(S) URLs to fetch; failed fetches are skipped.

    Returns:
        A short status string: "indexed_chunks=<n>, warmed_up=<bool>".
    """
    ensure_dirs()
    file_paths = file_paths or []
    urls = urls or []

    added = 0
    rows: List[Dict[str, Any]] = []

    # Files. The loader returns "" on failure, so skip empty extractions
    # instead of feeding them to chunk_text and indexing empty chunks.
    for p in file_paths:
        try:
            path = Path(p)
            txt = _load_text_from_file(path)
            if not txt:
                continue
            for ch in chunk_text(txt):
                rows.append({"text": ch, "source": str(path), "meta": {"kind": "file"}})
        except Exception:
            continue  # best-effort: one bad path must not abort the batch

    # URLs (same empty-result guard as above).
    for u in urls:
        txt = _load_text_from_url(u)
        if not txt:
            continue
        for ch in chunk_text(txt):
            rows.append({"text": ch, "source": u, "meta": {"kind": "url"}})

    if rows:
        added = _write_chunks(rows)

    # Warm the embedding model once (the first call may need to download
    # the weights). Warm-up is optional: indexing already succeeded.
    try:
        emb = _embedder()
        emb.encode(["warmup"], normalize_embeddings=True)
        warmed = True
    except Exception:
        warmed = False

    return f"indexed_chunks={added}, warmed_up={warmed}"