Spaces:
Sleeping
Sleeping
Update modules/rag_indexer.py
Browse files- modules/rag_indexer.py +96 -65
modules/rag_indexer.py
CHANGED
|
@@ -1,83 +1,114 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
from pathlib import Path
|
| 4 |
import requests
|
|
|
|
|
|
|
|
|
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
-
from readability import Document
|
| 7 |
-
import faiss
|
| 8 |
-
from sentence_transformers import SentenceTransformer
|
| 9 |
-
from modules.utils import ensure_dirs, chunk_text
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
META_PATH = DATA_DIR / "vector_store_meta.pkl"
|
| 14 |
|
|
|
|
| 15 |
_model = None
|
| 16 |
|
| 17 |
def _embedder():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
global _model
|
| 19 |
-
if _model is None:
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
return _model
|
| 22 |
|
| 23 |
-
def
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
with open(META_PATH, "wb") as f:
|
| 37 |
-
pickle.dump(meta, f)
|
| 38 |
-
|
| 39 |
-
def _extract_text_from_url(url: str) -> str:
|
| 40 |
try:
|
| 41 |
-
r = requests.get(url, timeout=
|
| 42 |
r.raise_for_status()
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
|
|
|
| 50 |
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
| 54 |
return ""
|
| 55 |
-
if p.suffix.lower() in [".txt", ".md", ".csv", ".json", ".py"]:
|
| 56 |
-
return p.read_text(errors="ignore")
|
| 57 |
-
return f"[FILE]{p.name}"
|
| 58 |
|
| 59 |
-
def index_files_and_urls(file_paths: List[str], urls: List[str]) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 60 |
ensure_dirs()
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
docs = []
|
| 65 |
-
for u in urls or []:
|
| 66 |
-
text = _extract_text_from_url(u)
|
| 67 |
-
if text:
|
| 68 |
-
docs.append((u, text))
|
| 69 |
-
for fp in file_paths or []:
|
| 70 |
-
text = _extract_text_from_file(fp)
|
| 71 |
-
if text:
|
| 72 |
-
docs.append((fp, text))
|
| 73 |
|
| 74 |
added = 0
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
|
|
|
| 3 |
import requests
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List, Dict, Any, Optional
|
| 6 |
+
|
| 7 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
| 9 |
+
# utils から書き込み先と分割関数を取得
|
| 10 |
+
from .utils import ensure_dirs, data_dir, chunk_text
|
|
|
|
| 11 |
|
| 12 |
+
# 依存は遅延ロード(モデル初期化は重いので)
|
| 13 |
# Module-level cache: the embedding model is built once and reused.
_model = None


def _embedder():
    """Lazily build and memoize the sentence-embedding model.

    All Hugging Face cache locations are pinned to the writable data
    directory (set up by ``ensure_dirs()``) before the heavy import, and
    ``cache_folder`` is passed explicitly so nothing lands under /root.

    Returns:
        The shared ``SentenceTransformer`` instance.
    """
    global _model
    if _model is not None:
        return _model

    ensure_dirs()
    cache_base = data_dir() / "hf_cache"

    # Defense in depth: make sure every HF cache/telemetry variable points
    # at a writable path even if ensure_dirs() already configured them.
    # setdefault never overrides values the operator set explicitly.
    env_defaults = {
        "HF_HOME": str(data_dir() / "hf_home"),
        "HUGGINGFACE_HUB_CACHE": str(cache_base),
        "TRANSFORMERS_CACHE": str(cache_base),
        "SENTENCE_TRANSFORMERS_HOME": str(cache_base),
        "HF_HUB_DISABLE_TELEMETRY": "1",
        "HF_TOKEN": "",
    }
    for key, value in env_defaults.items():
        os.environ.setdefault(key, value)

    # Deferred import: model initialization is expensive, so the dependency
    # is only loaded when an embedder is actually requested.
    from sentence_transformers import SentenceTransformer

    name = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    _model = SentenceTransformer(name, cache_folder=str(cache_base))
    return _model
|
| 40 |
|
| 41 |
+
def _write_chunks(rows: List[Dict[str, Any]]) -> int:
    """Append chunk records to ``chunks.jsonl`` and return how many were written.

    Existing file content is preserved (simple append-only store).
    One record per line: ``{"text": "...", "source": "path_or_url", "meta": {...}}``.
    """
    ensure_dirs()
    target = data_dir() / "chunks.jsonl"
    # Serialize lazily; ensure_ascii=False keeps non-ASCII text readable.
    lines = (json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
    with open(target, "a", encoding="utf-8") as fh:
        fh.writelines(lines)
    return len(rows)
|
| 52 |
+
|
| 53 |
+
def _load_text_from_url(url: str) -> str:
    """Fetch *url* and return its page title plus visible text.

    Best-effort: any failure (network, HTTP status, parsing) yields ``""``
    so a single bad URL never aborts a whole indexing run.
    """
    try:
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "html.parser")
        # Simple extraction: <title> followed by all visible body text.
        title = ""
        title_tag = soup.title
        if title_tag and title_tag.string:
            title = title_tag.string.strip()
        body = soup.get_text("\n", strip=True)
        return "\n\n".join([title, body]).strip()
    except Exception:
        return ""
|
| 65 |
|
| 66 |
+
def _load_text_from_file(path: Path) -> str:
|
| 67 |
+
# テキスト/Markdown想定(PDF等は最小構成では未対応)
|
| 68 |
+
try:
|
| 69 |
+
with open(path, "r", encoding="utf-8", errors="ignore") as f:
|
| 70 |
+
return f.read()
|
| 71 |
+
except Exception:
|
| 72 |
return ""
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
def index_files_and_urls(file_paths: Optional[List[str]] = None, urls: Optional[List[str]] = None) -> str:
    """Extract text from the given files and URLs and append chunks to chunks.jsonl.

    - Dependencies are kept minimal (PDF/Office files are out of scope).
    - After indexing, the embedding model is initialized once so the first
      real query does not pay the model-download cost.

    Args:
        file_paths: local file paths to index (text/Markdown expected).
        urls: web pages to fetch and index.

    Returns:
        Summary string ``"indexed_chunks=<n>, warmed_up=<bool>"``.
    """
    ensure_dirs()
    file_paths = file_paths or []
    urls = urls or []

    added = 0
    rows: List[Dict[str, Any]] = []

    # Files: each path is wrapped in its own try so one bad entry
    # (unparseable path, etc.) cannot abort the rest of the batch.
    for p in file_paths:
        try:
            path = Path(p)
            txt = _load_text_from_file(path)
            for ch in chunk_text(txt):
                rows.append({"text": ch, "source": str(path), "meta": {"kind": "file"}})
        except Exception:
            continue

    # URLs: _load_text_from_url is already best-effort and returns "" on failure.
    for u in urls:
        txt = _load_text_from_url(u)
        for ch in chunk_text(txt):
            rows.append({"text": ch, "source": u, "meta": {"kind": "url"}})

    if rows:
        added = _write_chunks(rows)

    # Warm up the embedding model once (the first call may trigger a download).
    # Fix: drop the unused `as e` binding and the unused `_ =` assignment.
    try:
        emb = _embedder()
        # A single encode primes the on-disk cache.
        emb.encode(["warmup"], normalize_embeddings=True)
        warmed = True
    except Exception:
        # Best-effort: indexing succeeded even if warm-up failed (e.g. offline).
        warmed = False

    return f"indexed_chunks={added}, warmed_up={warmed}"
|