Spaces:
Sleeping
Sleeping
Update irpr/deps.py
Browse files- irpr/deps.py +67 -24
irpr/deps.py
CHANGED
|
@@ -5,8 +5,12 @@ from typing import List, Dict, Tuple
|
|
| 5 |
import numpy as np
|
| 6 |
from irpr.config import settings
|
| 7 |
|
| 8 |
-
# ==== 書き込み
|
|
|
|
| 9 |
def _ensure_dir_writable(path: str) -> bool:
|
|
|
|
|
|
|
|
|
|
| 10 |
try:
|
| 11 |
os.makedirs(path, exist_ok=True)
|
| 12 |
try:
|
|
@@ -21,25 +25,53 @@ def _ensure_dir_writable(path: str) -> bool:
|
|
| 21 |
except Exception:
|
| 22 |
return False
|
| 23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
def _pick_writable_dir() -> str:
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
if settings.DATA_DIR:
|
| 27 |
candidates.append(settings.DATA_DIR)
|
| 28 |
-
|
|
|
|
|
|
|
| 29 |
for base in candidates:
|
| 30 |
-
if
|
| 31 |
return base
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
BASE_DIR = _pick_writable_dir()
|
| 35 |
-
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
VECS_PATH = os.path.join(INDEX_DIR, "vectors.npy")
|
| 39 |
META_PATH = os.path.join(INDEX_DIR, "meta.jsonl")
|
| 40 |
TEXT_PATH = os.path.join(INDEX_DIR, "texts.jsonl")
|
| 41 |
|
| 42 |
-
# ==== OpenAI ====
|
|
|
|
| 43 |
def _openai_client():
|
| 44 |
try:
|
| 45 |
from openai import OpenAI
|
|
@@ -50,7 +82,8 @@ def _openai_client():
|
|
| 50 |
raise RuntimeError("OPENAI_API_KEY が未設定です。環境変数に設定してください。")
|
| 51 |
return OpenAI(api_key=key)
|
| 52 |
|
| 53 |
-
# ==== インデックス I/O ====
|
|
|
|
| 54 |
def _load_index() -> Tuple[np.ndarray, list, list]:
|
| 55 |
if os.path.exists(VECS_PATH):
|
| 56 |
try:
|
|
@@ -80,34 +113,42 @@ def _load_index() -> Tuple[np.ndarray, list, list]:
|
|
| 80 |
return vecs, metas, texts
|
| 81 |
|
| 82 |
def _save_index(vecs: np.ndarray, metas: list, texts: list) -> None:
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
if not _ensure_dir_writable(INDEX_DIR):
|
| 89 |
-
raise RuntimeError(f"INDEX_DIR
|
| 90 |
|
| 91 |
try:
|
| 92 |
np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
|
| 93 |
-
except
|
| 94 |
-
raise RuntimeError(f"
|
| 95 |
|
| 96 |
try:
|
| 97 |
with open(META_PATH, "w", encoding="utf-8") as f:
|
| 98 |
for m in metas:
|
| 99 |
f.write(json.dumps(m, ensure_ascii=False) + "\n")
|
| 100 |
-
except
|
| 101 |
-
raise RuntimeError(f"
|
| 102 |
|
| 103 |
try:
|
| 104 |
with open(TEXT_PATH, "w", encoding="utf-8") as f:
|
| 105 |
for t in texts:
|
| 106 |
f.write((t or "").replace("\n", "\\n") + "\n")
|
| 107 |
-
except
|
| 108 |
-
raise RuntimeError(f"
|
|
|
|
|
|
|
| 109 |
|
| 110 |
-
# ==== Embedding ====
|
| 111 |
def embed_texts(texts: List[str]) -> np.ndarray:
|
| 112 |
client = _openai_client()
|
| 113 |
model = os.environ.get("OPENAI_EMBED_MODEL", settings.OPENAI_EMBED_MODEL)
|
|
@@ -121,7 +162,8 @@ def embed_texts(texts: List[str]) -> np.ndarray:
|
|
| 121 |
norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
|
| 122 |
return arr / norms
|
| 123 |
|
| 124 |
-
# ==== 追加 ====
|
|
|
|
| 125 |
def add_to_index(records: List[Dict]) -> int:
|
| 126 |
if not records:
|
| 127 |
return 0
|
|
@@ -135,6 +177,7 @@ def add_to_index(records: List[Dict]) -> int:
|
|
| 135 |
old_texts = []
|
| 136 |
else:
|
| 137 |
if vecs.shape[1] != vecs_new.shape[1]:
|
|
|
|
| 138 |
vecs = vecs_new
|
| 139 |
metas = []
|
| 140 |
old_texts = []
|
|
@@ -156,7 +199,8 @@ def add_to_index(records: List[Dict]) -> int:
|
|
| 156 |
_save_index(vecs, metas, old_texts)
|
| 157 |
return len(records)
|
| 158 |
|
| 159 |
-
# ==== 検索 ====
|
|
|
|
| 160 |
def search(query: str, top_k=8) -> List[Dict]:
|
| 161 |
vecs, metas, texts = _load_index()
|
| 162 |
if vecs.size == 0:
|
|
@@ -177,7 +221,6 @@ def search(query: str, top_k=8) -> List[Dict]:
|
|
| 177 |
})
|
| 178 |
return out
|
| 179 |
|
| 180 |
-
# ==== 生成 ====
|
| 181 |
def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
|
| 182 |
client = _openai_client()
|
| 183 |
model = os.environ.get("OPENAI_CHAT_MODEL", settings.OPENAI_CHAT_MODEL)
|
|
|
|
| 5 |
import numpy as np
|
| 6 |
from irpr.config import settings
|
| 7 |
|
| 8 |
+
# ========= 書き込み可能ディレクトリの選定 =========
|
| 9 |
+
|
| 10 |
def _ensure_dir_writable(path: str) -> bool:
|
| 11 |
+
"""
|
| 12 |
+
path を作成し、テストファイルを書いて削除できるか検証。
|
| 13 |
+
"""
|
| 14 |
try:
|
| 15 |
os.makedirs(path, exist_ok=True)
|
| 16 |
try:
|
|
|
|
| 25 |
except Exception:
|
| 26 |
return False
|
| 27 |
|
| 28 |
+
def _ensure_dir_tree(base: str, sub: str = "simple_index") -> bool:
    """Verify that *base* and its *sub* child directory are both writable.

    Returns True only when the write probe succeeds at both levels;
    short-circuits on the parent so the child is never probed needlessly.
    """
    child = os.path.join(base, sub)
    return _ensure_dir_writable(base) and _ensure_dir_writable(child)
|
| 36 |
+
|
| 37 |
def _pick_writable_dir() -> str:
    """Return the first writable base directory, tried in priority order.

    Priority:
      1) settings.DATA_DIR (when configured)
      2) /tmp/irpr
      3) /mnt/data
      4) ./data (only usable when the current directory is writable)
    Falls back to /tmp as the last resort.
    """
    candidates: list[str] = []
    if settings.DATA_DIR:
        candidates.append(settings.DATA_DIR)

    candidates.extend(["/tmp/irpr", "/mnt/data", os.path.join(os.getcwd(), "data")])

    chosen = next(
        (base for base in candidates if _ensure_dir_tree(base, "simple_index")),
        None,
    )
    if chosen is not None:
        return chosen

    # Last resort: best-effort preparation of /tmp, returned regardless of outcome.
    fallback = "/tmp"
    _ensure_dir_tree(fallback, "irpr_index")
    return fallback
|
| 59 |
|
| 60 |
BASE_DIR = _pick_writable_dir()

# Honour an explicitly configured INDEX_DIR, but only when it proves writable;
# otherwise fall back to BASE_DIR/simple_index.
_explicit = settings.INDEX_DIR
if _explicit and _ensure_dir_tree(_explicit, ""):
    INDEX_DIR = _explicit
else:
    INDEX_DIR = os.path.join(BASE_DIR, "simple_index")
    _ensure_dir_writable(INDEX_DIR)

# On-disk layout of the index: all three files live directly under INDEX_DIR.
VECS_PATH, META_PATH, TEXT_PATH = (
    os.path.join(INDEX_DIR, fname)
    for fname in ("vectors.npy", "meta.jsonl", "texts.jsonl")
)
|
| 72 |
|
| 73 |
+
# ========= OpenAI =========
|
| 74 |
+
|
| 75 |
def _openai_client():
|
| 76 |
try:
|
| 77 |
from openai import OpenAI
|
|
|
|
| 82 |
raise RuntimeError("OPENAI_API_KEY が未設定です。環境変数に設定してください。")
|
| 83 |
return OpenAI(api_key=key)
|
| 84 |
|
| 85 |
+
# ========= インデックス I/O =========
|
| 86 |
+
|
| 87 |
def _load_index() -> Tuple[np.ndarray, list, list]:
|
| 88 |
if os.path.exists(VECS_PATH):
|
| 89 |
try:
|
|
|
|
| 113 |
return vecs, metas, texts
|
| 114 |
|
| 115 |
def _save_index(vecs: np.ndarray, metas: list, texts: list) -> None:
    """Persist the index (vectors, metadata, raw texts) under INDEX_DIR.

    Every target directory is (re)created immediately before writing;
    a PermissionError at any step is re-raised as RuntimeError with the
    offending path and the resolved INDEX_DIR/BASE_DIR attached.
    """
    try:
        for target in (VECS_PATH, META_PATH, TEXT_PATH):
            os.makedirs(os.path.dirname(target), exist_ok=True)
    except PermissionError as exc:
        raise RuntimeError(
            f"INDEX_DIR にディレクトリを作成できません: INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR}"
        ) from exc

    if not _ensure_dir_writable(INDEX_DIR):
        raise RuntimeError(f"INDEX_DIR に書き込みできません: {INDEX_DIR} (BASE_DIR={BASE_DIR})")

    # Vectors are stored as float32; copy only when a cast is actually needed.
    try:
        np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
    except PermissionError as exc:
        raise RuntimeError(f"ベクトル保存に失敗: {VECS_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from exc

    # One JSON object per line (JSONL), non-ASCII preserved as-is.
    try:
        with open(META_PATH, "w", encoding="utf-8") as fh:
            fh.writelines(json.dumps(record, ensure_ascii=False) + "\n" for record in metas)
    except PermissionError as exc:
        raise RuntimeError(f"メタ保存に失敗: {META_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from exc

    # Texts are flattened to one physical line each: embedded newlines become "\n".
    try:
        with open(TEXT_PATH, "w", encoding="utf-8") as fh:
            fh.writelines((line or "").replace("\n", "\\n") + "\n" for line in texts)
    except PermissionError as exc:
        raise RuntimeError(f"本文保存に失敗: {TEXT_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from exc
|
| 149 |
+
|
| 150 |
+
# ========= Embedding =========
|
| 151 |
|
|
|
|
| 152 |
def embed_texts(texts: List[str]) -> np.ndarray:
|
| 153 |
client = _openai_client()
|
| 154 |
model = os.environ.get("OPENAI_EMBED_MODEL", settings.OPENAI_EMBED_MODEL)
|
|
|
|
| 162 |
norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
|
| 163 |
return arr / norms
|
| 164 |
|
| 165 |
+
# ========= 追加 =========
|
| 166 |
+
|
| 167 |
def add_to_index(records: List[Dict]) -> int:
|
| 168 |
if not records:
|
| 169 |
return 0
|
|
|
|
| 177 |
old_texts = []
|
| 178 |
else:
|
| 179 |
if vecs.shape[1] != vecs_new.shape[1]:
|
| 180 |
+
# 次元不一致は全リビルド(安全第一)
|
| 181 |
vecs = vecs_new
|
| 182 |
metas = []
|
| 183 |
old_texts = []
|
|
|
|
| 199 |
_save_index(vecs, metas, old_texts)
|
| 200 |
return len(records)
|
| 201 |
|
| 202 |
+
# ========= 検索・生成 =========
|
| 203 |
+
|
| 204 |
def search(query: str, top_k=8) -> List[Dict]:
|
| 205 |
vecs, metas, texts = _load_index()
|
| 206 |
if vecs.size == 0:
|
|
|
|
| 221 |
})
|
| 222 |
return out
|
| 223 |
|
|
|
|
| 224 |
def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
|
| 225 |
client = _openai_client()
|
| 226 |
model = os.environ.get("OPENAI_CHAT_MODEL", settings.OPENAI_CHAT_MODEL)
|