Corin1998 committed on
Commit
f8c4913
·
verified ·
1 Parent(s): 2e448ce

Update irpr/deps.py

Browse files
Files changed (1) hide show
  1. irpr/deps.py +67 -24
irpr/deps.py CHANGED
@@ -5,8 +5,12 @@ from typing import List, Dict, Tuple
5
  import numpy as np
6
  from irpr.config import settings
7
 
8
- # ==== 書き込み先ユーティリティ ====
 
9
  def _ensure_dir_writable(path: str) -> bool:
 
 
 
10
  try:
11
  os.makedirs(path, exist_ok=True)
12
  try:
@@ -21,25 +25,53 @@ def _ensure_dir_writable(path: str) -> bool:
21
  except Exception:
22
  return False
23
 
 
 
 
 
 
 
 
 
 
24
  def _pick_writable_dir() -> str:
25
- candidates = []
 
 
 
 
 
 
 
 
26
  if settings.DATA_DIR:
27
  candidates.append(settings.DATA_DIR)
28
- candidates += ["/mnt/data", "/data", "./var", "/tmp/irpr", "."]
 
 
29
  for base in candidates:
30
- if _ensure_dir_writable(base):
31
  return base
32
- return "."
 
 
 
33
 
34
  BASE_DIR = _pick_writable_dir()
35
- INDEX_DIR = settings.INDEX_DIR or os.path.join(BASE_DIR, "simple_index")
36
- _ensure_dir_writable(INDEX_DIR)
 
 
 
 
 
37
 
38
  VECS_PATH = os.path.join(INDEX_DIR, "vectors.npy")
39
  META_PATH = os.path.join(INDEX_DIR, "meta.jsonl")
40
  TEXT_PATH = os.path.join(INDEX_DIR, "texts.jsonl")
41
 
42
- # ==== OpenAI ====
 
43
  def _openai_client():
44
  try:
45
  from openai import OpenAI
@@ -50,7 +82,8 @@ def _openai_client():
50
  raise RuntimeError("OPENAI_API_KEY が未設定です。環境変数に設定してください。")
51
  return OpenAI(api_key=key)
52
 
53
- # ==== インデックス I/O ====
 
54
  def _load_index() -> Tuple[np.ndarray, list, list]:
55
  if os.path.exists(VECS_PATH):
56
  try:
@@ -80,34 +113,42 @@ def _load_index() -> Tuple[np.ndarray, list, list]:
80
  return vecs, metas, texts
81
 
82
  def _save_index(vecs: np.ndarray, metas: list, texts: list) -> None:
83
- # 念のため親ディレクトリを都度作成
84
- os.makedirs(os.path.dirname(VECS_PATH), exist_ok=True)
85
- os.makedirs(os.path.dirname(META_PATH), exist_ok=True)
86
- os.makedirs(os.path.dirname(TEXT_PATH), exist_ok=True)
 
 
 
 
 
 
 
87
 
88
  if not _ensure_dir_writable(INDEX_DIR):
89
- raise RuntimeError(f"INDEX_DIR not writable: {INDEX_DIR} (BASE_DIR={BASE_DIR})")
90
 
91
  try:
92
  np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
93
- except FileNotFoundError as e:
94
- raise RuntimeError(f"Failed to save vectors at {VECS_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
95
 
96
  try:
97
  with open(META_PATH, "w", encoding="utf-8") as f:
98
  for m in metas:
99
  f.write(json.dumps(m, ensure_ascii=False) + "\n")
100
- except FileNotFoundError as e:
101
- raise RuntimeError(f"Failed to save meta at {META_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
102
 
103
  try:
104
  with open(TEXT_PATH, "w", encoding="utf-8") as f:
105
  for t in texts:
106
  f.write((t or "").replace("\n", "\\n") + "\n")
107
- except FileNotFoundError as e:
108
- raise RuntimeError(f"Failed to save texts at {TEXT_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
 
 
109
 
110
- # ==== Embedding ====
111
  def embed_texts(texts: List[str]) -> np.ndarray:
112
  client = _openai_client()
113
  model = os.environ.get("OPENAI_EMBED_MODEL", settings.OPENAI_EMBED_MODEL)
@@ -121,7 +162,8 @@ def embed_texts(texts: List[str]) -> np.ndarray:
121
  norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
122
  return arr / norms
123
 
124
- # ==== 追加 ====
 
125
  def add_to_index(records: List[Dict]) -> int:
126
  if not records:
127
  return 0
@@ -135,6 +177,7 @@ def add_to_index(records: List[Dict]) -> int:
135
  old_texts = []
136
  else:
137
  if vecs.shape[1] != vecs_new.shape[1]:
 
138
  vecs = vecs_new
139
  metas = []
140
  old_texts = []
@@ -156,7 +199,8 @@ def add_to_index(records: List[Dict]) -> int:
156
  _save_index(vecs, metas, old_texts)
157
  return len(records)
158
 
159
- # ==== 検索 ====
 
160
  def search(query: str, top_k=8) -> List[Dict]:
161
  vecs, metas, texts = _load_index()
162
  if vecs.size == 0:
@@ -177,7 +221,6 @@ def search(query: str, top_k=8) -> List[Dict]:
177
  })
178
  return out
179
 
180
- # ==== 生成 ====
181
  def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
182
  client = _openai_client()
183
  model = os.environ.get("OPENAI_CHAT_MODEL", settings.OPENAI_CHAT_MODEL)
 
5
  import numpy as np
6
  from irpr.config import settings
7
 
8
+ # ========= 書き込み可能ディレクトリの選定 =========
9
+
10
  def _ensure_dir_writable(path: str) -> bool:
11
+ """
12
+ path を作成し、テストファイルを書いて削除できるか検証。
13
+ """
14
  try:
15
  os.makedirs(path, exist_ok=True)
16
  try:
 
25
  except Exception:
26
  return False
27
 
28
def _ensure_dir_tree(base: str, sub: str = "simple_index") -> bool:
    """Check that both ``base`` and ``base/sub`` pass the writability probe.

    Args:
        base: Directory to probe first.
        sub: Name of the subdirectory under ``base`` to probe second.

    Returns:
        ``False`` as soon as ``base`` fails the probe; otherwise the result
        of probing ``base/sub`` with ``_ensure_dir_writable``.
    """
    if _ensure_dir_writable(base):
        # Only touch the subdirectory once the parent is known to be usable.
        nested = os.path.join(base, sub)
        return _ensure_dir_writable(nested)
    return False
36
+
37
  def _pick_writable_dir() -> str:
38
+ """
39
+ 優先度順で書き込み可能な base dir を返す。
40
+ 1) 環境変数 DATA_DIR
41
+ 2) /tmp/irpr
42
+ 3) /mnt/data
43
+ 4) ./data (カレントに書ける場合のみ)
44
+ 最後に /tmp
45
+ """
46
+ candidates: list[str] = []
47
  if settings.DATA_DIR:
48
  candidates.append(settings.DATA_DIR)
49
+
50
+ candidates += ["/tmp/irpr", "/mnt/data", os.path.join(os.getcwd(), "data")]
51
+
52
  for base in candidates:
53
+ if _ensure_dir_tree(base, "simple_index"):
54
  return base
55
+ # 最後の砦
56
+ fallback = "/tmp"
57
+ _ensure_dir_tree(fallback, "irpr_index")
58
+ return fallback
59
 
60
  BASE_DIR = _pick_writable_dir()
61
+
62
+ # INDEX_DIR は明示指定があれば尊重するが、書けなければ BASE_DIR/simple_index にフォールバック
63
+ if settings.INDEX_DIR and _ensure_dir_tree(settings.INDEX_DIR, ""):
64
+ INDEX_DIR = settings.INDEX_DIR
65
+ else:
66
+ INDEX_DIR = os.path.join(BASE_DIR, "simple_index")
67
+ _ensure_dir_writable(INDEX_DIR)
68
 
69
  VECS_PATH = os.path.join(INDEX_DIR, "vectors.npy")
70
  META_PATH = os.path.join(INDEX_DIR, "meta.jsonl")
71
  TEXT_PATH = os.path.join(INDEX_DIR, "texts.jsonl")
72
 
73
+ # ========= OpenAI =========
74
+
75
  def _openai_client():
76
  try:
77
  from openai import OpenAI
 
82
  raise RuntimeError("OPENAI_API_KEY が未設定です。環境変数に設定してください。")
83
  return OpenAI(api_key=key)
84
 
85
+ # ========= インデックス I/O =========
86
+
87
  def _load_index() -> Tuple[np.ndarray, list, list]:
88
  if os.path.exists(VECS_PATH):
89
  try:
 
113
  return vecs, metas, texts
114
 
115
  def _save_index(vecs: np.ndarray, metas: list, texts: list) -> None:
116
+ """
117
+ 保存直前にもパス全部を作成。PermissionError は詳細情報を付けて再送出。
118
+ """
119
+ try:
120
+ os.makedirs(os.path.dirname(VECS_PATH), exist_ok=True)
121
+ os.makedirs(os.path.dirname(META_PATH), exist_ok=True)
122
+ os.makedirs(os.path.dirname(TEXT_PATH), exist_ok=True)
123
+ except PermissionError as e:
124
+ raise RuntimeError(
125
+ f"INDEX_DIR にディレクトリを作成できません: INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR}"
126
+ ) from e
127
 
128
  if not _ensure_dir_writable(INDEX_DIR):
129
+ raise RuntimeError(f"INDEX_DIR に書き込みできません: {INDEX_DIR} (BASE_DIR={BASE_DIR})")
130
 
131
  try:
132
  np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
133
+ except PermissionError as e:
134
+ raise RuntimeError(f"ベクトル保存に失敗: {VECS_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
135
 
136
  try:
137
  with open(META_PATH, "w", encoding="utf-8") as f:
138
  for m in metas:
139
  f.write(json.dumps(m, ensure_ascii=False) + "\n")
140
+ except PermissionError as e:
141
+ raise RuntimeError(f"メタ保存に失敗: {META_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
142
 
143
  try:
144
  with open(TEXT_PATH, "w", encoding="utf-8") as f:
145
  for t in texts:
146
  f.write((t or "").replace("\n", "\\n") + "\n")
147
+ except PermissionError as e:
148
+ raise RuntimeError(f"本文保存に失敗: {TEXT_PATH} (INDEX_DIR={INDEX_DIR}, BASE_DIR={BASE_DIR})") from e
149
+
150
+ # ========= Embedding =========
151
 
 
152
  def embed_texts(texts: List[str]) -> np.ndarray:
153
  client = _openai_client()
154
  model = os.environ.get("OPENAI_EMBED_MODEL", settings.OPENAI_EMBED_MODEL)
 
162
  norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
163
  return arr / norms
164
 
165
+ # ========= 追加 =========
166
+
167
  def add_to_index(records: List[Dict]) -> int:
168
  if not records:
169
  return 0
 
177
  old_texts = []
178
  else:
179
  if vecs.shape[1] != vecs_new.shape[1]:
180
+ # 次元不一致は全リビルド(安全第一)
181
  vecs = vecs_new
182
  metas = []
183
  old_texts = []
 
199
  _save_index(vecs, metas, old_texts)
200
  return len(records)
201
 
202
+ # ========= 検索・生成 =========
203
+
204
  def search(query: str, top_k=8) -> List[Dict]:
205
  vecs, metas, texts = _load_index()
206
  if vecs.size == 0:
 
221
  })
222
  return out
223
 
 
224
  def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
225
  client = _openai_client()
226
  model = os.environ.get("OPENAI_CHAT_MODEL", settings.OPENAI_CHAT_MODEL)