Spaces:

Corin1998
/

IR_PR_PilotPro

Sleeping

App Files Files Community

Corin1998 committed on Sep 17, 2025

Commit

1bfa547

verified ·

1 Parent(s): 460c48d

Update irpr/deps.py

Browse files

Files changed (1) hide show

irpr/deps.py +41 -31

irpr/deps.py CHANGED Viewed

@@ -1,28 +1,39 @@
 # irpr/deps.py  --- OpenAI埋め込み + 自前ベクタストア（numpy）／LLM生成
 from __future__ import annotations
 import os, json, uuid
-from typing import List, Dict, Optional, Tuple
 import numpy as np
 from irpr.config import settings
-# ==== 書き込み可能ディレクトリの決定 ====
 def _pick_writable_dir() -> str:
-    candidates = [settings.DATA_DIR, "/data", "./var", "/tmp/irpr", "."]
     for base in candidates:
-        try:
-            if not base: continue
-            os.makedirs(base, exist_ok=True)
-            p = os.path.join(base, ".write_test")
-            with open(p, "w") as w: w.write("ok")
-            os.remove(p)
             return base
-        except Exception:
-            continue
     return "."
 BASE_DIR = _pick_writable_dir()
 INDEX_DIR = settings.INDEX_DIR or os.path.join(BASE_DIR, "simple_index")
-os.makedirs(INDEX_DIR, exist_ok=True)
 VECS_PATH = os.path.join(INDEX_DIR, "vectors.npy")     # np.float32 [N,D]（正規化済）
 META_PATH = os.path.join(INDEX_DIR, "meta.jsonl")       # 1行1メタ
@@ -40,13 +51,17 @@ def _openai_client():
     return OpenAI(api_key=key)
 # ==== 収納・ロード ====
-def _load_index() -> Tuple[np.ndarray, List[dict], List[str]]:
     if os.path.exists(VECS_PATH):
-        vecs = np.load(VECS_PATH).astype(np.float32, copy=False)
     else:
         vecs = np.zeros((0, 0), dtype=np.float32)
-    metas: List[dict] = []
-    texts: List[str] = []
     if os.path.exists(META_PATH):
         with open(META_PATH, "r", encoding="utf-8") as f:
             for line in f:
@@ -57,30 +72,29 @@ def _load_index() -> Tuple[np.ndarray, List[dict], List[str]]:
         with open(TEXT_PATH, "r", encoding="utf-8") as f:
             for line in f:
                 texts.append(line.rstrip("\n"))
-    # 整合性チェック
     if vecs.size == 0:
         return np.zeros((0, 0), dtype=np.float32), [], []
     n = vecs.shape[0]
     if len(metas) != n or len(texts) != n:
-        # 壊れているなら初期化
         return np.zeros((0, 0), dtype=np.float32), [], []
     return vecs, metas, texts
-def _save_index(vecs: np.ndarray, metas: List[dict], texts: List[str]) -> None:
-    os.makedirs(INDEX_DIR, exist_ok=True)
     np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
     with open(META_PATH, "w", encoding="utf-8") as f:
         for m in metas:
             f.write(json.dumps(m, ensure_ascii=False) + "\n")
     with open(TEXT_PATH, "w", encoding="utf-8") as f:
         for t in texts:
-            f.write((t or "").replace("\n", "\\n") + "\n")  # 1行1テキストに正規化
 # ==== Embedding ====
 def embed_texts(texts: List[str]) -> np.ndarray:
     client = _openai_client()
-    model = settings.OPENAI_EMBED_MODEL
-    # バッチで呼ぶ
     B = 128
     out = []
     for i in range(0, len(texts), B):
@@ -88,15 +102,11 @@ def embed_texts(texts: List[str]) -> np.ndarray:
         resp = client.embeddings.create(model=model, input=batch)
         out.extend([d.embedding for d in resp.data])
     arr = np.array(out, dtype=np.float32)
-    # 正規化（コサイン類似度用）
     norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
     return arr / norms
 # ==== 追加 ====
 def add_to_index(records: List[Dict]) -> int:
-    """
-    records: [{text, title, source_url, doc_id, chunk_id}]
-    """
     if not records:
         return 0
     texts = [r["text"] for r in records]
@@ -109,7 +119,7 @@ def add_to_index(records: List[Dict]) -> int:
         old_texts = []
     else:
         if vecs.shape[1] != vecs_new.shape[1]:
-            # 埋め込み次元が違う（モデルを変えた等）→作り直し
             vecs = vecs_new
             metas = []
             old_texts = []
@@ -136,8 +146,8 @@ def search(query: str, top_k=8) -> List[Dict]:
     vecs, metas, texts = _load_index()
     if vecs.size == 0:
         return []
-    q = embed_texts([query])[0]  # (D,)
-    scores = vecs @ q  # cosine (正規化済み)
     idx = np.argsort(-scores)[:max(1, top_k)]
     out: List[Dict] = []
     for i in idx.tolist():
@@ -155,7 +165,7 @@ def search(query: str, top_k=8) -> List[Dict]:
 # ==== 生成 ====
 def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
     client = _openai_client()
-    model = settings.OPENAI_CHAT_MODEL
     resp = client.chat.completions.create(
         model=model,
         messages=messages,

 # irpr/deps.py  --- OpenAI埋め込み + 自前ベクタストア（numpy）／LLM生成
 from __future__ import annotations
 import os, json, uuid
+from typing import List, Dict, Tuple
 import numpy as np
 from irpr.config import settings
+# ==== 書き込み先の選定（/mnt/data を最優先） ====
+def _ensure_dir_writable(path: str) -> bool:
+    try:
+        os.makedirs(path, exist_ok=True)
+        try:
+            os.chmod(path, 0o777)
+        except Exception:
+            pass
+        testfile = os.path.join(path, ".write_test")
+        with open(testfile, "wb") as f:
+            f.write(b"ok")
+        os.remove(testfile)
+        return True
+    except Exception:
+        return False
 def _pick_writable_dir() -> str:
+    candidates = []
+    if settings.DATA_DIR:
+        candidates.append(settings.DATA_DIR)
+    candidates += ["/mnt/data", "/data", "./var", "/tmp/irpr", "."]
     for base in candidates:
+        if _ensure_dir_writable(base):
             return base
     return "."
 BASE_DIR = _pick_writable_dir()
 INDEX_DIR = settings.INDEX_DIR or os.path.join(BASE_DIR, "simple_index")
+_ensure_dir_writable(INDEX_DIR)
 VECS_PATH = os.path.join(INDEX_DIR, "vectors.npy")     # np.float32 [N,D]（正規化済）
 META_PATH = os.path.join(INDEX_DIR, "meta.jsonl")       # 1行1メタ
     return OpenAI(api_key=key)
 # ==== 収納・ロード ====
+def _load_index() -> Tuple[np.ndarray, list, list]:
     if os.path.exists(VECS_PATH):
+        try:
+            vecs = np.load(VECS_PATH).astype(np.float32, copy=False)
+        except Exception:
+            vecs = np.zeros((0, 0), dtype=np.float32)
     else:
         vecs = np.zeros((0, 0), dtype=np.float32)
+    metas = []
+    texts = []
     if os.path.exists(META_PATH):
         with open(META_PATH, "r", encoding="utf-8") as f:
             for line in f:
         with open(TEXT_PATH, "r", encoding="utf-8") as f:
             for line in f:
                 texts.append(line.rstrip("\n"))
     if vecs.size == 0:
         return np.zeros((0, 0), dtype=np.float32), [], []
     n = vecs.shape[0]
     if len(metas) != n or len(texts) != n:
+        # 整合性が崩れたら初期化
         return np.zeros((0, 0), dtype=np.float32), [], []
     return vecs, metas, texts
+def _save_index(vecs: np.ndarray, metas: list, texts: list) -> None:
+    _ensure_dir_writable(INDEX_DIR)
     np.save(VECS_PATH, vecs.astype(np.float32, copy=False))
     with open(META_PATH, "w", encoding="utf-8") as f:
         for m in metas:
             f.write(json.dumps(m, ensure_ascii=False) + "\n")
     with open(TEXT_PATH, "w", encoding="utf-8") as f:
         for t in texts:
+            f.write((t or "").replace("\n", "\\n") + "\n")
 # ==== Embedding ====
 def embed_texts(texts: List[str]) -> np.ndarray:
     client = _openai_client()
+    model = os.environ.get("OPENAI_EMBED_MODEL", settings.OPENAI_EMBED_MODEL)
     B = 128
     out = []
     for i in range(0, len(texts), B):
         resp = client.embeddings.create(model=model, input=batch)
         out.extend([d.embedding for d in resp.data])
     arr = np.array(out, dtype=np.float32)
     norms = np.linalg.norm(arr, axis=1, keepdims=True) + 1e-12
     return arr / norms
 # ==== 追加 ====
 def add_to_index(records: List[Dict]) -> int:
     if not records:
         return 0
     texts = [r["text"] for r in records]
         old_texts = []
     else:
         if vecs.shape[1] != vecs_new.shape[1]:
+            # 埋め込みモデル変更 → 既存を捨てて作り直し
             vecs = vecs_new
             metas = []
             old_texts = []
     vecs, metas, texts = _load_index()
     if vecs.size == 0:
         return []
+    q = embed_texts([query])[0]
+    scores = vecs @ q
     idx = np.argsort(-scores)[:max(1, top_k)]
     out: List[Dict] = []
     for i in idx.tolist():
 # ==== 生成 ====
 def generate_chat(messages: List[Dict], max_new_tokens=600, temperature=0.2) -> str:
     client = _openai_client()
+    model = os.environ.get("OPENAI_CHAT_MODEL", settings.OPENAI_CHAT_MODEL)
     resp = client.chat.completions.create(
         model=model,
         messages=messages,