|
|
""" |
|
|
Utilities for saving and loading embeddings.
|
|
""" |
|
|
|
|
|
import json |
|
|
import logging |
|
|
import os |
|
|
from datetime import datetime |
|
|
from pathlib import Path |
|
|
from typing import List, Dict, Tuple |
|
|
|
|
|
import numpy as np |
|
|
from langchain_core.documents import Document |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
def save_embeddings(
    documents: List[Document],
    vectors: List[List[float]],
    model_name: str,
    embeddings_dir: str = 'data/embeddings'
) -> str:
    """Save embeddings and their metadata to a timestamped JSON file.

    The file is written as ``embeddings_<YYYYmmdd_HHMMSS>.json`` inside
    ``embeddings_dir`` (created if missing). Afterwards a ``latest.json``
    symlink pointing at the new file is (re)created; if symlinks cannot be
    created, the path of the new file is written to ``latest.txt`` instead.

    Args:
        documents: Document objects, one per embedding vector.
        vectors: Embedding vectors, aligned index-by-index with
            ``documents``. Plain lists, NumPy arrays and lists of NumPy
            scalars are all accepted; they are converted to plain Python
            lists before serialization.
        model_name: Name of the embedding model that produced the vectors.
        embeddings_dir: Directory in which the embeddings are stored.

    Returns:
        Path of the JSON file that was written.

    Raises:
        ValueError: If ``documents`` and ``vectors`` differ in length.
    """
    # zip() would silently drop the surplus items and leave
    # "total_documents" out of sync with the stored entries.
    if len(documents) != len(vectors):
        raise ValueError(
            f"documents ({len(documents)}) and vectors ({len(vectors)}) "
            "must have the same length"
        )

    Path(embeddings_dir).mkdir(parents=True, exist_ok=True)

    # Take the time once so the filename and the stored timestamp agree.
    now = datetime.now()
    timestamp = now.strftime("%Y%m%d_%H%M%S")

    embeddings_data = {
        "model": model_name,
        "timestamp": now.isoformat(),
        "total_documents": len(documents),
        "embeddings": [
            {
                "chunk_id": f"doc_{i}_chunk_{doc.metadata.get('chunk_index', 0)}",
                "metadata": doc.metadata,
                "content": doc.page_content,
                # json cannot serialize ndarrays or NumPy scalars; tolist()
                # yields nested plain Python numbers.
                "vector": np.asarray(vector).tolist(),
            }
            for i, (doc, vector) in enumerate(zip(documents, vectors))
        ],
    }

    output_file = os.path.join(embeddings_dir, f"embeddings_{timestamp}.json")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(embeddings_data, f, ensure_ascii=False, indent=2)

    latest_link = os.path.join(embeddings_dir, "latest.json")
    # lexists() also reports a dangling symlink (exists() follows the link
    # and returns False), which would otherwise make os.symlink() fail.
    if os.path.lexists(latest_link):
        os.remove(latest_link)
    try:
        # Relative target: the link stays valid if the directory is moved.
        os.symlink(os.path.basename(output_file), latest_link)
    except OSError:
        # Symlink creation failed (e.g. not permitted on this platform):
        # record the path in a text file instead. The default text encoding
        # is kept on purpose so existing readers of latest.txt still work.
        with open(os.path.join(embeddings_dir, "latest.txt"), 'w') as f:
            f.write(output_file)

    logger.info(f"Embeddings saved to: {output_file}")
    logger.info(f"Total embeddings saved: {len(vectors)}")

    return output_file
|
|
|
|
|
|
|
|
def load_embeddings(
    embeddings_file: str = None,
    embeddings_dir: str = 'data/embeddings'
) -> Tuple[List[Document], np.ndarray, str]:
    """Load embeddings from a previously saved JSON file.

    When ``embeddings_file`` is None the most recent file is located via
    ``<embeddings_dir>/latest.json`` or, if that does not exist, via the
    path stored in ``<embeddings_dir>/latest.txt``.

    Args:
        embeddings_file: Path of the embeddings file to load. If None, the
            latest file in ``embeddings_dir`` is used.
        embeddings_dir: Directory in which the embeddings are stored.

    Returns:
        A ``(documents, vectors, model_name)`` tuple, where ``vectors`` is a
        NumPy array built from the stored vectors and ``model_name`` is the
        file's ``"model"`` entry.

    Raises:
        FileNotFoundError: If no file was given and neither ``latest.json``
            nor ``latest.txt`` exists in ``embeddings_dir``, or if the
            resolved file cannot be opened.
    """
    if embeddings_file is None:
        latest_link = os.path.join(embeddings_dir, "latest.json")
        latest_txt = os.path.join(embeddings_dir, "latest.txt")

        if os.path.exists(latest_link):
            embeddings_file = latest_link
        elif os.path.exists(latest_txt):
            with open(latest_txt, 'r') as f:
                embeddings_file = f.read().strip()
            # The stored path may be relative to the working directory in
            # use when it was written. If it does not resolve from here,
            # look for the same file name inside embeddings_dir.
            if not os.path.exists(embeddings_file):
                candidate = os.path.join(
                    embeddings_dir, os.path.basename(embeddings_file)
                )
                if os.path.isfile(candidate):
                    embeddings_file = candidate
        else:
            raise FileNotFoundError(f"{embeddings_dir} に埋め込みファイルが見つかりません")

    with open(embeddings_file, 'r', encoding='utf-8') as f:
        embeddings_data = json.load(f)

    entries = embeddings_data["embeddings"]

    # Rebuild the Document objects and collect the vectors in file order.
    documents = [
        Document(page_content=item["content"], metadata=item["metadata"])
        for item in entries
    ]
    vectors_np = np.array([item["vector"] for item in entries])

    logger.info(f"Loaded {len(documents)} embeddings from: {embeddings_file}")
    logger.info(f"Model used: {embeddings_data['model']}")
    logger.info(f"Created at: {embeddings_data['timestamp']}")

    return documents, vectors_np, embeddings_data["model"]
|
|
|