# parseai-document-processor / vector_store.py
# Initial commit: Add ParseAI document processor application
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import numpy as np
from huggingface_hub import HfApi, hf_hub_download
from sentence_transformers import SentenceTransformer
class VectorStore:
    """Embeds documents with a SentenceTransformer model and persists the
    vectors and per-document metadata to a private Hugging Face dataset repo.

    In-memory state (``documents``, ``metadata``) mirrors what has been
    uploaded during this process's lifetime; ``search`` always reads back
    from the remote dataset.
    """

    def __init__(self):
        self.documents = []  # raw document texts added this session
        self.metadata = []   # matching metadata dicts, same order as documents
        # BUG FIX: metadata_path is read by _save_metadata/_load_metadata but
        # was never defined, causing an AttributeError. Default local path:
        self.metadata_path = "metadata/store.json"
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.hf_api = HfApi()
        self.dataset_name = "bluewhale2025/parseai_202506"  # HF dataset repo id
        # Create the dataset repo if missing. exist_ok=True makes reruns a
        # no-op instead of relying on the except branch to swallow the error.
        try:
            self.hf_api.create_repo(
                repo_id=self.dataset_name,
                repo_type="dataset",
                private=True,  # keep the dataset private
                exist_ok=True,
            )
            print(f"๋ฐ์ดํ„ฐ์…‹ {self.dataset_name} ์ƒ์„ฑ ์™„๋ฃŒ")
        except Exception as e:
            print(f"๋ฐ์ดํ„ฐ์…‹ ์ƒ์„ฑ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")

    def add_document(self, text: str, metadata: Dict) -> None:
        """Embed *text* and upload its vector and metadata to the dataset.

        Mutates *metadata* in place by stamping a "timestamp" key.

        Raises:
            Exception: wrapping any underlying embedding/IO/upload failure.
        """
        try:
            self.documents.append(text)
            metadata["timestamp"] = str(datetime.now())
            self.metadata.append(metadata)
            vector = self.model.encode(text)
            doc_id = len(self.documents)
            vector_path = f"vectors/{doc_id}.npy"
            metadata_path = f"metadata/{doc_id}.json"
            # BUG FIX: the target directories must exist before writing the
            # temporary files, otherwise np.save/open raise FileNotFoundError.
            os.makedirs("vectors", exist_ok=True)
            os.makedirs("metadata", exist_ok=True)
            np.save(vector_path, vector)
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f)
            try:
                self.hf_api.upload_file(
                    path_or_fileobj=vector_path,
                    path_in_repo=vector_path,
                    repo_id=self.dataset_name,
                    repo_type="dataset"
                )
                self.hf_api.upload_file(
                    path_or_fileobj=metadata_path,
                    path_in_repo=metadata_path,
                    repo_id=self.dataset_name,
                    repo_type="dataset"
                )
            finally:
                # BUG FIX: remove the temp files even when the upload raises,
                # so failed uploads don't leak files on disk.
                for path in (vector_path, metadata_path):
                    if os.path.exists(path):
                        os.remove(path)
        except Exception as e:
            raise Exception(f"๋ฌธ์„œ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return the *top_k* stored documents most similar to *query*.

        Each result dict carries filename, total_pages, summary, timestamp
        and a float cosine "similarity" score, sorted descending.

        Raises:
            Exception: wrapping any underlying download/compute failure.
        """
        try:
            query_vector = np.asarray(self.model.encode(query), dtype=np.float64)
            files = self.hf_api.list_repo_files(
                repo_id=self.dataset_name,
                repo_type="dataset"
            )

            def _doc_id(name: str) -> int:
                # Files are named vectors/<n>.npy / metadata/<n>.json;
                # sort numerically so 10 follows 2 (lexicographic would not).
                return int(Path(name).stem)

            vector_files = sorted(
                (f for f in files if f.startswith("vectors/")), key=_doc_id)
            metadata_files = sorted(
                (f for f in files if f.startswith("metadata/")), key=_doc_id)
            if not vector_files or not metadata_files:
                return []

            vectors = []
            metadata = []
            for vector_file, metadata_file in zip(vector_files, metadata_files):
                # BUG FIX: HfApi has no download_file method. hf_hub_download
                # fetches the file into the local cache and returns its path,
                # which np.load / open can consume.
                vec_path = hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=vector_file,
                    repo_type="dataset"
                )
                vectors.append(np.load(vec_path))
                meta_path = hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=metadata_file,
                    repo_type="dataset"
                )
                # BUG FIX: json.load needs a file object, not a path.
                with open(meta_path, 'r', encoding='utf-8') as f:
                    metadata.append(json.load(f))

            # BUG FIX: cosine_similarity was referenced but never imported
            # (NameError). Compute cosine similarity directly with numpy.
            mat = np.vstack(vectors).astype(np.float64)
            denom = np.linalg.norm(mat, axis=1) * np.linalg.norm(query_vector)
            denom = np.where(denom == 0, 1.0, denom)  # guard zero vectors
            similarities = (mat @ query_vector) / denom

            sorted_idx = np.argsort(similarities)[::-1][:top_k]
            results = []
            for idx in sorted_idx:
                meta = metadata[idx]
                results.append({
                    "filename": meta["filename"],
                    "total_pages": meta["total_pages"],
                    "summary": meta["summary"],
                    "timestamp": meta["timestamp"],
                    "similarity": float(similarities[idx])
                })
            return results
        except Exception as e:
            raise Exception(f"๊ฒ€์ƒ‰ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")

    def _save_metadata(self) -> None:
        """Persist documents + metadata to the local JSON file."""
        try:
            Path(self.metadata_path).parent.mkdir(parents=True, exist_ok=True)
            with open(self.metadata_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "documents": self.documents,
                    "metadata": self.metadata
                }, f, ensure_ascii=False, indent=2)
        except Exception as e:
            raise Exception(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ์ €์žฅ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")

    def _load_metadata(self) -> None:
        """Restore documents + metadata from the local JSON file, if present."""
        try:
            if Path(self.metadata_path).exists():
                with open(self.metadata_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                    self.documents = data["documents"]
                    self.metadata = data["metadata"]
        except Exception as e:
            raise Exception(f"๋ฉ”ํƒ€๋ฐ์ดํ„ฐ ๋กœ๋“œ ์ค‘ ์˜ค๋ฅ˜ ๋ฐœ์ƒ: {str(e)}")

    def load(self) -> None:
        """Public entry point: reload previously saved local metadata."""
        self._load_metadata()
# Create the singleton instance shared by the rest of the application.
# NOTE(review): instantiating at import time loads the SentenceTransformer
# model and touches the Hugging Face API — confirm this side effect is wanted.
vector_store = VectorStore()