# NOTE(review): recovered from a Hugging Face Spaces page that was showing
# "Build error"; the page chrome and table markup have been stripped.
import json
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List

import numpy as np
from huggingface_hub import HfApi, hf_hub_download
from sentence_transformers import SentenceTransformer
class VectorStore:
    """Store document embeddings and metadata in a Hugging Face dataset repo.

    Each added document is embedded with a SentenceTransformer model; the
    vector (``vectors/<n>.npy``) and its metadata (``metadata/<n>.json``) are
    uploaded as individual files to the dataset repository.
    """

    def __init__(self) -> None:
        self.documents: List[str] = []
        self.metadata: List[Dict] = []  # per-document metadata, parallel to self.documents
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.hf_api = HfApi()
        self.dataset_name = "bluewhale2025/parseai_202506"  # Hugging Face dataset repo id
        # BUG FIX: _save_metadata/_load_metadata read self.metadata_path, but it
        # was never initialized, causing an AttributeError on first use.
        self.metadata_path = "metadata/store.json"
        # Create the dataset repo if it does not already exist.
        try:
            self.hf_api.create_repo(
                repo_id=self.dataset_name,
                repo_type="dataset",
                private=True,   # keep the dataset private
                exist_ok=True,  # BUG FIX: don't rely on a broad except for "already exists"
            )
            print(f"Dataset {self.dataset_name} is ready")
        except Exception as e:
            print(f"Error while creating dataset: {str(e)}")

    def add_document(self, text: str, metadata: Dict) -> None:
        """Embed *text* and upload its vector and metadata to the dataset repo.

        A ``timestamp`` key is added to *metadata* (the caller's dict is
        mutated). Raises Exception wrapping any encode/save/upload failure.
        """
        try:
            self.documents.append(text)
            metadata["timestamp"] = str(datetime.now())
            self.metadata.append(metadata)

            vector = self.model.encode(text)

            doc_id = len(self.documents)
            vector_path = f"vectors/{doc_id}.npy"
            metadata_path = f"metadata/{doc_id}.json"

            # BUG FIX: np.save/open fail when the target directories are missing.
            Path(vector_path).parent.mkdir(parents=True, exist_ok=True)
            Path(metadata_path).parent.mkdir(parents=True, exist_ok=True)

            # Write local temp files, then upload them to the dataset repo.
            np.save(vector_path, vector)
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(metadata, f, ensure_ascii=False)

            try:
                for local_path in (vector_path, metadata_path):
                    self.hf_api.upload_file(
                        path_or_fileobj=local_path,
                        path_in_repo=local_path,
                        repo_id=self.dataset_name,
                        repo_type="dataset",
                    )
            finally:
                # BUG FIX: temp files used to leak when upload raised.
                for local_path in (vector_path, metadata_path):
                    if os.path.exists(local_path):
                        os.remove(local_path)
        except Exception as e:
            raise Exception(f"Error while storing document: {str(e)}") from e

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to *top_k* stored documents ranked by cosine similarity.

        Downloads every vector/metadata pair from the dataset repo, scores
        them against the embedded *query*, and returns result dicts with
        filename / total_pages / summary / timestamp / similarity.
        """
        try:
            query_vector = self.model.encode(query)

            files = self.hf_api.list_repo_files(
                repo_id=self.dataset_name,
                repo_type="dataset",
            )
            # Lexicographic sort keeps the two lists aligned: both file sets
            # share the same numeric stems, so zip pairs them correctly.
            vector_files = sorted(f for f in files if f.startswith("vectors/"))
            metadata_files = sorted(f for f in files if f.startswith("metadata/"))
            if not vector_files or not metadata_files:
                return []

            vectors = []
            metadata = []
            for vector_file, metadata_file in zip(vector_files, metadata_files):
                # BUG FIX: HfApi has no download_file() method; hf_hub_download
                # fetches the file and returns a local path.
                local_vec = hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=vector_file,
                    repo_type="dataset",
                )
                vectors.append(np.load(local_vec))
                local_meta = hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=metadata_file,
                    repo_type="dataset",
                )
                # BUG FIX: json.load needs an open file object, not a path.
                with open(local_meta, 'r', encoding='utf-8') as f:
                    metadata.append(json.load(f))

            # BUG FIX: cosine_similarity (sklearn) was never imported anywhere
            # in this file; compute cosine similarity directly with NumPy.
            matrix = np.asarray(vectors, dtype=np.float64)
            denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
            # Guard against zero-norm vectors to avoid division by zero.
            similarities = (matrix @ query_vector) / np.where(denom == 0, 1.0, denom)

            # Highest similarity first.
            sorted_idx = np.argsort(similarities)[::-1][:top_k]
            results = []
            for idx in sorted_idx:
                results.append({
                    "filename": metadata[idx]["filename"],
                    "total_pages": metadata[idx]["total_pages"],
                    "summary": metadata[idx]["summary"],
                    "timestamp": metadata[idx]["timestamp"],
                    "similarity": float(similarities[idx]),
                })
            return results
        except Exception as e:
            raise Exception(f"Error during search: {str(e)}") from e

    def _save_metadata(self) -> None:
        """Persist documents and metadata to the local JSON file."""
        try:
            Path(self.metadata_path).parent.mkdir(parents=True, exist_ok=True)
            with open(self.metadata_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "documents": self.documents,
                    "metadata": self.metadata
                }, f, ensure_ascii=False, indent=2)
        except Exception as e:
            raise Exception(f"Error while saving metadata: {str(e)}") from e

    def _load_metadata(self) -> None:
        """Load documents and metadata from the local JSON file, if present."""
        try:
            if Path(self.metadata_path).exists():
                with open(self.metadata_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.documents = data["documents"]
                self.metadata = data["metadata"]
        except Exception as e:
            raise Exception(f"Error while loading metadata: {str(e)}") from e

    def load(self) -> None:
        """Public entry point: restore previously saved metadata."""
        self._load_metadata()
# Module-level singleton instance shared by importers of this module.
# NOTE(review): instantiation at import time performs network I/O (model
# download, repo creation) — confirm this side effect is intended.
vector_store = VectorStore()