Spaces:
Build error
Build error
| import json | |
| from typing import Dict, List | |
| from pathlib import Path | |
| import numpy as np | |
| from datetime import datetime | |
| from sentence_transformers import SentenceTransformer | |
| from huggingface_hub import HfApi | |
| import os | |
class VectorStore:
    """Store document texts, embeddings, and metadata, persisting each
    document's vector (.npy) and metadata (.json) to a private Hugging Face
    dataset repository.

    Embeddings are produced with the 'all-MiniLM-L6-v2' sentence-transformers
    model. Vectors are written to local temp files, uploaded to the dataset
    repo, then deleted locally.
    """

    def __init__(self):
        self.documents = []   # raw document texts kept in memory
        self.metadata = []    # per-document metadata kept in memory
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        self.hf_api = HfApi()
        self.dataset_name = "bluewhale2025/parseai_202506"  # HF dataset repo id
        # Local path used by _save_metadata/_load_metadata. The original code
        # referenced self.metadata_path without ever setting it, so those
        # methods raised AttributeError.
        self.metadata_path = "metadata/index.json"
        # Create the dataset repo if missing; exist_ok=True makes re-runs a
        # no-op instead of an error. Kept best-effort: failure is reported,
        # not fatal (matches the original try/except behavior).
        try:
            self.hf_api.create_repo(
                repo_id=self.dataset_name,
                repo_type="dataset",
                private=True,  # keep the dataset private
                exist_ok=True,
            )
            print(f"Dataset {self.dataset_name} is ready")
        except Exception as e:
            print(f"Error while creating dataset: {e}")

    def add_document(self, text: str, metadata: Dict) -> None:
        """Embed *text* and upload its vector and *metadata* to the HF dataset.

        Args:
            text: Document text to index.
            metadata: Caller-supplied metadata. A copy is stored with an added
                "timestamp" field; the caller's dict is no longer mutated.

        Raises:
            Exception: Wrapping any embedding, file, or upload error.
        """
        try:
            self.documents.append(text)
            # Copy before annotating so the caller's dict is not mutated.
            meta = dict(metadata)
            meta["timestamp"] = str(datetime.now())
            self.metadata.append(meta)
            vector = self.model.encode(text)
            os.makedirs("vectors", exist_ok=True)
            os.makedirs("metadata", exist_ok=True)
            # NOTE(review): ids restart at 1 in every new process, which can
            # silently overwrite earlier uploads in the repo — confirm intended.
            doc_id = len(self.documents)
            vector_path = f"vectors/{doc_id}.npy"
            metadata_path = f"metadata/{doc_id}.json"
            # Write temp files locally, upload them, then always clean up —
            # the original leaked the temp files if an upload raised.
            np.save(vector_path, vector)
            with open(metadata_path, 'w', encoding='utf-8') as f:
                json.dump(meta, f, ensure_ascii=False)
            try:
                self.hf_api.upload_file(
                    path_or_fileobj=vector_path,
                    path_in_repo=vector_path,
                    repo_id=self.dataset_name,
                    repo_type="dataset",
                )
                self.hf_api.upload_file(
                    path_or_fileobj=metadata_path,
                    path_in_repo=metadata_path,
                    repo_id=self.dataset_name,
                    repo_type="dataset",
                )
            finally:
                for path in (vector_path, metadata_path):
                    if os.path.exists(path):
                        os.remove(path)
        except Exception as e:
            raise Exception(f"Error while saving document: {e}") from e

    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Return up to *top_k* stored documents ranked by cosine similarity.

        Args:
            query: Free-text query to embed and compare.
            top_k: Maximum number of results to return.

        Returns:
            A list of dicts with "filename", "total_pages", "summary",
            "timestamp", and "similarity" (float), best match first; empty
            list when the repo holds no vectors.

        Raises:
            Exception: Wrapping any embedding, download, or parse error.
        """
        try:
            query_vector = np.asarray(self.model.encode(query), dtype=float)
            files = self.hf_api.list_repo_files(
                repo_id=self.dataset_name,
                repo_type="dataset",
            )

            # Sort by numeric doc id: lexicographic sort puts "10" before "2",
            # which would desynchronize the vector/metadata pairing.
            def _doc_id(path: str) -> int:
                stem = Path(path).stem
                return int(stem) if stem.isdigit() else -1

            vector_files = sorted(
                (f for f in files if f.startswith("vectors/")), key=_doc_id
            )
            metadata_files = sorted(
                (f for f in files if f.startswith("metadata/")), key=_doc_id
            )
            if not vector_files or not metadata_files:
                return []
            vectors = []
            metadata = []
            for vector_file, metadata_file in zip(vector_files, metadata_files):
                # hf_hub_download returns a local cache *path*. The original
                # called a nonexistent HfApi.download_file and passed its
                # result straight to json.load (which needs a file object).
                local_vec = self.hf_api.hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=vector_file,
                    repo_type="dataset",
                )
                vectors.append(np.load(local_vec))
                local_meta = self.hf_api.hf_hub_download(
                    repo_id=self.dataset_name,
                    filename=metadata_file,
                    repo_type="dataset",
                )
                with open(local_meta, 'r', encoding='utf-8') as f:
                    metadata.append(json.load(f))
            # Cosine similarity via NumPy — the original referenced an
            # unimported cosine_similarity and raised NameError here.
            matrix = np.asarray(vectors, dtype=float)
            denom = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vector)
            denom[denom == 0] = 1.0  # guard against zero-norm vectors
            similarities = (matrix @ query_vector) / denom
            sorted_idx = np.argsort(similarities)[::-1][:top_k]
            results = []
            for idx in sorted_idx:
                entry = metadata[idx]
                results.append({
                    "filename": entry["filename"],
                    "total_pages": entry["total_pages"],
                    "summary": entry["summary"],
                    "timestamp": entry["timestamp"],
                    "similarity": float(similarities[idx]),
                })
            return results
        except Exception as e:
            raise Exception(f"Error during search: {e}") from e

    def _save_metadata(self) -> None:
        """Write in-memory documents and metadata to self.metadata_path as JSON."""
        try:
            Path(self.metadata_path).parent.mkdir(parents=True, exist_ok=True)
            with open(self.metadata_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "documents": self.documents,
                    "metadata": self.metadata,
                }, f, ensure_ascii=False, indent=2)
        except Exception as e:
            raise Exception(f"Error while saving metadata: {e}") from e

    def _load_metadata(self) -> None:
        """Load documents and metadata from self.metadata_path, if it exists."""
        try:
            if Path(self.metadata_path).exists():
                with open(self.metadata_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.documents = data["documents"]
                self.metadata = data["metadata"]
        except Exception as e:
            raise Exception(f"Error while loading metadata: {e}") from e

    def load(self) -> None:
        """Restore previously saved documents and metadata from disk."""
        self._load_metadata()
# Module-level singleton instance shared by importers of this module.
vector_store = VectorStore()