Spaces:
Sleeping
Sleeping
| import os | |
| from typing import List, Dict, Any | |
| import chromadb | |
| from chromadb.config import Settings | |
| from sentence_transformers import SentenceTransformer | |
| from dotenv import load_dotenv | |
| # Load environment variables | |
| load_dotenv() | |
| class SemanticSearchEngine: | |
| def __init__(self, collection_name: str = "document_collection"): | |
| """ | |
| Inisialisasi Semantic Search Engine menggunakan ChromaDB dan Sentence-Transformers. | |
| """ | |
| # Set up Embedding Model (default to a good multilingual model if not in .env) | |
| model_name = os.getenv("EMBEDDING_MODEL", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") | |
| self.embedding_model = SentenceTransformer(model_name) | |
| # Set up ChromaDB Client (Persistent storage) | |
| db_path = os.getenv("CHROMA_DB_DIR", "data/chroma_db") | |
| self.chroma_client = chromadb.PersistentClient(path=db_path) | |
| # Get or create collection | |
| self.collection = self.chroma_client.get_or_create_collection( | |
| name=collection_name, | |
| metadata={"hnsw:space": "cosine"} # Use cosine similarity for text search | |
| ) | |
| def _get_embeddings(self, texts: List[str]) -> List[List[float]]: | |
| """Generate embeddings using SentenceTransformer""" | |
| embeddings = self.embedding_model.encode(texts, show_progress_bar=False) | |
| return embeddings.tolist() | |
| def add_documents(self, documents: List[str], metadatas: List[Dict[str, Any]], ids: List[str]): | |
| """ | |
| Menambahkan dokumen ke dalam Vector Database. | |
| """ | |
| if not documents: | |
| return | |
| embeddings = self._get_embeddings(documents) | |
| self.collection.add( | |
| embeddings=embeddings, | |
| documents=documents, | |
| metadatas=metadatas, | |
| ids=ids | |
| ) | |
| print(f"Berhasil menambahkan {len(documents)} dokumen ke ChromaDB.") | |
| def search(self, query: str, top_k: int = 5) -> Dict[str, Any]: | |
| """ | |
| Mencari dokumen yang secara semantik mirip dengan query. | |
| """ | |
| query_embedding = self._get_embeddings([query]) | |
| results = self.collection.query( | |
| query_embeddings=query_embedding, | |
| n_results=top_k, | |
| include=["documents", "metadatas", "distances"] | |
| ) | |
| return results | |
| # Example usage: | |
| # engine = SemanticSearchEngine() | |
| # engine.add_documents(["Anggaran pendidikan tahun 2024 meningkat"], [{"source": "news"}], ["doc_1"]) | |
| # results = engine.search("Berapa dana sekolah tahun depan?", top_k=1) | |