Spaces:
Sleeping
Sleeping
| """ | |
| Retriever Moduli | |
| ================ | |
| Foydalanuvchi so‘rovlariga asoslanib tegishli chunk’larni semantik qidirish. | |
| Retriever embedding generator va vector store’ni bog‘laydi: | |
| 1. Foydalanuvchi so‘rovini embedding’ga aylantiradi | |
| 2. O‘xshash chunk’larni qidiradi | |
| 3. Natijalarni kontekst bilan tartiblab qaytaradi | |
| """ | |
| import numpy as np | |
| from typing import List, Dict, Optional | |
| from .embeddings import EmbeddingGenerator | |
| from .vector_store import VectorStore | |
class Retriever:
    """
    Retrieve relevant document chunks by semantic similarity.

    The retriever is the heart of the "Retrieval" component of a RAG
    system. Given a user query, it finds the best-matching chunks among
    the indexed documents by:

    1. converting the query into an embedding,
    2. searching the vector store for similar chunks, and
    3. returning the results formatted with similarity scores.

    Example:
        retriever = Retriever(embedding_generator, vector_store)
        results = retriever.retrieve("What is machine learning?", top_k=5)
    """

    def __init__(
        self,
        embedding_generator: "EmbeddingGenerator",
        vector_store: "VectorStore",
    ):
        """
        Initialize the retriever with an embedding generator and a vector store.

        Args:
            embedding_generator: Generator that produces embeddings for queries.
            vector_store: Vector database holding the indexed documents.
        """
        # Annotations are quoted (forward references) so the class can be
        # defined even where the concrete types are not imported eagerly.
        self.embedding_generator = embedding_generator
        self.vector_store = vector_store

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        min_similarity: Optional[float] = None,
    ) -> List[Dict]:
        """
        Fetch the most relevant chunks for the given query.

        Args:
            query: The user query.
            top_k: Maximum number of results.
            min_similarity: Optional minimum similarity in [0, 1]; chunks
                scoring lower are filtered out.

        Returns:
            A list of result dicts, each containing:
              - 'id': chunk ID
              - 'text': chunk text
              - 'similarity': similarity score (higher = more relevant)
              - 'metadata': extra metadata
        """
        # Step 1: convert the query into an embedding.
        query_embedding = self.embedding_generator.embed(query)

        # Step 2: search the vector store.
        results = self.vector_store.query(query_embedding, top_k=top_k)

        # Step 3: format the results. The store reports a cosine *distance*
        # (lower = more similar), so convert it to a similarity score.
        # NOTE(review): this assumes results['ids'] / 'distances' /
        # 'documents' / 'metadatas' are flat, parallel lists. ChromaDB's raw
        # collection.query nests them per-query; presumably the project's
        # VectorStore wrapper flattens them — confirm there.
        formatted_results = []
        for chunk_id, distance, document, metadata in zip(
            results['ids'],
            results['distances'],
            results['documents'],
            results['metadatas'],
        ):
            similarity = 1 - distance  # cosine distance -> similarity
            # Drop chunks below the threshold, if one was requested.
            if min_similarity is not None and similarity < min_similarity:
                continue
            formatted_results.append({
                'id': chunk_id,
                'text': document,
                'similarity': similarity,
                'metadata': metadata,
            })
        return formatted_results

    def retrieve_with_context(
        self,
        query: str,
        top_k: int = 5,
    ) -> str:
        """
        Retrieve relevant chunks and join them into a single context string.

        Convenient for passing retrieved context straight to an LLM prompt.

        Args:
            query: The user query.
            top_k: Maximum number of results.

        Returns:
            All retrieved chunks as one formatted context string.
        """
        results = self.retrieve(query, top_k=top_k)
        if not results:
            # User-facing message intentionally kept in Uzbek
            # ("No relevant context found") to preserve behavior.
            return "Tegishli kontekst topilmadi."

        # Format each chunk as a numbered block, separated by dividers.
        context_parts = [
            f"[Context {i}] (Similarity: {result['similarity']:.2f})\n"
            f"{result['text']}"
            for i, result in enumerate(results, 1)
        ]
        return "\n\n---\n\n".join(context_parts)

    def get_stats(self) -> Dict:
        """
        Statistics about the indexed documents.

        Returns:
            Dict with the total document count plus the embedding model
            name and embedding dimensionality.
        """
        return {
            'total_documents': self.vector_store.count(),
            'embedding_model': self.embedding_generator.model_name,
            'embedding_dimension': self.embedding_generator.embedding_dim,
        }
# Example usage (smoke test).
if __name__ == "__main__":
    # The full demo lives in rag_pipeline; this module only prints a pointer
    # (message intentionally in Uzbek: "Retriever module - use rag_pipeline
    # for the complete example").
    print("Retriever moduli - to‘liq namuna uchun rag_pipeline’dan ishlating")