# rag-pdf-chat / rag_system/retriever.py
# Uploaded by Mehriddin1997 — commit b8f0598 ("clean project")
"""
Retriever Moduli
================
Foydalanuvchi so‘rovlariga asoslanib tegishli chunk’larni semantik qidirish.
Retriever embedding generator va vector store’ni bog‘laydi:
1. Foydalanuvchi so‘rovini embedding’ga aylantiradi
2. O‘xshash chunk’larni qidiradi
3. Natijalarni kontekst bilan tartiblab qaytaradi
"""
import numpy as np
from typing import List, Dict, Optional
from .embeddings import EmbeddingGenerator
from .vector_store import VectorStore
class Retriever:
    """
    Retrieve relevant document chunks based on semantic similarity.

    The retriever is the heart of the "Retrieval" component of the RAG
    system. Given a user query, it finds the most relevant chunks among
    the indexed documents by embedding the query and searching the
    vector store.

    Example:
        retriever = Retriever(embedding_generator, vector_store)
        results = retriever.retrieve("What is machine learning?", top_k=5)
    """

    def __init__(
        self,
        embedding_generator: EmbeddingGenerator,
        vector_store: VectorStore
    ):
        """
        Initialize the retriever with an embedding generator and a vector store.

        Args:
            embedding_generator: Generator that produces embeddings for queries.
            vector_store: Vector database holding the indexed documents.
        """
        self.embedding_generator = embedding_generator
        self.vector_store = vector_store

    def retrieve(
        self,
        query: str,
        top_k: int = 5,
        min_similarity: Optional[float] = None
    ) -> List[Dict]:
        """
        Fetch the most relevant chunks for the given query.

        Args:
            query: The user's query string.
            top_k: Maximum number of results to return.
            min_similarity: Optional minimum similarity score (0-1).
                Chunks scoring below this threshold are filtered out.

        Returns:
            A list of results, where each element contains:
                - 'id': chunk ID
                - 'text': chunk text
                - 'similarity': similarity score (higher = more relevant)
                - 'metadata': additional metadata
        """
        # Step 1: convert the query into an embedding.
        query_embedding = self.embedding_generator.embed(query)

        # Step 2: search the vector store.
        results = self.vector_store.query(query_embedding, top_k=top_k)

        # Step 3: format the results. The store reports cosine *distance*
        # (lower = more similar), so convert it to a similarity score.
        # NOTE(review): this assumes the store returns flat, parallel lists
        # for ids/documents/distances/metadatas — verify against VectorStore.
        formatted_results = []
        for chunk_id, document, distance, metadata in zip(
            results['ids'],
            results['documents'],
            results['distances'],
            results['metadatas']
        ):
            similarity = 1 - distance  # cosine distance -> similarity
            # Apply the optional minimum-similarity filter.
            if min_similarity is not None and similarity < min_similarity:
                continue
            formatted_results.append({
                'id': chunk_id,
                'text': document,
                'similarity': similarity,
                'metadata': metadata
            })
        return formatted_results

    def retrieve_with_context(
        self,
        query: str,
        top_k: int = 5
    ) -> str:
        """
        Retrieve relevant chunks and join them into one context string.

        This method is convenient for passing context to an LLM.

        Args:
            query: The user's query string.
            top_k: Maximum number of results.

        Returns:
            A formatted context string containing all retrieved chunks.
        """
        results = self.retrieve(query, top_k=top_k)
        if not results:
            return "Tegishli kontekst topilmadi."

        # Format the chunks as numbered, separator-delimited blocks.
        context_parts = []
        for i, result in enumerate(results, 1):
            context_parts.append(
                f"[Context {i}] (Similarity: {result['similarity']:.2f})\n"
                f"{result['text']}"
            )
        return "\n\n---\n\n".join(context_parts)

    def get_stats(self) -> Dict:
        """
        Report statistics about the indexed documents.

        Returns:
            A dict with the total document count, the embedding model
            name, and the embedding dimension.
        """
        return {
            'total_documents': self.vector_store.count(),
            'embedding_model': self.embedding_generator.model_name,
            'embedding_dimension': self.embedding_generator.embedding_dim
        }
# Example usage (smoke test) — for the full pipeline, use rag_pipeline instead.
if __name__ == "__main__":
    print("Retriever moduli - to‘liq namuna uchun rag_pipeline’dan ishlating")