rag-pdf-chat / rag_system /chunker.py
Mehriddin1997's picture
clean project
b8f0598
"""
Matnni Chunklarga Ajratish Moduli
================================
Overlap (ustma-ust kelish) bilan belgilangan hajmdagi matn chunklarini yaratadi.
Chunklash RAG tizimlarida juda muhim, chunki:
1. Embedding va LLM modellari token chekloviga ega
2. Kichik chunklar aniqroq semantik qidiruvni ta’minlaydi
3. Overlap chunklar chegarasida kontekst yo‘qolib ketmasligini ta’minlaydi
❗ RAG sifatining katta qismi aynan chunklash strategiyasiga bog‘liq.
"""
from typing import List, Dict
class TextChunker:
    """Split text into fixed-size character chunks that overlap.

    The strategy is a sliding window over the characters of the text:

    - every chunk holds at most ``chunk_size`` characters;
    - consecutive chunks share up to ``overlap`` characters, so context
      is not lost at chunk boundaries.

    Example:
        chunker = TextChunker(chunk_size=500, overlap=100)
        chunks = chunker.chunk(text)
    """

    def __init__(self, chunk_size: int = 500, overlap: int = 100):
        """Store and validate the chunking parameters.

        Args:
            chunk_size: Maximum number of characters per chunk
                (default 500).
            overlap: Number of characters shared by consecutive chunks
                (default 100).

        Raises:
            ValueError: If ``chunk_size`` is zero or negative, if
                ``overlap`` is negative, or if ``overlap >= chunk_size``
                (the window could then never move forward).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size musbat bo‘lishi kerak")
        if overlap < 0:
            raise ValueError("overlap manfiy bo‘lishi mumkin emas")
        if overlap >= chunk_size:
            raise ValueError("overlap chunk_size’dan kichik bo‘lishi kerak")
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _make_chunk(text: str, index: int, start: int, end: int) -> Dict:
        """Build the chunk record for ``text[start:end]``."""
        piece = text[start:end]
        return {
            'text': piece,
            'index': index,
            'start': start,
            'end': end,
            'chunk_size': len(piece),
        }

    def chunk(self, text: str) -> List[Dict]:
        """Split ``text`` into overlapping chunks with a sliding window.

        The window advances by ``chunk_size - overlap`` characters per
        step (e.g. 400 characters for the defaults 500 / 100).

        Args:
            text: The input text to split.

        Returns:
            A list of chunk dicts, empty when ``text`` is empty. Each
            dict has the keys:

            - ``text``: the chunk content
            - ``index``: 0-based position of the chunk in the list
            - ``start``: start offset in the original text (inclusive)
            - ``end``: end offset in the original text (exclusive)
            - ``chunk_size``: length of this chunk in characters
        """
        if not text:
            return []

        text_len = len(text)
        # __init__ guarantees overlap < chunk_size, so the step is >= 1.
        step = self.chunk_size - self.overlap

        chunks = []
        for index, start in enumerate(range(0, text_len, step)):
            end = min(start + self.chunk_size, text_len)
            chunks.append(self._make_chunk(text, index, start, end))
            # Once a chunk reaches the end of the text, any further
            # window would only repeat characters already emitted.
            if end >= text_len:
                break
        return chunks

    def chunk_with_sentences(self, text: str) -> List[Dict]:
        """Split ``text`` into chunks, preferring sentence boundaries.

        Works like :meth:`chunk`, but when a chunk would end in the
        middle of the text, the end is moved back to just after the last
        ``.``, ``!`` or ``?`` found in the second half of the window.
        If no such character exists there, the chunk is cut at
        ``chunk_size`` characters.

        Args:
            text: The input text to split.

        Returns:
            A list of chunk dicts with the same keys as :meth:`chunk`
            returns; empty when ``text`` is empty.
        """
        if not text:
            return []

        text_len = len(text)
        sentence_endings = '.!?'

        chunks = []
        start = 0
        index = 0
        while start < text_len:
            end = min(start + self.chunk_size, text_len)

            if end < text_len:
                # Only look in the second half of the window so chunks
                # never shrink below roughly half of chunk_size.
                floor = start + self.chunk_size // 2
                last_ending = max(
                    text.rfind(mark, floor + 1, end)
                    for mark in sentence_endings
                )
                if last_ending != -1:
                    end = last_ending + 1

            chunks.append(self._make_chunk(text, index, start, end))
            if end >= text_len:
                break

            # Step back by `overlap` to keep context, but always advance
            # by at least one character: a chunk shortened at a sentence
            # boundary can be shorter than `overlap`, which would
            # otherwise move the window backwards and loop forever.
            start = max(end - self.overlap, start + 1)
            index += 1
        return chunks

    def get_stats(self, chunks: List[Dict]) -> Dict:
        """Summarise the sizes of a list of chunks.

        Args:
            chunks: Chunk dicts as produced by :meth:`chunk` or
                :meth:`chunk_with_sentences`; each must carry a
                ``chunk_size`` key.

        Returns:
            ``{'num_chunks': 0}`` when ``chunks`` is empty. Otherwise a
            dict with ``num_chunks``, ``avg_chunk_size``,
            ``min_chunk_size``, ``max_chunk_size`` and
            ``total_characters`` (the sum of chunk sizes, so overlapping
            characters are counted once per chunk they appear in).
        """
        if not chunks:
            return {'num_chunks': 0}

        sizes = [c['chunk_size'] for c in chunks]
        total = sum(sizes)
        return {
            'num_chunks': len(chunks),
            'avg_chunk_size': total / len(sizes),
            'min_chunk_size': min(sizes),
            'max_chunk_size': max(sizes),
            'total_characters': total,
        }
# Demo run: executed only when this module is launched as a script.
if __name__ == "__main__":
    sample_text = """
Artificial intelligence (AI) is intelligence demonstrated by machines,
as opposed to natural intelligence displayed by animals including humans.
AI research has been defined as the field of study of intelligent agents,
which refers to any system that perceives its environment and takes actions
that maximize its chance of achieving its goals.
"""

    chunks = TextChunker(chunk_size=200, overlap=50).chunk(sample_text)

    print(f"{len(chunks)} ta chunk yaratildi:")
    for chunk in chunks:
        print(f"\n--- Chunk {chunk['index']} ({chunk['chunk_size']} belgi) ---")
        # Show at most the first 100 characters of each chunk as a preview.
        preview = chunk['text']
        if len(preview) > 100:
            preview = preview[:100] + "..."
        print(preview)