Spaces:
Sleeping
Sleeping
| """ | |
| Matnni Chunklarga Ajratish Moduli | |
| ================================ | |
| Overlap (ustma-ust kelish) bilan belgilangan hajmdagi matn chunklarini yaratadi. | |
| Chunklash RAG tizimlarida juda muhim, chunki: | |
| 1. Embedding va LLM modellari token chekloviga ega | |
| 2. Kichik chunklar aniqroq semantik qidiruvni ta’minlaydi | |
| 3. Overlap chunklar chegarasida kontekst yo‘qolib ketmasligini ta’minlaydi | |
| ❗ RAG sifatining katta qismi aynan chunklash strategiyasiga bog‘liq. | |
| """ | |
| from typing import List, Dict | |
class TextChunker:
    """
    Split text into fixed-size character chunks that overlap each other.

    The strategy is a sliding window:
    - each chunk holds at most `chunk_size` characters
    - consecutive chunks share `overlap` characters
    - the shared region keeps context from being cut at chunk borders

    Example:
        chunker = TextChunker(chunk_size=500, overlap=100)
        chunks = chunker.chunk(text)
    """

    # Characters treated as the end of a sentence by chunk_with_sentences().
    _SENTENCE_ENDINGS = '.!?'

    def __init__(self, chunk_size: int = 500, overlap: int = 100):
        """
        Configure the chunker with a chunk size and an overlap.

        Args:
            chunk_size:
                Maximum number of characters per chunk (default 500).
            overlap:
                Number of characters shared by consecutive chunks
                (default 100, i.e. about 20% of 500).

        Raises:
            ValueError:
                - if chunk_size is zero or negative
                - if overlap is negative
                - if overlap >= chunk_size (the window could never move
                  forward, which would loop forever)
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size musbat bo‘lishi kerak")
        if overlap < 0:
            raise ValueError("overlap manfiy bo‘lishi mumkin emas")
        if overlap >= chunk_size:
            raise ValueError("overlap chunk_size’dan kichik bo‘lishi kerak")
        self.chunk_size = chunk_size
        self.overlap = overlap

    @staticmethod
    def _make_chunk(text: str, index: int, start: int, end: int) -> Dict:
        """Build the chunk record for text[start:end]."""
        piece = text[start:end]
        return {
            'text': piece,
            'index': index,
            'start': start,
            'end': end,
            'chunk_size': len(piece)
        }

    def chunk(self, text: str) -> List[Dict]:
        """
        Split text into overlapping chunks with a fixed-stride sliding window.

        The window advances by (chunk_size - overlap) characters per step,
        e.g. chunk_size=500 and overlap=100 moves 400 characters each time.

        Args:
            text:
                Input text to split.

        Returns:
            A list of chunk dicts, empty when text is empty. Each dict has:
            - text: the chunk's text
            - index: position of the chunk in the list (starts at 0)
            - start: start offset in the original text
            - end: end offset (exclusive) in the original text
            - chunk_size: length of this chunk
        """
        if not text:
            return []
        total = len(text)
        stride = self.chunk_size - self.overlap
        pieces = []
        start = 0
        while start < total:
            end = min(start + self.chunk_size, total)
            pieces.append(self._make_chunk(text, len(pieces), start, end))
            # Stop as soon as a chunk reaches the end of the text, so no
            # trailing chunk made only of overlap is produced.
            if end >= total:
                break
            start += stride
        return pieces

    def chunk_with_sentences(self, text: str) -> List[Dict]:
        """
        Split text into overlapping chunks, preferring sentence boundaries.

        Each chunk is at most chunk_size characters. When a chunk would stop
        before the end of the text, the second half of the window is searched
        backwards for '.', '!' or '?', and the chunk is cut right after the
        last one found. If none is found the chunk keeps its full size.

        The next chunk starts `overlap` characters before the end of the
        current one, but always at least one character after the current
        start, so the scan is guaranteed to terminate even when a shortened
        chunk is no longer than the overlap.

        Args:
            text:
                Input text to split.

        Returns:
            A list of chunk dicts with the same fields as chunk() returns;
            empty when text is empty.
        """
        if not text:
            return []
        total = len(text)
        pieces = []
        start = 0
        while start < total:
            end = min(start + self.chunk_size, total)
            if end < total:
                # Only positions strictly after the window's midpoint are
                # accepted, so a chunk is never cut shorter than about half
                # of chunk_size.
                floor = start + self.chunk_size // 2
                boundary = max(
                    text.rfind(mark, floor + 1, end)
                    for mark in self._SENTENCE_ENDINGS
                )
                if boundary != -1:
                    end = boundary + 1
            pieces.append(self._make_chunk(text, len(pieces), start, end))
            if end < total:
                # Step back by the overlap to carry context forward, but
                # never to (or before) the current start: without this
                # clamp a short sentence-aligned chunk combined with a large
                # overlap made the loop run forever.
                start = max(end - self.overlap, start + 1)
            else:
                start = end
        return pieces

    def get_stats(self, chunks: List[Dict]) -> Dict:
        """
        Summarise chunk sizes for a list of chunks.

        Args:
            chunks:
                Chunk dicts as produced by chunk() or chunk_with_sentences();
                only the 'chunk_size' field of each is read.

        Returns:
            {'num_chunks': 0} when chunks is empty; otherwise a dict with
            num_chunks, avg_chunk_size, min_chunk_size, max_chunk_size and
            total_characters (the sum of chunk sizes, overlap included).
        """
        if not chunks:
            return {'num_chunks': 0}
        sizes = [c['chunk_size'] for c in chunks]
        total = sum(sizes)
        return {
            'num_chunks': len(chunks),
            'avg_chunk_size': total / len(sizes),
            'min_chunk_size': min(sizes),
            'max_chunk_size': max(sizes),
            'total_characters': total
        }
# Sample run (for quick manual testing)
if __name__ == "__main__":
    # NOTE(review): the source was flattened, so the leading whitespace of
    # the lines inside this literal could not be recovered — confirm.
    sample_text = """
Artificial intelligence (AI) is intelligence demonstrated by machines,
as opposed to natural intelligence displayed by animals including humans.
AI research has been defined as the field of study of intelligent agents,
which refers to any system that perceives its environment and takes actions
that maximize its chance of achieving its goals.
"""
    chunker = TextChunker(chunk_size=200, overlap=50)
    chunks = chunker.chunk(sample_text)
    print(f"{len(chunks)} ta chunk yaratildi:")
    for piece in chunks:
        print(f"\n--- Chunk {piece['index']} ({piece['chunk_size']} belgi) ---")
        body = piece['text']
        # Long chunks are shown as a 100-character preview with an ellipsis.
        if len(body) > 100:
            print(body[:100] + "...")
        else:
            print(body)