adaptive_rag / recursive_text_splitter_explained.py
lanny xu
modify reranker
0d85198
raw
history blame
16.4 kB
"""
How RecursiveCharacterTextSplitter works, explained in detail.
Shows how a long document is split into small chunks.
"""
print("=" * 80)
print("RecursiveCharacterTextSplitter ๅทฅไฝœๅŽŸ็†")
print("=" * 80)
# ============================================================================
# Part 1: Why is text splitting needed?
# ============================================================================
print("\n" + "=" * 80)
print("โ“ Part 1: ไธบไป€ไนˆ้œ€่ฆๆ–‡ๆœฌๅˆ†ๅ‰ฒ๏ผŸ")
print("=" * 80)
print("""
้—ฎ้ข˜๏ผšๅŽŸๅง‹ๆ–‡ๆกฃ้€šๅธธๅพˆ้•ฟ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ไธ€็ฏ‡็ฝ‘้กตๆ–‡็ซ ๏ผš5000 ๅญ—
ไธ€ไปฝๆŠ€ๆœฏๆ–‡ๆกฃ๏ผš10000 ๅญ—
ไธ€ๆœฌไนฆ็š„ไธ€็ซ ๏ผš20000 ๅญ—
ๅฆ‚ๆžœ็›ดๆŽฅๅฐ†ๆ•ด็ฏ‡ๆ–‡ๆกฃๅšๆˆๅ‘้‡๏ผš
โŒ ไฟกๆฏๅฏ†ๅบฆๅคชไฝŽ๏ผˆๆ— ๅ…ณไฟกๆฏๅคชๅคš๏ผ‰
โŒ ๆฃ€็ดขไธ็ฒพๅ‡†๏ผˆๆ— ๆณ•ๅฎšไฝๅˆฐๅ…ทไฝ“ๆฎต่ฝ๏ผ‰
โŒ ่ถ…ๅ‡บๆจกๅž‹้•ฟๅบฆ้™ๅˆถ๏ผˆBERT ๆœ€ๅคš 512 tokens๏ผ‰
่งฃๅ†ณๆ–นๆกˆ๏ผšๆ–‡ๆœฌๅˆ†ๅ‰ฒ๏ผˆText Splitting๏ผ‰
โœ… ๅฐ†้•ฟๆ–‡ๆกฃๅˆ‡ๆˆๅฐๅ—๏ผˆchunks๏ผ‰
โœ… ๆฏไธช chunk ็‹ฌ็ซ‹ๅปบ็ซ‹ๅ‘้‡็ดขๅผ•
โœ… ๆฃ€็ดขๆ—ถ่ฟ”ๅ›žๆœ€็›ธๅ…ณ็š„ chunks
""")
# ============================================================================
# Part 2: Your project's configuration
# ============================================================================
print("\n" + "=" * 80)
print("โš™๏ธ Part 2: ไฝ ็š„้กน็›ฎ้…็ฝฎ")
print("=" * 80)
print("""
ๅœจ config.py ไธญ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
CHUNK_SIZE = 250 # ๆฏไธชๅ—ๆœ€ๅคš 250 ไธช tokens
CHUNK_OVERLAP = 0 # ๅ—ไน‹้—ดไธ้‡ๅ 
ๅœจ document_processor.py ไธญ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=250, # ๆฏๅ— 250 tokens
chunk_overlap=0 # ๆ— ้‡ๅ 
)
""")
# ============================================================================
# Part 3: The core mechanism of RecursiveCharacterTextSplitter
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ” Part 3: RecursiveCharacterTextSplitter ๆ ธๅฟƒๆœบๅˆถ")
print("=" * 80)
print("""
"Recursive" ็š„ๅซไน‰๏ผš้€’ๅฝ’ๅผๅˆ†ๅ‰ฒ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ไธๆ˜ฏ็ฎ€ๅ•็ฒ—ๆšดๅœฐๆŒ‰ๅญ—็ฌฆๆ•ฐๅˆ‡ๅˆ†๏ผŒ่€Œๆ˜ฏๆŒ‰็…งๅˆ†้š”็ฌฆ็š„ไผ˜ๅ…ˆ็บง้€’ๅฝ’ๅˆ‡ๅˆ†๏ผš
ๅˆ†้š”็ฌฆไผ˜ๅ…ˆ็บง๏ผˆไปŽ้ซ˜ๅˆฐไฝŽ๏ผ‰๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
1. "\\n\\n" ๅŒๆข่กŒ๏ผˆๆฎต่ฝๅˆ†้š”๏ผ‰ โ† ๆœ€ไผ˜ๅ…ˆ
2. "\\n" ๅ•ๆข่กŒ๏ผˆๅฅๅญๅˆ†้š”๏ผ‰
3. " " ็ฉบๆ ผ๏ผˆ่ฏ่ฏญๅˆ†้š”๏ผ‰
4. "" ๅญ—็ฌฆ็บงๅˆซๅˆ‡ๅˆ† โ† ๆœ€ๅŽๆ‰‹ๆฎต
ๅทฅไฝœๆต็จ‹๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Step 1: ๅฐ่ฏ•็”จ "\\n\\n" ๅˆ†ๅ‰ฒ
โ†“
ๆฏๅ—้ƒฝ < 250 tokens?
โ†“ No (ๆŸๅ—ๅคชๅคง)
Step 2: ๅฏนๅคงๅ—็”จ "\\n" ๅˆ†ๅ‰ฒ
โ†“
ๆฏๅ—้ƒฝ < 250 tokens?
โ†“ No (ๆŸๅ—่ฟ˜ๅคชๅคง)
Step 3: ๅฏนๅคงๅ—็”จ " " ๅˆ†ๅ‰ฒ
โ†“
ๆฏๅ—้ƒฝ < 250 tokens?
โ†“ No (ๆŸๅ—ไปๅคชๅคง)
Step 4: ๅผบๅˆถๆŒ‰ๅญ—็ฌฆๅˆ‡ๅˆ†
โ†“
ไฟ่ฏๆฏๅ— <= 250 tokens โœ“
""")
# ============================================================================
# Part 4: Practical example - manually simulating the splitting process
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ’ก Part 4: ๅฎž้™…็คบไพ‹ - ๆ‰‹ๅŠจๆจกๆ‹Ÿๅˆ†ๅ‰ฒ่ฟ‡็จ‹")
print("=" * 80)
# Sample document
document = """ไบบๅทฅๆ™บ่ƒฝ็ฎ€ไป‹
ไบบๅทฅๆ™บ่ƒฝ๏ผˆAI๏ผ‰ๆ˜ฏ่ฎก็ฎ—ๆœบ็ง‘ๅญฆ็š„ไธ€ไธชๅˆ†ๆ”ฏใ€‚ๅฎƒ่‡ดๅŠ›ไบŽๅˆ›ๅปบ่ƒฝๅคŸๆ‰ง่กŒ้€šๅธธ้œ€่ฆไบบ็ฑปๆ™บ่ƒฝ็š„ไปปๅŠก็š„็ณป็ปŸใ€‚
ๆœบๅ™จๅญฆไน ๆ˜ฏไบบๅทฅๆ™บ่ƒฝ็š„ไธ€ไธชๅญ้ข†ๅŸŸใ€‚ๅฎƒไฝฟ่ฎก็ฎ—ๆœบ่ƒฝๅคŸไปŽๆ•ฐๆฎไธญๅญฆไน ๅนถๆ”น่ฟ›ๅ…ถๆ€ง่ƒฝใ€‚ๆทฑๅบฆๅญฆไน ๆ˜ฏๆœบๅ™จๅญฆไน ็š„ไธ€็งๆ–นๆณ•๏ผŒไฝฟ็”จๅคšๅฑ‚็ฅž็ป็ฝ‘็ปœใ€‚
่‡ช็„ถ่ฏญ่จ€ๅค„็†๏ผˆNLP๏ผ‰ๆ˜ฏๅฆไธ€ไธช้‡่ฆ็š„AI้ข†ๅŸŸใ€‚ๅฎƒๅค„็†่ฎก็ฎ—ๆœบไธŽไบบ็ฑป่ฏญ่จ€ไน‹้—ด็š„ไบคไบ’ใ€‚"""
print(f"\nๅŽŸๅง‹ๆ–‡ๆกฃ๏ผš")
print("โ”€" * 80)
print(document)
print("โ”€" * 80)
print(f"ๆ–‡ๆกฃ้•ฟๅบฆ๏ผš{len(document)} ๅญ—็ฌฆ")
# Simulate how RecursiveCharacterTextSplitter works
def count_tokens(text, tokens_per_char=0.7):
    """Return a rough token-count estimate for ``text``.

    This is a simplified stand-in for a real tokenizer (the original note
    says tiktoken is what is actually used): it scales the character count
    by a fixed ratio and truncates the result to an integer.

    Args:
        text: The string to measure.
        tokens_per_char: Estimated number of tokens per character. The
            default of 0.7 is the ratio that used to be hard-coded here.
            NOTE(review): the original comments quoted roughly 1.5 tokens
            per Chinese character and 1 token per English word, which the
            0.7 default does not reflect - confirm which figure is intended.

    Returns:
        ``int(len(text) * tokens_per_char)``; 0 for an empty string.
    """
    return int(len(text) * tokens_per_char)
print(f"\nไผฐ็ฎ— tokens ๆ•ฐ๏ผš{count_tokens(document)} tokens")
# Step 1: try splitting on double newlines (paragraph boundaries) first.
heavy_rule = "โ”" * 80
print("\n" + heavy_rule)
print("Step 1: ๆŒ‰ '\\n\\n' (ๆฎต่ฝ) ๅˆ†ๅ‰ฒ")
print(heavy_rule)

paragraphs = document.split('\n\n')
print(f"\nๅˆ†ๅ‰ฒๆˆ {len(paragraphs)} ไธชๆฎต่ฝ๏ผš\n")

# Report each paragraph's estimated size against the 250-token limit.
for i, para in enumerate(paragraphs, start=1):
    token_count = count_tokens(para)
    if token_count > 250:
        status = "โŒ ่ถ…ๅ‡บ้™ๅˆถ"
    else:
        status = "โœ…"
    preview = para[:60]  # only the first 60 characters are shown
    print(f"ๆฎต่ฝ {i}: {token_count} tokens {status}")
    print(f" ๅ†…ๅฎน: {preview}...")
    # NOTE(review): assumed to be part of the loop body (one blank line
    # after every paragraph) - confirm against the intended layout.
    print()
# ๅ‡่ฎพๆŸไธชๆฎต่ฝ่ถ…ๅ‡บ้™ๅˆถ
large_para = """ๆœบๅ™จๅญฆไน ๆ˜ฏไบบๅทฅๆ™บ่ƒฝ็š„ไธ€ไธชๅญ้ข†ๅŸŸใ€‚ๅฎƒไฝฟ่ฎก็ฎ—ๆœบ่ƒฝๅคŸไปŽๆ•ฐๆฎไธญๅญฆไน ๅนถๆ”น่ฟ›ๅ…ถๆ€ง่ƒฝใ€‚ๆทฑๅบฆๅญฆไน ๆ˜ฏๆœบๅ™จๅญฆไน ็š„ไธ€็งๆ–นๆณ•๏ผŒไฝฟ็”จๅคšๅฑ‚็ฅž็ป็ฝ‘็ปœใ€‚"""
# Step 2 of the demo: when the paragraph is over the 250-token limit, break
# it into sentences and report the estimated size of each one.
# NOTE(review): the short sample paragraph looks unlikely to exceed 250
# estimated tokens, so this branch may never run - confirm whether a longer
# sample or a smaller demo limit was intended.
if count_tokens(large_para) > 250:
    print("โ”" * 80)
    # NOTE(review): the banner below mentions '\n', but the split that follows
    # uses the sentence terminator instead - confirm which one is intended.
    print("Step 2: ๆฎต่ฝๅคชๅคง๏ผŒๆŒ‰ '\\n' (ๅฅๅญ) ๅˆ†ๅ‰ฒ")
    print("โ”" * 80)
    # Splitting on the terminator leaves an empty piece after the final
    # sentence; drop blank fragments first so that the reported count and the
    # numbering match the sentences that are actually printed.
    sentences = [piece for piece in large_para.split('ใ€‚') if piece.strip()]
    print(f"\nๅˆ†ๅ‰ฒๆˆ {len(sentences)} ไธชๅฅๅญ๏ผš\n")
    for i, sent in enumerate(sentences, 1):
        token_count = count_tokens(sent)
        status = "โœ…" if token_count <= 250 else "โŒ"
        print(f"ๅฅๅญ {i}: {token_count} tokens {status}")
        print(f" ๅ†…ๅฎน: {sent.strip()}")
        # NOTE(review): assumed to be part of the loop body (one blank line
        # after every sentence) - confirm against the intended layout.
        print()
# ============================================================================
# Part 5: The role of chunk_overlap
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ”„ Part 5: chunk_overlap๏ผˆๅ—้‡ๅ ๏ผ‰็š„ไฝœ็”จ")
print("=" * 80)
print("""
ไฝ ็š„้กน็›ฎ่ฎพ็ฝฎ๏ผšCHUNK_OVERLAP = 0๏ผˆๆ— ้‡ๅ ๏ผ‰
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๆ— ้‡ๅ ็š„ๅˆ‡ๅˆ†๏ผš
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Chunk 1 โ”‚โ”‚ Chunk 2 โ”‚โ”‚ Chunk 3 โ”‚
โ”‚ 250 tok โ”‚โ”‚ 250 tok โ”‚โ”‚ 250 tok โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†‘
่พน็•Œๅฏ่ƒฝๅˆ‡ๆ–ญ่ฏญไน‰
ๆœ‰้‡ๅ ็š„ๅˆ‡ๅˆ†๏ผˆCHUNK_OVERLAP = 50๏ผ‰๏ผš
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Chunk 1 โ”‚
โ”‚ 250 tok โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Chunk 2 โ”‚
โ”‚ 250 tok โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Chunk 3 โ”‚
โ”‚ 250 tok โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
ไผ˜็‚น๏ผš
โœ… ไฟ็•™ไธŠไธ‹ๆ–‡่ฟž่ดฏๆ€ง
โœ… ้ฟๅ…ๅ…ณ้”ฎไฟกๆฏ่ขซๅˆ‡ๆ–ญ
โœ… ๆ้ซ˜ๆฃ€็ดขๅ‡†็กฎ็އ (+5-10%)
็ผบ็‚น๏ผš
โŒ ๅญ˜ๅ‚จ็ฉบ้—ดๅขžๅŠ  20-30%
โŒ ๅฏ่ƒฝ่ฟ”ๅ›ž้‡ๅคๅ†…ๅฎน
ไธบไป€ไนˆไฝ ็š„้กน็›ฎ่ฎพไธบ 0๏ผŸ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
โœ… ่Š‚็œๅญ˜ๅ‚จ็ฉบ้—ด
โœ… ้ฟๅ…้‡ๅค
โœ… ้…ๅˆ CrossEncoder ้‡ๆŽ’ๅทฒ็ป่ถณๅคŸๅ‡†็กฎ
ๆŽจ่่ฎพ็ฝฎ๏ผš
- CHUNK_OVERLAP = 0: ๅฟซ้€ŸๅŽŸๅž‹ใ€ๅญ˜ๅ‚จๅ—้™
- CHUNK_OVERLAP = 50: ็”Ÿไบง็Žฏๅขƒใ€้ซ˜็ฒพๅบฆ่ฆๆฑ‚ โญ
- CHUNK_OVERLAP = 100: ๅ…ณ้”ฎๅบ”็”จใ€ๅŒป็–—ๆณ•ๅพ‹็ญ‰
""")
# ============================================================================
# Part 6: The role of from_tiktoken_encoder
# ============================================================================
print("\n" + "=" * 80)
print("๐ŸŽฏ Part 6: from_tiktoken_encoder ็š„็‰นๆฎŠไน‹ๅค„")
print("=" * 80)
print("""
ไฝ ็š„ไปฃ็ ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
RecursiveCharacterTextSplitter.from_tiktoken_encoder(
chunk_size=250,
chunk_overlap=0
)
ไธไฝฟ็”จ tiktoken๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
RecursiveCharacterTextSplitter(
chunk_size=250, # โ† ่ฟ™้‡Œๆ˜ฏๅญ—็ฌฆๆ•ฐ๏ผŒไธๆ˜ฏ tokens๏ผ
chunk_overlap=0
)
ๅŒบๅˆซ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๆ™ฎ้€šๆจกๅผ๏ผšๆŒ‰ๅญ—็ฌฆๆ•ฐๅˆ‡ๅˆ†
โ”œโ”€ chunk_size=250 โ†’ 250 ไธชๅญ—็ฌฆ
โ”œโ”€ ไธญๆ–‡๏ผšๅคง็บฆ 250 ไธชๅญ— = 375 tokens๏ผˆ่ถ…ๆ ‡๏ผ๏ผ‰
โ””โ”€ ่‹ฑๆ–‡๏ผšๅคง็บฆ 50 ไธชๅ•่ฏ = 50 tokens๏ผˆๅคชๅฐ‘๏ผ๏ผ‰
tiktoken ๆจกๅผ๏ผšๆŒ‰ tokens ๅˆ‡ๅˆ† โญ
โ”œโ”€ chunk_size=250 โ†’ ็ฒพ็กฎ 250 ไธช tokens
โ”œโ”€ ไธญๆ–‡๏ผšๅคง็บฆ 166 ไธชๅญ— = 250 tokens โœ“
โ””โ”€ ่‹ฑๆ–‡๏ผšๅคง็บฆ 190 ไธชๅ•่ฏ = 250 tokens โœ“
tiktoken ๆ˜ฏไป€ไนˆ๏ผŸ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
OpenAI ็š„ tokenizer๏ผŒไธŽ GPT/BERT ็š„ๅˆ†่ฏๆ–นๅผไธ€่‡ด
ไผ˜็‚น๏ผš
โœ… ็ฒพ็กฎๆŽงๅˆถ chunk ๅคงๅฐ
โœ… ไธŽ Embedding ๆจกๅž‹็š„ token ้™ๅˆถไธ€่‡ด
โœ… ไธญ่‹ฑๆ–‡้ƒฝ่ƒฝๅ‡†็กฎๅค„็†
ไฝ ็š„้กน็›ฎไฝฟ็”จ tiktoken ๆ˜ฏๆญฃ็กฎไธ”ๆŽจ่็š„ๅšๆณ•๏ผ
""")
# ============================================================================
# Part 7: Visualization of the complete splitting workflow
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ“Š Part 7: ๅฎŒๆ•ด็š„ๅˆ†ๅ‰ฒๆต็จ‹ๅฏ่ง†ๅŒ–")
print("=" * 80)
print("""
ๅŽŸๅง‹ๆ–‡ๆกฃ (5000 tokens)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
โ”‚ ็ฌฌไธ€็ซ ๏ผšไบบๅทฅๆ™บ่ƒฝ็ฎ€ไป‹ โ”‚
โ”‚ โ”‚
โ”‚ ไบบๅทฅๆ™บ่ƒฝ๏ผˆAI๏ผ‰ๆ˜ฏ่ฎก็ฎ—ๆœบ็ง‘ๅญฆ็š„ไธ€ไธชๅˆ†ๆ”ฏ... โ”‚
โ”‚ โ”‚
โ”‚ ็ฌฌไบŒ็ซ ๏ผšๆœบๅ™จๅญฆไน  โ”‚
โ”‚ โ”‚
โ”‚ ๆœบๅ™จๅญฆไน ๆ˜ฏAI็š„ไธ€ไธชๅญ้ข†ๅŸŸ... โ”‚
โ”‚ โ”‚
โ”‚ ็ฌฌไธ‰็ซ ๏ผšๆทฑๅบฆๅญฆไน  โ”‚
โ”‚ ... โ”‚
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
โ†“
RecursiveCharacterTextSplitter
(chunk_size=250, overlap=0)
โ†“
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Chunk 1 (250 tokens)
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ ็ฌฌไธ€็ซ ๏ผšไบบๅทฅๆ™บ่ƒฝ็ฎ€ไป‹ โ”‚
โ”‚ โ”‚
โ”‚ ไบบๅทฅๆ™บ่ƒฝ๏ผˆAI๏ผ‰ๆ˜ฏ่ฎก็ฎ—ๆœบ็ง‘ๅญฆ็š„ไธ€ไธชๅˆ†ๆ”ฏใ€‚ๅฎƒ่‡ดๅŠ›... โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“ ๅญ˜ๅ…ฅๅ‘้‡ๆ•ฐๆฎๅบ“
Chunk 2 (250 tokens)
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ ไบบๅทฅๆ™บ่ƒฝๅŒ…ๆ‹ฌๅคšไธชๅญ้ข†ๅŸŸ๏ผŒๅฆ‚ๆœบๅ™จๅญฆไน ใ€... โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“ ๅญ˜ๅ…ฅๅ‘้‡ๆ•ฐๆฎๅบ“
Chunk 3 (250 tokens)
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ ็ฌฌไบŒ็ซ ๏ผšๆœบๅ™จๅญฆไน  โ”‚
โ”‚ โ”‚
โ”‚ ๆœบๅ™จๅญฆไน ๆ˜ฏAI็š„ไธ€ไธชๅญ้ข†ๅŸŸใ€‚ๅฎƒไฝฟ่ฎก็ฎ—ๆœบ่ƒฝๅคŸ... โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“ ๅญ˜ๅ…ฅๅ‘้‡ๆ•ฐๆฎๅบ“
...็ปง็ปญๅˆ†ๅ‰ฒๆˆ็บฆ 20 ไธช chunks
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๆฃ€็ดขๆ—ถ๏ผš
็”จๆˆท้—ฎ้ข˜: "ไป€ไนˆๆ˜ฏๆœบๅ™จๅญฆไน ๏ผŸ"
โ†“
ๅ‘้‡ๆฃ€็ดข Top 20 chunks
โ†“
โ”œโ”€ Chunk 3 (็›ธๅ…ณๅบฆ: 0.92) โ† ๆœ€็›ธๅ…ณ
โ”œโ”€ Chunk 4 (็›ธๅ…ณๅบฆ: 0.88)
โ”œโ”€ Chunk 1 (็›ธๅ…ณๅบฆ: 0.75)
โ””โ”€ ...
โ†“
CrossEncoder ้‡ๆŽ’ โ†’ Top 5
โ†“
่ฟ”ๅ›žๆœ€็›ธๅ…ณ็š„็‰‡ๆฎต็ป™ LLM ็”Ÿๆˆ็ญ”ๆกˆ
""")
# ============================================================================
# Part 8: Tuning recommendations for key parameters
# ============================================================================
print("\n" + "=" * 80)
print("โš™๏ธ Part 8: ๅ…ณ้”ฎๅ‚ๆ•ฐ่ฐƒไผ˜ๅปบ่ฎฎ")
print("=" * 80)
print("""
ๅ‚ๆ•ฐ้…็ฝฎๅปบ่ฎฎ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
CHUNK_SIZE๏ผˆๅ—ๅคงๅฐ๏ผ‰๏ผš
โ”œโ”€ 100-200: ็Ÿญๆ–‡ๆกฃใ€็ฒพ็กฎๆฃ€็ดข
โ”œโ”€ 250-500: ้€š็”จๅœบๆ™ฏ โญ (ไฝ ็š„้กน็›ฎ)
โ””โ”€ 500-1000: ้•ฟๆ–‡ๆกฃใ€้œ€่ฆๆ›ดๅคšไธŠไธ‹ๆ–‡
CHUNK_OVERLAP๏ผˆ้‡ๅ ๏ผ‰๏ผš
โ”œโ”€ 0: ๅฟซ้€ŸๅŽŸๅž‹ใ€ๅญ˜ๅ‚จๅ—้™ (ไฝ ็š„้กน็›ฎ)
โ”œโ”€ 50: ็”Ÿไบง็ŽฏๅขƒๆŽจ่ โญ
โ”œโ”€ 100: ้ซ˜็ฒพๅบฆ่ฆๆฑ‚
โ””โ”€ 150+: ๅ…ณ้”ฎๅบ”็”จ
ไฝ ็š„้กน็›ฎ้…็ฝฎ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
CHUNK_SIZE = 250 โœ“ ้€‚ไธญ๏ผŒ้€‚ๅˆๅคงๅคšๆ•ฐๅœบๆ™ฏ
CHUNK_OVERLAP = 0 โš ๏ธ ๅปบ่ฎฎๆ”นไธบ 50-100
ๆŽจ่ไผ˜ๅŒ–๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
CHUNK_SIZE = 400 # ๅขžๅŠ ไธŠไธ‹ๆ–‡
CHUNK_OVERLAP = 100 # ๆทปๅŠ ้‡ๅ ไฟ่ฏ่ฟž่ดฏๆ€ง
็†็”ฑ๏ผš
โœ… 400 tokens ่ถณๅคŸๅŒ…ๅซๅฎŒๆ•ด็š„ๆฎต่ฝ
โœ… 100 tokens ้‡ๅ ้ฟๅ…ๅ…ณ้”ฎไฟกๆฏ่ขซๅˆ‡ๆ–ญ
โœ… ้…ๅˆ CrossEncoder๏ผŒๅ‡†็กฎ็އๅฏๆๅ‡ 8-12%
""")
# ============================================================================
# Part 9: Summary
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ“š Part 9: ๆ ธๅฟƒ่ฆ็‚นๆ€ป็ป“")
print("=" * 80)
print("""
RecursiveCharacterTextSplitter ็š„ๅทฅไฝœๅŽŸ็†๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
1๏ธโƒฃ ้€’ๅฝ’ๅˆ†ๅ‰ฒ๏ผš
โ””โ”€ ๆŒ‰ไผ˜ๅ…ˆ็บงๅฐ่ฏ•ๅˆ†้š”็ฌฆ๏ผš\\n\\n โ†’ \\n โ†’ ็ฉบๆ ผ โ†’ ๅญ—็ฌฆ
2๏ธโƒฃ ๆ™บ่ƒฝๅˆ‡ๅˆ†๏ผš
โ””โ”€ ไฟๆŒ่ฏญไน‰ๅฎŒๆ•ดๆ€ง๏ผŒไผ˜ๅ…ˆๅœจๆฎต่ฝ/ๅฅๅญ่พน็•Œๅˆ‡ๅˆ†
3๏ธโƒฃ ็ฒพ็กฎๆŽงๅˆถ๏ผš
โ””โ”€ from_tiktoken_encoder ็กฎไฟๆฏๅ—ๆฐๅฅฝ 250 tokens
4๏ธโƒฃ ๅฏ้€‰้‡ๅ ๏ผš
โ””โ”€ CHUNK_OVERLAP ไฟ็•™ไธŠไธ‹ๆ–‡่ฟž่ดฏๆ€ง
5๏ธโƒฃ ไฝ ็š„้กน็›ฎๆต็จ‹๏ผš
ๅŽŸๅง‹ๆ–‡ๆกฃ
โ†“ RecursiveCharacterTextSplitter
250-token chunks
โ†“ HuggingFace Embeddings
ๅ‘้‡ๆ•ฐๆฎๅบ“
โ†“ ๅ‘้‡ๆฃ€็ดข (Top 20)
ๅ€™้€‰ chunks
โ†“ CrossEncoder ้‡ๆŽ’
ๆœ€็ปˆ Top 5 chunks
โ†“
ๅ–‚็ป™ LLM ็”Ÿๆˆ็ญ”ๆกˆ
ๅ…ณ้”ฎไผ˜ๅŠฟ๏ผš
โœ… ๆ™บ่ƒฝๅˆ‡ๅˆ†๏ผŒไฟๆŒ่ฏญไน‰ๅฎŒๆ•ด
โœ… ็ฒพ็กฎๆŽงๅˆถ chunk ๅคงๅฐ
โœ… ๆ”ฏๆŒไธญ่‹ฑๆ–‡ๆททๅˆๆ–‡ๆœฌ
โœ… ไธŽๅ‘้‡ๆฃ€็ดข้…ๅˆๅฎŒ็พŽ
่ฟ™ๅฐฑๆ˜ฏไธบไป€ไนˆไฝ ็š„้กน็›ฎ่ƒฝๅคŸๅ‡†็กฎๆฃ€็ดขๅ’Œๅ›ž็ญ”้—ฎ้ข˜๏ผ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
""")
print("\n" + "=" * 80)
print("โœ… ่งฃๆžๅฎŒๆˆ๏ผ็Žฐๅœจไฝ ๅบ”่ฏฅ็†่งฃไบ†ๆ–‡ๆœฌๅˆ†ๅ‰ฒ็š„ๅŽŸ็†")
print("=" * 80)
print()