# adaptive_rag / crossencoder_document_processing_demo.py
# lanny xu
# modify reranker
# 20ae167
"""
CrossEncoder document processing explained.
Answers the question: is a Document handled as a whole, or split into sentences?
"""
print("=" * 80)
print("CrossEncoder ๅฆ‚ไฝ•ๅค„็† Document๏ผŸ")
print("=" * 80)
# ============================================================================
# Part 1: How a Document is actually processed
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ“ Part 1: Document ็š„ๅฎž้™…ๅค„็†ๆ–นๅผ")
print("=" * 80)
# Demo inputs: one short question and a multi-sentence document (four
# sentences ending in the full-width period "。", with one line break in the
# middle). The literals appear to have been mis-encoded (UTF-8 bytes decoded
# through a single-byte code page); the intended Chinese text is restored so
# the character-level tokenization demo operates on real characters.
query = "什么是人工智能？"
document = """人工智能是计算机科学的一个分支。它致力于创建智能系统。
这些系统可以执行需要人类智能的任务。人工智能包括机器学习等子领域。"""
print(f"\nๅŽŸๅง‹่พ“ๅ…ฅ๏ผš")
print(f"Query: {query}")
print(f"\nDocument (ๅŒ…ๅซๅคšไธชๅฅๅญ):")
print(f"{document}")
print("\n" + "-" * 80)
print("ๅ…ณ้”ฎ้—ฎ้ข˜๏ผšDocument ๆœ‰ๅคšไธชๅฅๅญ๏ผŒCrossEncoder ๅฆ‚ไฝ•ๅค„็†๏ผŸ")
print("-" * 80)
print("""
็ญ”ๆกˆ๏ผšCrossEncoder ๆŠŠๆ•ดไธช Document ไฝœไธบไธ€ไธชๆ•ดไฝ“ๅค„็†๏ผ
ๅ…ทไฝ“่ฟ‡็จ‹๏ผš
1. ่พ“ๅ…ฅๆ‹ผๆŽฅ๏ผš[CLS] Query [SEP] Document [SEP]
โ””โ”€ Document ็š„ๆ‰€ๆœ‰ๅฅๅญ้ƒฝๆ‹ผๆŽฅๅœจไธ€่ตท
2. ๅˆ†่ฏ๏ผšๆ•ดไธชๅบๅˆ—่ขซๅˆ‡ๅˆ†ๆˆ tokens
โ””โ”€ ไธๆ˜ฏๆŒ‰ๅฅๅญๅˆ†๏ผŒ่€Œๆ˜ฏๆ•ดไธช Document ไธ€่ตทๅˆ†่ฏ
3. ็”Ÿๆˆ embeddings๏ผš
โ””โ”€ ๆฏไธช token ไธ€ไธชๅ‘้‡๏ผˆไธๆ˜ฏๆฏไธชๅฅๅญไธ€ไธชๅ‘้‡๏ผ๏ผ‰
โ””โ”€ Document ๅฏ่ƒฝๆœ‰ 100 ไธช tokens = 100 ไธชๅ‘้‡
""")
# ============================================================================
# Part 2: Detailed token-level processing
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ”ค Part 2: Token ็บงๅˆซ็š„ๅค„็†๏ผˆๅฎž้™…ๅ‘็”Ÿ็š„ไบ‹ๆƒ…๏ผ‰")
print("=" * 80)
# Simulate the real processing pipeline
concatenated = f"[CLS] {query} [SEP] {document} [SEP]"
print(f"\nๆญฅ้ชค1๏ผšๆ‹ผๆŽฅๆˆๅ•ไธ€ๅบๅˆ—")
print(f"{'โ”€' * 40}")
print(f"{concatenated[:100]}...")
# Simplified tokenization (a real BERT tokenizer would use WordPiece)
def tokenize_chinese(text):
    """Split *text* into a simplified, character-level token list.

    Teaching stand-in for a real BERT tokenizer: the literal markers
    ``[CLS]`` and ``[SEP]`` are kept as single tokens, whitespace is
    discarded, and every other character becomes its own token.

    Args:
        text: The already-concatenated input string, e.g.
            ``"[CLS] query [SEP] document [SEP]"``.

    Returns:
        A list of token strings in input order (empty for empty input).
    """
    special_tokens = ('[CLS]', '[SEP]')
    tokens = []
    i = 0
    while i < len(text):
        # Check for a special marker starting exactly at position i.
        marker = next(
            (tok for tok in special_tokens if text.startswith(tok, i)),
            None,
        )
        if marker is not None:
            tokens.append(marker)
            i += len(marker)
        elif text[i].isspace():
            # Drop every kind of whitespace, not just ' ': a multi-line
            # document contains '\n' (and Chinese text may contain the
            # full-width space), which would otherwise be counted as tokens.
            i += 1
        else:
            tokens.append(text[i])
            i += 1
    return tokens
# Tokenize the concatenated "[CLS] query [SEP] document [SEP]" sequence and
# show the result. (The user-facing strings in this section were mis-encoded;
# the intended Chinese text is restored.)
tokens = tokenize_chinese(concatenated)
print("\n步骤2：分词（每个字/词变成 token）")
print(f"{'─' * 40}")
print(f"总共 {len(tokens)} 个 tokens")
print(f"前 30 个 tokens: {tokens[:30]}")
print("\n步骤3：每个 token 生成一个向量")
print(f"{'─' * 40}")

# The walkthrough below used to hard-code positions (first [SEP] at index 10,
# document at 11-14). Those numbers did not match what tokenize_chinese
# returns for the demo query, so positions and tokens are now read from the
# actual token list.
first_sep = tokens.index('[SEP]')  # `concatenated` always embeds "[SEP]"
doc_start = first_sep + 1
last = len(tokens) - 1


def _embedding_row(idx, note=""):
    """Return the "tokens[i] = ... → embedding[i]" line for position *idx*."""
    return f"tokens[{idx}] = {tokens[idx]!r} → embedding[{idx}] (768维向量){note}"


rows = [_embedding_row(0)]
# First two query tokens (fewer if the query is shorter than that).
rows.extend(_embedding_row(i) for i in range(1, min(3, first_sep)))
rows.append("...")
rows.append(_embedding_row(first_sep))
# First four document tokens, stopping before the closing [SEP].
rows.extend(
    _embedding_row(i, " ← Document 开始" if i == doc_start else "")
    for i in range(doc_start, min(doc_start + 4, last))
)
rows.append("...")
rows.append(_embedding_row(last))
walkthrough = "\n".join(rows)

print(f"""
Token 序列 (长度={len(tokens)}):
{walkthrough}
关键点：
✅ Document 不是一个向量！
✅ Document 的每个字/词都是一个向量！
✅ 即使 Document 有多个句子，也是连续的 token 序列
""")
# ============================================================================
# Part 3: How attention works across sentences
# ============================================================================
print("\n" + "=" * 80)
print("๐ŸŒŸ Part 3: ๆณจๆ„ๅŠ›ๆœบๅˆถ่ทจๅฅๅญๅทฅไฝœ")
print("=" * 80)
print("""
Document ๆœ‰ๅคšไธชๅฅๅญๆ—ถ็š„ๆณจๆ„ๅŠ›่ฎก็ฎ—๏ผš
ๅ‡่ฎพ Document = "ๅฅๅญ1ใ€‚ๅฅๅญ2ใ€‚ๅฅๅญ3ใ€‚"
Tokenๅบๅˆ—๏ผš
[CLS] Query่ฏ1 Query่ฏ2 [SEP] ๅฅๅญ1่ฏ1 ๅฅๅญ1่ฏ2 ใ€‚ ๅฅๅญ2่ฏ1 ๅฅๅญ2่ฏ2 ใ€‚ ๅฅๅญ3่ฏ1 [SEP]
โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘ โ†‘
t[0] t[1] t[2] t[3] t[4] t[5] t[6] t[7] t[8] t[9] t[10] t[11]
Self-Attention ่ฎก็ฎ—๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Query่ฏ1 (t[1]) ็š„ๆณจๆ„ๅŠ›๏ผš
- ๅฏไปฅๅ…ณๆณจ ๅฅๅญ1่ฏ1 (t[4]) โœ“
- ๅฏไปฅๅ…ณๆณจ ๅฅๅญ2่ฏ1 (t[7]) โœ“
- ๅฏไปฅๅ…ณๆณจ ๅฅๅญ3่ฏ1 (t[10]) โœ“
โ†’ Query ็š„่ฏๅฏไปฅ็œ‹ๅˆฐ Document ๆ‰€ๆœ‰ๅฅๅญ็š„ๆ‰€ๆœ‰่ฏ๏ผ
ๅฅๅญ1่ฏ1 (t[4]) ็š„ๆณจๆ„ๅŠ›๏ผš
- ๅฏไปฅๅ…ณๆณจ Query่ฏ1 (t[1]) โœ“
- ๅฏไปฅๅ…ณๆณจ ๅฅๅญ2่ฏ1 (t[7]) โœ“ (่ทจๅฅๅญ๏ผ)
- ๅฏไปฅๅ…ณๆณจ ๅฅๅญ3่ฏ1 (t[10]) โœ“ (่ทจๅฅๅญ๏ผ)
โ†’ Document ๅ†…็š„ไธๅŒๅฅๅญไนŸ่ƒฝไบ’็›ธ็œ‹ๅˆฐ๏ผ
่ฟ™ๅฐฑๆ˜ฏ"ๅ…จๅฑ€ๆณจๆ„ๅŠ›"(Global Attention)๏ผš
ๆฏไธช token ้ƒฝ่ƒฝ็œ‹ๅˆฐๆ•ดไธชๅบๅˆ—็š„ๆ‰€ๆœ‰ token๏ผ
""")
# ============================================================================
# Part 4: Why not split the Document into sentences?
# ============================================================================
print("\n" + "=" * 80)
print("โ“ Part 4: ไธบไป€ไนˆไธๆŠŠ Document ๆ‹†ๆˆๅคšไธชๅฅๅญ๏ผŸ")
print("=" * 80)
print("""
ๆ–นๆกˆA๏ผšๆŠŠ Document ๅฝ“ๆ•ดไฝ“๏ผˆCrossEncoder ๅฎž้™…ๅšๆณ•๏ผ‰โœ…
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
่พ“ๅ…ฅ๏ผš[CLS] Query [SEP] ๅฅๅญ1+ๅฅๅญ2+ๅฅๅญ3 [SEP]
โ†“
ๅ•ๆฌกๆŽจ็†๏ผŒๅพ—ๅˆฐไธ€ไธชๅˆ†ๆ•ฐ: 8.5
ไผ˜็‚น๏ผš
โœ… ไธ€ๆฌก่ฎก็ฎ—๏ผŒ้€Ÿๅบฆๅฟซ
โœ… ๅฅๅญไน‹้—ดๅฏไปฅไบ’็›ธๅ…ณๆณจ๏ผŒ็†่งฃไธŠไธ‹ๆ–‡
โœ… ๆ•ดไฝ“่ฏญไน‰็†่งฃๆ›ดๅฅฝ
็ผบ็‚น๏ผš
โš ๏ธ ๆœ‰้•ฟๅบฆ้™ๅˆถ๏ผˆ้€šๅธธ 512 tokens๏ผ‰
ๅฆ‚ๆžœ Document ๅคช้•ฟไผš่ขซๆˆชๆ–ญ
ๆ–นๆกˆB๏ผšๆ‹†ๆˆๅคšไธชๅฅๅญๅˆ†ๅˆซ่ฎก็ฎ—๏ผˆไธๆŽจ่๏ผ‰โŒ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
่พ“ๅ…ฅ1๏ผš[CLS] Query [SEP] ๅฅๅญ1 [SEP] โ†’ ๅˆ†ๆ•ฐ: 7.2
่พ“ๅ…ฅ2๏ผš[CLS] Query [SEP] ๅฅๅญ2 [SEP] โ†’ ๅˆ†ๆ•ฐ: 8.1
่พ“ๅ…ฅ3๏ผš[CLS] Query [SEP] ๅฅๅญ3 [SEP] โ†’ ๅˆ†ๆ•ฐ: 6.5
็„ถๅŽๅ–ๅนณๅ‡ๆˆ–ๆœ€ๅคงๅ€ผ๏ผŸ
็ผบ็‚น๏ผš
โŒ ้œ€่ฆ่ฎก็ฎ— 3 ๆฌก๏ผŒ้€Ÿๅบฆๆ…ข 3 ๅ€
โŒ ๅฅๅญไน‹้—ดๆ— ๆณ•ไบ’็›ธ็†่งฃ
โŒ ไธขๅคฑไบ†ไธŠไธ‹ๆ–‡ไฟกๆฏ
โŒ ๅฆ‚ไฝ•่šๅˆๅˆ†ๆ•ฐ๏ผŸๅนณๅ‡๏ผŸๆœ€ๅคง๏ผŸ้ƒฝไธๅฎŒ็พŽ
""")
# ============================================================================
# Part 5: Real code example
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ’ป Part 5: ๅฎž้™…ไปฃ็ ็คบไพ‹")
print("=" * 80)
print("""
ไฝฟ็”จ CrossEncoder ็š„็œŸๅฎžไปฃ็ ๏ผš
```python
from sentence_transformers import CrossEncoder
model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
query = "ไป€ไนˆๆ˜ฏไบบๅทฅๆ™บ่ƒฝ๏ผŸ"
# Document ๆœ‰ๅคšไธชๅฅๅญ
document = \"\"\"
ไบบๅทฅๆ™บ่ƒฝๆ˜ฏ่ฎก็ฎ—ๆœบ็ง‘ๅญฆ็š„ไธ€ไธชๅˆ†ๆ”ฏใ€‚
ๅฎƒ่‡ดๅŠ›ไบŽๅˆ›ๅปบๆ™บ่ƒฝ็ณป็ปŸใ€‚
่ฟ™ไบ›็ณป็ปŸๅฏไปฅๆ‰ง่กŒ้œ€่ฆไบบ็ฑปๆ™บ่ƒฝ็š„ไปปๅŠกใ€‚
\"\"\"
# ็›ดๆŽฅไผ ๅ…ฅๆ•ดไธช Document๏ผ
pairs = [[query, document]] # โ† ๆณจๆ„๏ผšๆ•ดไธช document ไฝœไธบไธ€ไธชๅญ—็ฌฆไธฒ
# ๆจกๅž‹ๅ†…้ƒจไผš่‡ชๅŠจ๏ผš
# 1. ๆ‹ผๆŽฅ๏ผš[CLS] query [SEP] document [SEP]
# 2. ๅˆ†่ฏ๏ผšๅˆ‡ๅˆ†ๆˆ tokens๏ผˆๅฏ่ƒฝๆœ‰ 50-100 ไธช๏ผ‰
# 3. ็ผ–็ ๏ผšๆฏไธช token ไธ€ไธชๅ‘้‡
# 4. ๆณจๆ„ๅŠ›๏ผšๆ‰€ๆœ‰ tokens ไบ’็›ธๅ…ณๆณจ
# 5. ่พ“ๅ‡บ๏ผšไธ€ไธชๅˆ†ๆ•ฐ
scores = model.predict(pairs)
print(f"็›ธๅ…ณๆ€งๅˆ†ๆ•ฐ: {scores[0]}") # ่พ“ๅ‡บ: 8.26
```
ๅ…ณ้”ฎ็†่งฃ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Document ไธไผš่ขซๆ‹†ๅˆ†๏ผ
Document ็š„ๆฏไธชๅญ—/่ฏ้ƒฝไผšๅ˜ๆˆไธ€ไธชๅ‘้‡๏ผ
ๆ‰€ๆœ‰ๅ‘้‡้€š่ฟ‡ๆณจๆ„ๅŠ›ๆœบๅˆถไบ’็›ธ่ฟžๆŽฅ๏ผ
ๆœ€็ปˆ่พ“ๅ‡บไธ€ไธชๆ•ดไฝ“็š„็›ธๅ…ณๆ€งๅˆ†ๆ•ฐ๏ผ
""")
# ============================================================================
# Part 6: The token limit problem
# ============================================================================
print("\n" + "=" * 80)
print("โš ๏ธ Part 6: Document ๅคช้•ฟๆ€ŽไนˆๅŠž๏ผŸ")
print("=" * 80)
print("""
CrossEncoder ๆœ‰้•ฟๅบฆ้™ๅˆถ๏ผˆ้€šๅธธ 512 tokens๏ผ‰
ๅฆ‚ๆžœ Document ๅคช้•ฟ๏ผˆๆฏ”ๅฆ‚ 1000 ไธชๅญ—๏ผ‰๏ผš
่งฃๅ†ณๆ–นๆกˆ1๏ผšๆˆชๆ–ญ๏ผˆๆœ€ๅธธ็”จ๏ผ‰
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๅชไฟ็•™ๅ‰ 512 tokens๏ผš
[CLS] Query [SEP] Documentๅ‰400ไธชๅญ— [SEP]
ไผ˜็‚น๏ผš็ฎ€ๅ•ๅฟซ้€Ÿ
็ผบ็‚น๏ผšๅฏ่ƒฝไธขๅคฑ้‡่ฆไฟกๆฏ
่งฃๅ†ณๆ–นๆกˆ2๏ผšๆป‘ๅŠจ็ช—ๅฃ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๅˆ†ๆˆๅคšไธช็ช—ๅฃ๏ผŒๆฏไธช็ช—ๅฃๅ•็‹ฌ่ฎก็ฎ—๏ผš
็ช—ๅฃ1: [CLS] Query [SEP] Document[0:400] [SEP] โ†’ ๅˆ†ๆ•ฐ: 7.2
็ช—ๅฃ2: [CLS] Query [SEP] Document[200:600] [SEP] โ†’ ๅˆ†ๆ•ฐ: 8.5
็ช—ๅฃ3: [CLS] Query [SEP] Document[400:800] [SEP] โ†’ ๅˆ†ๆ•ฐ: 6.8
ๅ–ๆœ€้ซ˜ๅˆ†: 8.5
ไผ˜็‚น๏ผšไธไผšไธขๅคฑไฟกๆฏ
็ผบ็‚น๏ผš่ฎก็ฎ—้‡ๅขžๅŠ 
่งฃๅ†ณๆ–นๆกˆ3๏ผšๅ…ˆ็”จ Bi-Encoder ็ฒ—ๆŽ’
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
1. ๆŠŠ้•ฟ Document ๆ‹†ๆˆๆฎต่ฝ
2. ็”จ Bi-Encoder ๅฟซ้€Ÿๆ‰พๅˆฐๆœ€็›ธๅ…ณ็š„ 1-2 ไธชๆฎต่ฝ
3. ๅชๅฏน่ฟ™ไบ›ๆฎต่ฝ็”จ CrossEncoder ้‡ๆŽ’
ไผ˜็‚น๏ผš้€Ÿๅบฆๅฟซ๏ผŒๅ‡†็กฎ็އ้ซ˜
็ผบ็‚น๏ผšไธค้˜ถๆฎตๅค„็†
ไฝ ็š„้กน็›ฎไฝฟ็”จ็š„ๆ˜ฏๆ–นๆกˆ1๏ผˆๆˆชๆ–ญ๏ผ‰๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๅœจ reranker.py ไธญ๏ผš
CrossEncoderReranker(max_length=512) โ† ่ถ…่ฟ‡ 512 ไผš่‡ชๅŠจๆˆชๆ–ญ
""")
# ============================================================================
# Part 7: Visual summary
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ“Š Part 7: ๅฏ่ง†ๅŒ–ๆ€ป็ป“")
print("=" * 80)
print("""
Document ๅค„็†็š„ๅฎŒๆ•ดๆต็จ‹๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
่พ“ๅ…ฅ Document (ๅคšๅฅๅญ):
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ "ไบบๅทฅๆ™บ่ƒฝๆ˜ฏ่ฎก็ฎ—ๆœบ็ง‘ๅญฆ็š„ไธ€ไธชๅˆ†ๆ”ฏใ€‚ๅฎƒ่‡ดๅŠ›ไบŽๅˆ›ๅปบๆ™บ่ƒฝ็ณป็ปŸใ€‚" โ”‚
โ”‚ ๅฅๅญ1 ๅฅๅญ2 โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“
ๆ‹ผๆŽฅๆˆๅ•ไธ€ๅบๅˆ—
โ†“
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ [CLS] ไป€ไนˆๆ˜ฏไบบๅทฅๆ™บ่ƒฝ๏ผŸ [SEP] ไบบๅทฅๆ™บ่ƒฝๆ˜ฏ...ๆ™บ่ƒฝ็ณป็ปŸใ€‚ [SEP] โ”‚
โ”‚ ็‰นๆฎŠ Query tokens ๅˆ†้š” Document tokens ็ป“ๆŸ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“
ๅˆ†่ฏ (Tokenization)
โ†“
โ”Œโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”
โ”‚[CLS]โ”‚ ไป€ โ”‚ ไนˆ โ”‚[SEP]โ”‚ ไบบ โ”‚ ๅทฅ โ”‚ ...โ”‚ ็ปŸ โ”‚ ใ€‚ โ”‚[SEP]โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”˜
โ†“
ๆฏไธช token โ†’ ไธ€ไธช 768็ปดๅ‘้‡
โ†“
โ”Œโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”ฌโ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Vโ‚€ โ”‚ Vโ‚ โ”‚ Vโ‚‚ โ”‚ Vโ‚ƒ โ”‚ Vโ‚„ โ”‚ Vโ‚… โ”‚ ... โ”‚ Vโ‚™โ‚‹โ‚‚โ”‚ Vโ‚™โ‚‹โ‚โ”‚ Vโ‚™ โ”‚
โ”‚768็ปดโ”‚768็ปดโ”‚768็ปดโ”‚768็ปดโ”‚768็ปดโ”‚768็ปดโ”‚ ... โ”‚768็ปดโ”‚768็ปดโ”‚768็ปดโ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”ดโ”€โ”€โ”€โ”€โ”€โ”˜
โ†“
Self-Attention (12 ๅฑ‚)
ๆฏไธชๅ‘้‡้ƒฝ่ƒฝ"็œ‹ๅˆฐ"ๆ‰€ๆœ‰ๅ…ถไป–ๅ‘้‡
โ†“
โ”Œโ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”
โ”‚ Vโ‚€' (ๆ›ดๆ–ฐๅŽ็š„ [CLS] ๅ‘้‡) โ”‚
โ”‚ ๅŒ…ๅซไบ†ๆ•ดไธชๅบๅˆ—็š„ไฟกๆฏ โ”‚
โ””โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”˜
โ†“
ๅ…จ่ฟžๆŽฅๅฑ‚ (ๅˆ†็ฑปๅคด)
โ†“
็›ธๅ…ณๆ€งๅˆ†ๆ•ฐ
8.26
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
ๅ…ณ้”ฎ็‚นๆ€ป็ป“๏ผš
1. Document ๆ•ดไฝ“ๅค„็† โœ“
โ””โ”€ ไธๆ˜ฏไธ€ไธชๅ‘้‡๏ผŒๆ˜ฏๅพˆๅคšๅ‘้‡็š„ๅบๅˆ—
2. ๆฏไธชๅญ—/่ฏไธ€ไธชๅ‘้‡ โœ“
โ””โ”€ ไธๆ˜ฏๆฏไธชๅฅๅญไธ€ไธชๅ‘้‡
3. ๅ…จๅฑ€ๆณจๆ„ๅŠ› โœ“
โ””โ”€ Query ็š„่ฏ่ƒฝ็œ‹ๅˆฐ Document ๆ‰€ๆœ‰ๅฅๅญ็š„ๆ‰€ๆœ‰่ฏ
4. ๆœ€็ปˆไธ€ไธชๅˆ†ๆ•ฐ โœ“
โ””โ”€ ไปŽ [CLS] ๅ‘้‡ๆๅ–ๅ‡บๆฅ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
""")
# ============================================================================
# Part 8: Comparison with how a Bi-Encoder processes a Document
# ============================================================================
print("\n" + "=" * 80)
print("๐Ÿ”„ Part 8: ๅฏนๆฏ” Bi-Encoder ็š„ๅค„็†ๆ–นๅผ")
print("=" * 80)
print("""
Bi-Encoder (ๅ‘้‡ๆฃ€็ดข):
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Document: "ๅฅๅญ1ใ€‚ๅฅๅญ2ใ€‚ๅฅๅญ3ใ€‚"
โ†“
Encoder (BERT)
โ†“
ๅ– [CLS] ๅ‘้‡
โ†“
ๅ•ไธชๅ‘้‡ (768็ปด) โ† Document ่ขซๅŽ‹็ผฉๆˆไธ€ไธชๅ‘้‡๏ผ
โ†“
ไธŽ Query ๅ‘้‡ๅšไฝ™ๅผฆ็›ธไผผๅบฆ
โ†“
็›ธๅ…ณๆ€งๅˆ†ๆ•ฐ
CrossEncoder (ๆทฑๅบฆ้‡ๆŽ’):
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Query + Document: "[CLS] Query [SEP] ๅฅๅญ1ใ€‚ๅฅๅญ2ใ€‚ๅฅๅญ3ใ€‚ [SEP]"
โ†“
Encoder (BERT)
โ†“
ไฟ็•™ๆ‰€ๆœ‰ token ็š„ๅ‘้‡
โ†“
ๅ‘้‡ๅบๅˆ— (n ร— 768) โ† ไฟ็•™ไบ†ๆ‰€ๆœ‰็ป†่Š‚๏ผ
โ†“
Self-Attention ่ฎฉๆ‰€ๆœ‰่ฏไบ’็›ธ็†่งฃ
โ†“
็›ธๅ…ณๆ€งๅˆ†ๆ•ฐ
ๅŒบๅˆซ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
Bi-Encoder: Document โ†’ 1 ไธชๅ‘้‡ (ไฟกๆฏๅŽ‹็ผฉ)
CrossEncoder: Document โ†’ n ไธชๅ‘้‡ (ไฟกๆฏไฟ็•™)
Bi-Encoder: Query ๅ’Œ Document ๅˆ†ๅผ€ๅค„็†
CrossEncoder: Query ๅ’Œ Document ไธ€่ตทๅค„็†
Bi-Encoder: ๅฟซ้€Ÿไฝ†ไธๅคŸๅ‡†็กฎ
CrossEncoder: ๆ…ขไฝ†้žๅธธๅ‡†็กฎ
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
""")
print("\n" + "=" * 80)
print("โœ… ๆ€ป็ป“็ญ”ๆกˆ")
print("=" * 80)
print("""
ไฝ ็š„้—ฎ้ข˜๏ผšDocument ๆ˜ฏๅšๆˆไธ€ไธช embedding๏ผŒ่ฟ˜ๆ˜ฏๆฏไธช sentence ๅšๆˆไธ€ๅ †ๅ‘้‡๏ผŸ
็ญ”ๆกˆ๏ผš้ƒฝไธๆ˜ฏ๏ผ ๐Ÿ˜Š
ๆญฃ็กฎ็†่งฃ๏ผš
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
โœ… Document ๆ•ดไฝ“ไฝœไธบ่พ“ๅ…ฅ๏ผˆไธๆ‹†ๅˆ†ๅฅๅญ๏ผ‰
โœ… ไฝ† Document ็š„ๆฏไธชๅญ—/่ฏ้ƒฝไผš็”Ÿๆˆไธ€ไธชๅ‘้‡
โœ… ไธๆ˜ฏ"ไธ€ไธช embedding"๏ผŒ่€Œๆ˜ฏ"ไธ€ไธชๅ‘้‡ๅบๅˆ—"
โœ… ไธๆ˜ฏ"ๆŒ‰ๅฅๅญๅˆ†"๏ผŒ่€Œๆ˜ฏ"ๆŒ‰ๅญ—/่ฏๅˆ†"
Document (50ไธชๅญ—) โ†’ 50 ไธชๅ‘้‡ (ๆฏไธช 768 ็ปด)
ไธๆ˜ฏ 1 ไธชๅ‘้‡
ไนŸไธๆ˜ฏ 3 ไธชๅ‘้‡(ๅฆ‚ๆžœๆœ‰3ไธชๅฅๅญ)
โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”โ”
่ฟ™ๅฐฑๆ˜ฏไธบไป€ไนˆ CrossEncoder ่ƒฝ็†่งฃ็ป†็ฒ’ๅบฆ็š„่ฏญไน‰ๅ…ณ็ณป๏ผ
""")
print("\n๐Ÿ’ก ็Žฐๅœจไฝ ็†่งฃไบ†ๅ—๏ผŸๅฆ‚ๆœ‰็–‘้—ฎ๏ผŒ่ฏท็ปง็ปญๆ้—ฎ๏ผ\n")