Spaces:
Paused
Paused
lanny xu
committed on
Commit
ยท
dbd527a
1
Parent(s):
0d85198
modify reranker
Browse files
vectorization_implementation_steps.py
ADDED
|
@@ -0,0 +1,555 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ๆๅญ่ฝฌๅ้็ๅ
ทไฝๅฎ็ฐๆญฅ้ชค๏ผไปฃ็ ๅฑ้ข๏ผ
|
| 3 |
+
ๅฑ็คบ HuggingFace Embeddings ๅ
้จ็ๅฎ้
ๆไฝ
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
print("=" * 80)
|
| 7 |
+
print("ๆๅญ โ ๅ้็ๅ
ทไฝๅฎ็ฐๆญฅ้ชค")
|
| 8 |
+
print("=" * 80)
|
| 9 |
+
|
| 10 |
+
# ============================================================================
|
| 11 |
+
# ๅๅคๅทฅไฝ๏ผๆจกๆๅฎๆด็ๅ้ๅ่ฟ็จ
|
| 12 |
+
# ============================================================================
|
| 13 |
+
print("\n" + "=" * 80)
|
| 14 |
+
print("๐ง ๅๅค๏ผๅฎ่ฃ
ๅๅฏผๅ
ฅ้่ฆ็ๅบ")
|
| 15 |
+
print("=" * 80)
|
| 16 |
+
|
| 17 |
+
print("""
|
| 18 |
+
้่ฆ็ๅบ๏ผ
|
| 19 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 20 |
+
pip install transformers torch sentence-transformers
|
| 21 |
+
|
| 22 |
+
ๅฏผๅ
ฅ๏ผ
|
| 23 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 24 |
+
from transformers import AutoTokenizer, AutoModel
|
| 25 |
+
import torch
|
| 26 |
+
import numpy as np
|
| 27 |
+
""")
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# ============================================================================
|
| 31 |
+
# Step 1: ๅ ่ฝฝๆจกๅๅๅ่ฏๅจ
|
| 32 |
+
# ============================================================================
|
| 33 |
+
print("\n" + "=" * 80)
|
| 34 |
+
print("Step 1: ๅ ่ฝฝ้ข่ฎญ็ปๆจกๅๅๅ่ฏๅจ")
|
| 35 |
+
print("=" * 80)
|
| 36 |
+
|
| 37 |
+
print("""
|
| 38 |
+
ไปฃ็ ๏ผ
|
| 39 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 40 |
+
from transformers import AutoTokenizer, AutoModel
|
| 41 |
+
|
| 42 |
+
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
| 43 |
+
|
| 44 |
+
# 1. ๅ ่ฝฝๅ่ฏๅจ๏ผ่ด่ดฃๆๅญ โ ID๏ผ
|
| 45 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 46 |
+
|
| 47 |
+
# 2. ๅ ่ฝฝๆจกๅ๏ผ่ด่ดฃ ID โ ๅ้๏ผ
|
| 48 |
+
model = AutoModel.from_pretrained(model_name)
|
| 49 |
+
model.eval() # ่ฎพ็ฝฎไธบ่ฏไผฐๆจกๅผ๏ผไธ่ฎญ็ป๏ผ
|
| 50 |
+
|
| 51 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 52 |
+
|
| 53 |
+
่ฟไธคไธชไธ่ฅฟๅไปไน๏ผ
|
| 54 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 55 |
+
|
| 56 |
+
Tokenizer๏ผๅ่ฏๅจ๏ผ๏ผ
|
| 57 |
+
โโ ่ฏๆฑ่กจ๏ผvocabulary๏ผ๏ผ30,000+ ไธช่ฏ
|
| 58 |
+
โ ไพๅฆ๏ผ{"hello": 1, "world": 2, "machine": 3456, ...}
|
| 59 |
+
โโ ๅ่ฏ่งๅ๏ผๅฆไฝๅๅๆๅญ
|
| 60 |
+
|
| 61 |
+
Model๏ผๆจกๅ๏ผ๏ผ
|
| 62 |
+
โโ Embedding ๅฑ๏ผ่ฏๆฑ่กจ โ ๅๅงๅ้
|
| 63 |
+
โ 30,522 ร 384 ็็ฉ้ต๏ผๆฏไธช่ฏๅฏนๅบไธไธช 384 ็ปดๅ้๏ผ
|
| 64 |
+
โโ Transformer ๅฑ๏ผ6 ๅฑ BERT encoder
|
| 65 |
+
โ ๆฏๅฑ้ฝๆ Self-Attention + Feed Forward
|
| 66 |
+
โโ ๅๆฐ้๏ผ22M๏ผ2200ไธไธชๆฐๅญ๏ผ
|
| 67 |
+
""")
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
# ============================================================================
|
| 71 |
+
# Step 2: ๅ่ฏ๏ผTokenization๏ผ
|
| 72 |
+
# ============================================================================
|
| 73 |
+
print("\n" + "=" * 80)
|
| 74 |
+
print("Step 2: ๅ่ฏ - ๆๅญ่ฝฌไธบ Token IDs")
|
| 75 |
+
print("=" * 80)
|
| 76 |
+
|
| 77 |
+
print("""
|
| 78 |
+
่พๅ
ฅๆๆฌ๏ผ
|
| 79 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 80 |
+
text = "Machine learning is a subset of artificial intelligence"
|
| 81 |
+
|
| 82 |
+
ไปฃ็ ๏ผ
|
| 83 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 84 |
+
# ๅ่ฏๅนถ่ฝฌๆขไธบๆจกๅ่พๅ
ฅๆ ผๅผ
|
| 85 |
+
encoded_input = tokenizer(
|
| 86 |
+
text,
|
| 87 |
+
padding=True, # ๅกซๅ
ๅฐ็ธๅ้ฟๅบฆ
|
| 88 |
+
truncation=True, # ่ถ
้ฟๆชๆญ
|
| 89 |
+
max_length=512, # ๆๅคง้ฟๅบฆ
|
| 90 |
+
return_tensors='pt' # ่ฟๅ PyTorch tensor
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
print(encoded_input)
|
| 94 |
+
|
| 95 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 96 |
+
|
| 97 |
+
่พๅบ๏ผencoded_input ๅ
ๅซ๏ผ๏ผ
|
| 98 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 99 |
+
{
|
| 100 |
+
'input_ids': tensor([[
|
| 101 |
+
101, # [CLS] ็นๆฎๆ ่ฎฐ
|
| 102 |
+
3698, # "machine"
|
| 103 |
+
4083, # "learning"
|
| 104 |
+
2003, # "is"
|
| 105 |
+
1037, # "a"
|
| 106 |
+
2042, # "subset"
|
| 107 |
+
1997, # "of"
|
| 108 |
+
7976, # "artificial"
|
| 109 |
+
4454, # "intelligence"
|
| 110 |
+
102 # [SEP] ็นๆฎๆ ่ฎฐ
|
| 111 |
+
]]),
|
| 112 |
+
|
| 113 |
+
'attention_mask': tensor([[
|
| 114 |
+
1, 1, 1, 1, 1, 1, 1, 1, 1, 1 # ๆๆไฝ็ฝฎ้ฝๆๆ๏ผ1่กจ็คบๅ
ณๆณจ๏ผ0่กจ็คบๅฟฝ็ฅ๏ผ
|
| 115 |
+
]])
|
| 116 |
+
}
|
| 117 |
+
|
| 118 |
+
่ฏฆ็ป่งฃ้๏ผ
|
| 119 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 120 |
+
|
| 121 |
+
input_ids:
|
| 122 |
+
ๆฏไธชๆฐๅญๅฏนๅบไธไธช่ฏ
|
| 123 |
+
101 = [CLS]๏ผๅฅๅญๅผๅงๆ ่ฎฐ๏ผ
|
| 124 |
+
3698 = "machine"
|
| 125 |
+
102 = [SEP]๏ผๅฅๅญ็ปๆๆ ่ฎฐ๏ผ
|
| 126 |
+
|
| 127 |
+
attention_mask:
|
| 128 |
+
ๅ่ฏๆจกๅๅชไบไฝ็ฝฎๆฏ็ๅฎๅ
ๅฎน๏ผ1๏ผ๏ผๅชไบๆฏๅกซๅ
๏ผ0๏ผ
|
| 129 |
+
ไพๅฆ๏ผ[1, 1, 1, 0, 0] ่กจ็คบๅ3ไธชๆฏ็ๅฎ่ฏ๏ผๅ2ไธชๆฏๅกซๅ
|
| 130 |
+
""")
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
# ============================================================================
|
| 134 |
+
# Step 3: ้่ฟ Embedding ๅฑ่ทๅๅๅงๅ้
|
| 135 |
+
# ============================================================================
|
| 136 |
+
print("\n" + "=" * 80)
|
| 137 |
+
print("Step 3: Token IDs โ ๅๅงๅ้๏ผEmbedding ๅฑ๏ผ")
|
| 138 |
+
print("=" * 80)
|
| 139 |
+
|
| 140 |
+
print("""
|
| 141 |
+
่ฟไธๆญฅๅ็ๅจๆจกๅๅ
้จ๏ผ
|
| 142 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 143 |
+
|
| 144 |
+
input_ids = [101, 3698, 4083, 2003, ...]
|
| 145 |
+
โ
|
| 146 |
+
Embedding ่กจๆฅ่ฏข
|
| 147 |
+
โ
|
| 148 |
+
|
| 149 |
+
Embedding ่กจ๏ผ็ฎๅ๏ผ๏ผ
|
| 150 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 151 |
+
่ฟๆฏไธไธชๅทจๅคง็็ฉ้ต๏ผ30,522 ร 384
|
| 152 |
+
๏ผ30,522 ๆฏ่ฏๆฑ่กจๅคงๅฐ๏ผ384 ๆฏๅ้็ปดๅบฆ๏ผ
|
| 153 |
+
|
| 154 |
+
ID | ็ฌฌ1็ปด ็ฌฌ2็ปด ็ฌฌ3็ปด ... ็ฌฌ384็ปด
|
| 155 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 156 |
+
101 | 0.12 -0.34 0.56 ... 0.78 โ [CLS]
|
| 157 |
+
3698 | 0.23 0.45 -0.67 ... 0.89 โ "machine"
|
| 158 |
+
4083 | 0.34 -0.56 0.78 ... -0.90 โ "learning"
|
| 159 |
+
2003 | 0.45 0.67 -0.89 ... 0.12 โ "is"
|
| 160 |
+
...
|
| 161 |
+
|
| 162 |
+
ๆฅ่ฏข่ฟ็จ๏ผ็ฑปไผผๅญๅ
ธๆฅ่ฏข๏ผ๏ผ
|
| 163 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 164 |
+
ID 101 โ ๆฅ่กจ โ [0.12, -0.34, 0.56, ..., 0.78]
|
| 165 |
+
ID 3698 โ ๆฅ่กจ โ [0.23, 0.45, -0.67, ..., 0.89]
|
| 166 |
+
ID 4083 โ ๆฅ่กจ โ [0.34, -0.56, 0.78, ..., -0.90]
|
| 167 |
+
...
|
| 168 |
+
|
| 169 |
+
็ปๆ๏ผ
|
| 170 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 171 |
+
token_embeddings = [
|
| 172 |
+
[0.12, -0.34, 0.56, ..., 0.78], # [CLS]
|
| 173 |
+
[0.23, 0.45, -0.67, ..., 0.89], # "machine"
|
| 174 |
+
[0.34, -0.56, 0.78, ..., -0.90], # "learning"
|
| 175 |
+
[0.45, 0.67, -0.89, ..., 0.12], # "is"
|
| 176 |
+
...
|
| 177 |
+
]
|
| 178 |
+
ๅฝข็ถ๏ผ(10, 384) # 10 ไธช tokens๏ผๆฏไธช 384 ็ปด
|
| 179 |
+
|
| 180 |
+
โ ๏ธ ๆณจๆ๏ผ่ฟไบ่ฟไธๆฏๆ็ปๅ้๏ผ้่ฆ้่ฟ Transformer ๅค็๏ผ
|
| 181 |
+
""")
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ============================================================================
|
| 185 |
+
# Step 4: Transformer ๅค็๏ผๆ ธๅฟ๏ผ๏ผ
|
| 186 |
+
# ============================================================================
|
| 187 |
+
print("\n" + "=" * 80)
|
| 188 |
+
print("Step 4: Transformer ๅค็ - Self-Attention๏ผๆ ธๅฟๆญฅ้ชค๏ผ")
|
| 189 |
+
print("=" * 80)
|
| 190 |
+
|
| 191 |
+
print("""
|
| 192 |
+
ไปฃ็ ๏ผ
|
| 193 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 194 |
+
with torch.no_grad(): # ไธ่ฎก็ฎๆขฏๅบฆ๏ผไธ่ฎญ็ป๏ผ
|
| 195 |
+
outputs = model(**encoded_input)
|
| 196 |
+
|
| 197 |
+
# outputs.last_hidden_state ๅฐฑๆฏ Transformer ็่พๅบ
|
| 198 |
+
token_embeddings = outputs.last_hidden_state
|
| 199 |
+
print(token_embeddings.shape) # torch.Size([1, 10, 384])
|
| 200 |
+
# ๆนๆฌก tokens ็ปดๅบฆ
|
| 201 |
+
|
| 202 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 203 |
+
|
| 204 |
+
Transformer ๅ
้จๅไบไปไน๏ผ๏ผ6 ๅฑๅค็๏ผ
|
| 205 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 206 |
+
|
| 207 |
+
่พๅ
ฅ๏ผๅๅง embeddings
|
| 208 |
+
[CLS]: [0.12, -0.34, 0.56, ...]
|
| 209 |
+
machine: [0.23, 0.45, -0.67, ...]
|
| 210 |
+
learning: [0.34, -0.56, 0.78, ...]
|
| 211 |
+
is: [0.45, 0.67, -0.89, ...]
|
| 212 |
+
...
|
| 213 |
+
|
| 214 |
+
โ
|
| 215 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 216 |
+
โ Layer 1: Self-Attention โ
|
| 217 |
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 218 |
+
โ โ
|
| 219 |
+
โ ๆฏไธช่ฏ"็"ๅ
ถไปๆๆ่ฏ๏ผๆดๆฐ่ชๅทฑ็ๅ้๏ผ โ
|
| 220 |
+
โ โ
|
| 221 |
+
โ "machine" ็ๅฐ "learning" โ ็่งฃ่ฟๆฏไธไธช่ฏ็ป โ
|
| 222 |
+
โ "learning" ็ๅฐ "artificial" โ ็่งฃไธAI็ธๅ
ณ โ
|
| 223 |
+
โ "is" ็ๅฐๅๅ่ฏ โ ็่งฃๆฏ่ฟๆฅ่ฏ โ
|
| 224 |
+
โ โ
|
| 225 |
+
โ ๆดๆฐๅ็ๅ้ๅ
ๅซไบไธไธๆไฟกๆฏ โ
|
| 226 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 227 |
+
โ
|
| 228 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 229 |
+
โ Layer 2: Self-Attention โ
|
| 230 |
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 231 |
+
โ ็ปง็ปญๆทฑๅ็่งฃ... โ
|
| 232 |
+
โ "machine learning" ไฝไธบๆดไฝ็่งฃ โ
|
| 233 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 234 |
+
โ
|
| 235 |
+
... (Layer 3, 4, 5) ...
|
| 236 |
+
โ
|
| 237 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 238 |
+
โ Layer 6: Self-Attention (ๆๅไธๅฑ) โ
|
| 239 |
+
โ โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ โ
|
| 240 |
+
โ ๆฏไธช่ฏ็ๅ้็ฐๅจๅ
ๅซไบ๏ผ โ
|
| 241 |
+
โ - ่ชๅทฑ็่ฏญไน โ
|
| 242 |
+
โ - ไธไธๆไฟกๆฏ โ
|
| 243 |
+
โ - ๆดไธชๅฅๅญ็ๅซไน โ
|
| 244 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 245 |
+
โ
|
| 246 |
+
ๆ็ป่พๅบ๏ผ
|
| 247 |
+
[CLS]: [0.234, 0.567, -0.890, ...] # ๆดๆฐๅ๏ผๅ
ๅซๅ
จๅฅไฟกๆฏ
|
| 248 |
+
machine: [0.345, -0.678, 0.123, ...] # ๅ
ๅซ "learning" ็ไฟกๆฏ
|
| 249 |
+
learning: [0.456, 0.789, -0.234, ...] # ๅ
ๅซ "machine" ็ไฟกๆฏ
|
| 250 |
+
...
|
| 251 |
+
|
| 252 |
+
ๅฝข็ถ๏ผ(1, 10, 384)
|
| 253 |
+
ๆนๆฌก tokens ็ปดๅบฆ
|
| 254 |
+
""")
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ============================================================================
|
| 258 |
+
# Step 5: Mean Pooling - ๅๅนถๆไธไธชๅฅๅญๅ้
|
| 259 |
+
# ============================================================================
|
| 260 |
+
print("\n" + "=" * 80)
|
| 261 |
+
print("Step 5: Mean Pooling - ๆๅคไธช่ฏๅ้ๅๅนถๆไธไธชๅฅๅญๅ้")
|
| 262 |
+
print("=" * 80)
|
| 263 |
+
|
| 264 |
+
print("""
|
| 265 |
+
้ฎ้ข๏ผ็ฐๅจๆ 10 ไธช่ฏ๏ผๆฏไธช่ฏไธไธชๅ้
|
| 266 |
+
ๅฆไฝๅๆ 1 ไธชๅฅๅญๅ้๏ผ
|
| 267 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 268 |
+
|
| 269 |
+
ไปฃ็ ๏ผ
|
| 270 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 271 |
+
def mean_pooling(token_embeddings, attention_mask):
|
| 272 |
+
\"\"\"
|
| 273 |
+
ๅฏนๆๆ่ฏๅ้ๆฑๅนณๅ๏ผ่่ attention_mask๏ผ
|
| 274 |
+
\"\"\"
|
| 275 |
+
# token_embeddings: (1, 10, 384)
|
| 276 |
+
# attention_mask: (1, 10)
|
| 277 |
+
|
| 278 |
+
# ๆฉๅฑ mask ็็ปดๅบฆไปฅๅน้
embeddings
|
| 279 |
+
# (1, 10) โ (1, 10, 1) โ (1, 10, 384)
|
| 280 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(
|
| 281 |
+
token_embeddings.size()
|
| 282 |
+
).float()
|
| 283 |
+
|
| 284 |
+
# ๅฐ embeddings ไธ mask ็ธไน๏ผๅฟฝ็ฅๅกซๅ
้จๅ๏ผ
|
| 285 |
+
# ็ถๅๅฏนๆๆ่ฏๆฑๅ
|
| 286 |
+
sum_embeddings = torch.sum(
|
| 287 |
+
token_embeddings * input_mask_expanded,
|
| 288 |
+
dim=1 # ๅจ token ็ปดๅบฆๆฑๅ
|
| 289 |
+
)
|
| 290 |
+
|
| 291 |
+
# ่ฎก็ฎๆๆ token ็ๆฐ้
|
| 292 |
+
sum_mask = torch.clamp(
|
| 293 |
+
input_mask_expanded.sum(dim=1),
|
| 294 |
+
min=1e-9 # ้ฟๅ
้ค้ถ
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
# ๆฑๅนณๅ
|
| 298 |
+
mean_embeddings = sum_embeddings / sum_mask
|
| 299 |
+
|
| 300 |
+
return mean_embeddings
|
| 301 |
+
|
| 302 |
+
# ไฝฟ็จ
|
| 303 |
+
sentence_embedding = mean_pooling(
|
| 304 |
+
token_embeddings,
|
| 305 |
+
encoded_input['attention_mask']
|
| 306 |
+
)
|
| 307 |
+
|
| 308 |
+
print(sentence_embedding.shape) # torch.Size([1, 384])
|
| 309 |
+
# ๆนๆฌก ็ปดๅบฆ
|
| 310 |
+
|
| 311 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 312 |
+
|
| 313 |
+
ๅ
ทไฝ่ฎก็ฎ๏ผ็ฎๅ็คบไพ๏ผ๏ผ
|
| 314 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 315 |
+
|
| 316 |
+
10 ไธช่ฏๅ้๏ผๆฏไธช 384 ็ปด๏ผ
|
| 317 |
+
Token 1: [0.234, 0.567, -0.890, ..., 0.123]
|
| 318 |
+
Token 2: [0.345, -0.678, 0.123, ..., 0.234]
|
| 319 |
+
Token 3: [0.456, 0.789, -0.234, ..., 0.345]
|
| 320 |
+
...
|
| 321 |
+
Token 10: [0.567, 0.890, 0.345, ..., 0.456]
|
| 322 |
+
|
| 323 |
+
ๆฑๅนณๅ๏ผๅฏนๆฏไธ็ปดๅๅซๅนณๅ๏ผ๏ผ
|
| 324 |
+
็ฌฌ1็ปด: (0.234 + 0.345 + 0.456 + ... + 0.567) / 10 = 0.412
|
| 325 |
+
็ฌฌ2็ปด: (0.567 - 0.678 + 0.789 + ... + 0.890) / 10 = 0.523
|
| 326 |
+
็ฌฌ3็ปด: (-0.890 + 0.123 - 0.234 + ... + 0.345) / 10 = -0.089
|
| 327 |
+
...
|
| 328 |
+
็ฌฌ384็ปด: (0.123 + 0.234 + 0.345 + ... + 0.456) / 10 = 0.289
|
| 329 |
+
|
| 330 |
+
ๅฅๅญๅ้ = [0.412, 0.523, -0.089, ..., 0.289] (384็ปด)
|
| 331 |
+
""")
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
# ============================================================================
|
| 335 |
+
# Step 6: ๅฝไธๅ๏ผNormalization๏ผ
|
| 336 |
+
# ============================================================================
|
| 337 |
+
print("\n" + "=" * 80)
|
| 338 |
+
print("Step 6: L2 ๅฝไธๅ - ๅฐๅ้้ฟๅบฆ็ผฉๆพๅฐ 1")
|
| 339 |
+
print("=" * 80)
|
| 340 |
+
|
| 341 |
+
print("""
|
| 342 |
+
ไปฃ็ ๏ผ
|
| 343 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 344 |
+
import torch.nn.functional as F
|
| 345 |
+
|
| 346 |
+
# L2 ๅฝไธๅ
|
| 347 |
+
sentence_embedding = F.normalize(
|
| 348 |
+
sentence_embedding,
|
| 349 |
+
p=2, # L2 ่ๆฐ
|
| 350 |
+
dim=1 # ๅจ็นๅพ็ปดๅบฆๅฝไธๅ
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
print(sentence_embedding.shape) # torch.Size([1, 384])
|
| 354 |
+
|
| 355 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 356 |
+
|
| 357 |
+
ๅฝไธๅ็ไฝ็จ๏ผ
|
| 358 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 359 |
+
|
| 360 |
+
ๅฝไธๅๅ็ๅ้๏ผ
|
| 361 |
+
v = [0.412, 0.523, -0.089, ..., 0.289]
|
| 362 |
+
้ฟๅบฆ ||v|| = โ(0.412ยฒ + 0.523ยฒ + ... + 0.289ยฒ) = 2.37
|
| 363 |
+
|
| 364 |
+
ๅฝไธๅๅ็ๅ้๏ผ
|
| 365 |
+
v_norm = v / ||v||
|
| 366 |
+
v_norm = [0.412/2.37, 0.523/2.37, ..., 0.289/2.37]
|
| 367 |
+
= [0.174, 0.221, -0.038, ..., 0.122]
|
| 368 |
+
้ฟๅบฆ ||v_norm|| = 1 โ
|
| 369 |
+
|
| 370 |
+
ๅฅฝๅค๏ผ
|
| 371 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 372 |
+
โ
ๆๆๅ้้ฟๅบฆ็ธๅ๏ผ้ฝๆฏ1๏ผ๏ผๆนไพฟๆฏ่พ
|
| 373 |
+
โ
ไฝๅผฆ็ธไผผๅบฆ = ็น็งฏ๏ผ่ฎก็ฎๆดๅฟซ๏ผ
|
| 374 |
+
cos_sim(a, b) = aยทb / (||a|| ร ||b||)
|
| 375 |
+
ๅฆๆๅฝไธๅ: cos_sim(a, b) = aยทb โ ็ฎๅไบ๏ผ
|
| 376 |
+
|
| 377 |
+
โ
ๆถ้คๅ้้ฟๅบฆ็ๅฝฑๅ๏ผๅชๅ
ณๆณจๆนๅ
|
| 378 |
+
""")
|
| 379 |
+
|
| 380 |
+
|
| 381 |
+
# ============================================================================
|
| 382 |
+
# Step 7: ๆ็ป่พๅบ
|
| 383 |
+
# ============================================================================
|
| 384 |
+
print("\n" + "=" * 80)
|
| 385 |
+
print("Step 7: ๅพๅฐๆ็ป็ๅฅๅญๅ้")
|
| 386 |
+
print("=" * 80)
|
| 387 |
+
|
| 388 |
+
print("""
|
| 389 |
+
ๆ็ป็ปๆ๏ผ
|
| 390 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 391 |
+
|
| 392 |
+
# ่ฝฌๆขไธบ numpy ๆฐ็ป๏ผๆนไพฟไฝฟ็จ๏ผ
|
| 393 |
+
final_vector = sentence_embedding.cpu().numpy()[0]
|
| 394 |
+
|
| 395 |
+
print(final_vector.shape) # (384,)
|
| 396 |
+
print(final_vector[:5]) # ๅ5ไธชๆฐๅญ
|
| 397 |
+
# [0.174, 0.221, -0.038, 0.095, 0.312]
|
| 398 |
+
|
| 399 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 400 |
+
|
| 401 |
+
่ฟๅฐฑๆฏๆ็ป็ๅฅๅญๅ้๏ผ
|
| 402 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 403 |
+
|
| 404 |
+
่พๅ
ฅ: "Machine learning is a subset of artificial intelligence"
|
| 405 |
+
่พๅบ: [0.174, 0.221, -0.038, ..., 0.122] (384 ไธชๆฐๅญ)
|
| 406 |
+
|
| 407 |
+
่ฟไธชๅ้ๅ
ๅซไบ๏ผ
|
| 408 |
+
โ
ๆฏไธช่ฏ็่ฏญไน
|
| 409 |
+
โ
่ฏไธ่ฏไน้ด็ๅ
ณ็ณป
|
| 410 |
+
โ
ๆดไธชๅฅๅญ็ๅซไน
|
| 411 |
+
|
| 412 |
+
ๅฏไปฅ็จๆฅ๏ผ
|
| 413 |
+
โ
่ฎก็ฎไธๅ
ถไปๅฅๅญ็็ธไผผๅบฆ
|
| 414 |
+
โ
ๅญๅ
ฅๅ้ๆฐๆฎๅบ
|
| 415 |
+
โ
่ฟ่ก่ฏญไนๆ็ดข
|
| 416 |
+
""")
|
| 417 |
+
|
| 418 |
+
|
| 419 |
+
# ============================================================================
|
| 420 |
+
# ๅฎๆดไปฃ็ ๆฑๆป
|
| 421 |
+
# ============================================================================
|
| 422 |
+
print("\n" + "=" * 80)
|
| 423 |
+
print("๐ ๅฎๆดไปฃ็ ๆฑๆป๏ผๅฎ้
ๅฏ่ฟ่ก๏ผ")
|
| 424 |
+
print("=" * 80)
|
| 425 |
+
|
| 426 |
+
print("""
|
| 427 |
+
from transformers import AutoTokenizer, AutoModel
|
| 428 |
+
import numpy as np
import torch
|
| 429 |
+
import torch.nn.functional as F
|
| 430 |
+
|
| 431 |
+
def text_to_vector(text):
|
| 432 |
+
\"\"\"
|
| 433 |
+
ๅฎๆด็ๆๅญ่ฝฌๅ้ๆต็จ
|
| 434 |
+
\"\"\"
|
| 435 |
+
# Step 1: ๅ ่ฝฝๆจกๅ
|
| 436 |
+
model_name = "sentence-transformers/all-MiniLM-L6-v2"
|
| 437 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 438 |
+
model = AutoModel.from_pretrained(model_name)
|
| 439 |
+
model.eval()
|
| 440 |
+
|
| 441 |
+
# Step 2: ๅ่ฏ
|
| 442 |
+
encoded_input = tokenizer(
|
| 443 |
+
text,
|
| 444 |
+
padding=True,
|
| 445 |
+
truncation=True,
|
| 446 |
+
max_length=512,
|
| 447 |
+
return_tensors='pt'
|
| 448 |
+
)
|
| 449 |
+
|
| 450 |
+
# Step 3 & 4: ้่ฟๆจกๅ๏ผEmbedding + Transformer๏ผ
|
| 451 |
+
with torch.no_grad():
|
| 452 |
+
outputs = model(**encoded_input)
|
| 453 |
+
token_embeddings = outputs.last_hidden_state
|
| 454 |
+
|
| 455 |
+
# Step 5: Mean Pooling
|
| 456 |
+
attention_mask = encoded_input['attention_mask']
|
| 457 |
+
input_mask_expanded = attention_mask.unsqueeze(-1).expand(
|
| 458 |
+
token_embeddings.size()
|
| 459 |
+
).float()
|
| 460 |
+
|
| 461 |
+
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
|
| 462 |
+
sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
|
| 463 |
+
sentence_embedding = sum_embeddings / sum_mask
|
| 464 |
+
|
| 465 |
+
# Step 6: ๅฝไธๅ
|
| 466 |
+
sentence_embedding = F.normalize(sentence_embedding, p=2, dim=1)
|
| 467 |
+
|
| 468 |
+
# Step 7: ่ฝฌไธบ numpy
|
| 469 |
+
return sentence_embedding.cpu().numpy()[0]
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
# ไฝฟ็จ็คบไพ๏ผ
|
| 473 |
+
text = "Machine learning is a subset of artificial intelligence"
|
| 474 |
+
vector = text_to_vector(text)
|
| 475 |
+
|
| 476 |
+
print(f"่พๅ
ฅ: {text}")
|
| 477 |
+
print(f"ๅ้็ปดๅบฆ: {vector.shape}") # (384,)
|
| 478 |
+
print(f"ๅ10ไธชๆฐๅญ: {vector[:10]}")
|
| 479 |
+
print(f"ๅ้้ฟๅบฆ: {np.linalg.norm(vector)}") # ๅบ่ฏฅๆฏ 1.0
|
| 480 |
+
|
| 481 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 482 |
+
|
| 483 |
+
ไฝ ็้กน็ฎไธญ็็ฎๅ่ฐ็จ๏ผ
|
| 484 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 485 |
+
|
| 486 |
+
from langchain_community.embeddings import HuggingFaceEmbeddings
|
| 487 |
+
|
| 488 |
+
embeddings = HuggingFaceEmbeddings(
|
| 489 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2"
|
| 490 |
+
)
|
| 491 |
+
|
| 492 |
+
vector = embeddings.embed_query(text)
|
| 493 |
+
# โ ่ฟไธ่กๅ
้จๆง่กไบไธ้ขๆๆ 7 ไธชๆญฅ้ชค๏ผ
|
| 494 |
+
|
| 495 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 496 |
+
""")
|
| 497 |
+
|
| 498 |
+
|
| 499 |
+
# ============================================================================
|
| 500 |
+
# ๅ
ณ้ฎๆญฅ้ชคๆถ้ดๅๆ
|
| 501 |
+
# ============================================================================
|
| 502 |
+
print("\n" + "=" * 80)
|
| 503 |
+
print("โฑ๏ธ ๅๆญฅ้ชค่ๆถๅๆ")
|
| 504 |
+
print("=" * 80)
|
| 505 |
+
|
| 506 |
+
print("""
|
| 507 |
+
ๅ่ฎพๅค็ไธไธชๅฅๅญ๏ผ10ไธช่ฏ๏ผ๏ผ
|
| 508 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 509 |
+
|
| 510 |
+
Step 1: ๅ ่ฝฝๆจกๅ 0.5-2็ง (ๅช้ไธๆฌก๏ผๅฏๅค็จ)
|
| 511 |
+
Step 2: ๅ่ฏ <1ๆฏซ็ง (้ๅธธๅฟซ)
|
| 512 |
+
Step 3: Embedding ๆฅ่กจ <1ๆฏซ็ง (็ฉ้ต็ดขๅผ)
|
| 513 |
+
Step 4: Transformer ๅค็ 10-50ๆฏซ็ง (6ๅฑ่ฎก็ฎ๏ผๆๆ
ข)
|
| 514 |
+
Step 5: Mean Pooling <1ๆฏซ็ง (็ฎๅๅนณๅ)
|
| 515 |
+
Step 6: ๅฝไธๅ <1ๆฏซ็ง (็ฎๅ้คๆณ)
|
| 516 |
+
Step 7: ่ฝฌๆขๆ ผๅผ <1ๆฏซ็ง
|
| 517 |
+
|
| 518 |
+
ๆป่ๆถ: 10-50ๆฏซ็ง (GPU) ๆ 50-200ๆฏซ็ง (CPU)
|
| 519 |
+
|
| 520 |
+
ๆน้ๅค็๏ผ20ไธชๅฅๅญ๏ผ:
|
| 521 |
+
ๅไธชๅค็: 20 ร 50ms = 1000ms
|
| 522 |
+
ๆน้ๅค็: 100ms โ ๅฟซ10ๅ๏ผ(GPUๅนถ่ก)
|
| 523 |
+
|
| 524 |
+
่ฟๅฐฑๆฏไธบไปไน่ฆๆน้ๅ้ๅ๏ผ
|
| 525 |
+
""")
|
| 526 |
+
|
| 527 |
+
|
| 528 |
+
print("\n" + "=" * 80)
|
| 529 |
+
print("โ
ๆๅญ่ฝฌๅ้็ๅฎ็ฐๆญฅ้ชค่ฎฒ่งฃๅฎๆฏ๏ผ")
|
| 530 |
+
print("=" * 80)
|
| 531 |
+
print("""
|
| 532 |
+
ๆ ธๅฟๆญฅ้ชคๅ้กพ๏ผ
|
| 533 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 534 |
+
|
| 535 |
+
ๆๅญ
|
| 536 |
+
โ Step 1: ๅ ่ฝฝๆจกๅ
|
| 537 |
+
Tokenizer + Model
|
| 538 |
+
โ Step 2: ๅ่ฏ
|
| 539 |
+
Token IDs: [101, 3698, 4083, ...]
|
| 540 |
+
โ Step 3: Embedding ๆฅ่กจ
|
| 541 |
+
ๅๅงๅ้: [(10, 384)]
|
| 542 |
+
โ Step 4: Transformer ๅค็
|
| 543 |
+
ๆดๆฐๅ้: [(10, 384)] ๅ
ๅซไธไธๆไฟกๆฏ
|
| 544 |
+
โ Step 5: Mean Pooling
|
| 545 |
+
ๅฅๅญๅ้: [(1, 384)]
|
| 546 |
+
โ Step 6: ๅฝไธๅ
|
| 547 |
+
ๅฝไธๅๅ้: [(1, 384)] ้ฟๅบฆ=1
|
| 548 |
+
โ Step 7: ่พๅบ
|
| 549 |
+
ๆ็ปๅ้: [0.174, 0.221, ..., 0.122]
|
| 550 |
+
|
| 551 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 552 |
+
|
| 553 |
+
็ฐๅจไฝ ็ฅ้ไบๆฏไธๆญฅ็ๅ
ทไฝๆไฝ๏ผ
|
| 554 |
+
""")
|
| 555 |
+
print()
|
vectorization_process_explained.py
ADDED
|
@@ -0,0 +1,528 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
ๅ้ๅๅ Chroma ๅญๅจ่ฟ็จ่ฏฆ่งฃ
|
| 3 |
+
ไปๅๅฒๅ็ๆๆกฃๅฐๅ้ๆฐๆฎๅบ็ๅฎๆดๆต็จ
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
print("=" * 80)
|
| 7 |
+
print("ๅ้ๅๅ Chroma ๅญๅจ่ฟ็จ่ฏฆ่งฃ")
|
| 8 |
+
print("=" * 80)
|
| 9 |
+
|
| 10 |
+
# ============================================================================
|
| 11 |
+
# Part 1: ๅฎๆดๆต็จๆฆ่ง
|
| 12 |
+
# ============================================================================
|
| 13 |
+
print("\n" + "=" * 80)
|
| 14 |
+
print("๐ Part 1: ๅฎๆดๆต็จๆฆ่ง")
|
| 15 |
+
print("=" * 80)
|
| 16 |
+
|
| 17 |
+
print("""
|
| 18 |
+
ไปๆๆกฃๅๅฒๅฐๅ้ๆฐๆฎๅบ็ๅฎๆดๆต็จ๏ผ
|
| 19 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 20 |
+
|
| 21 |
+
Step 1: ๆๆกฃๅๅฒ
|
| 22 |
+
ๅๅงๆๆกฃ โ RecursiveCharacterTextSplitter โ 20 ไธช chunks
|
| 23 |
+
(5000 tokens) (ๆฏไธช 250 tokens)
|
| 24 |
+
|
| 25 |
+
Step 2: ๅ้ๅ (Embedding)
|
| 26 |
+
ๆฏไธช chunk โ HuggingFace ๆจกๅ โ ๅ้ (384็ปด)
|
| 27 |
+
"ไบบๅทฅๆบ่ฝๆฏ..." โ [0.12, -0.34, 0.56, ...]
|
| 28 |
+
|
| 29 |
+
Step 3: ๅญๅ
ฅ Chroma
|
| 30 |
+
ๅ้ + ๅๆ + ๅ
ๆฐๆฎ โ Chroma ๆฐๆฎๅบ
|
| 31 |
+
โโ ๆไน
ๅๅญๅจ
|
| 32 |
+
|
| 33 |
+
Step 4: ๆๅปบ็ดขๅผ
|
| 34 |
+
Chroma โ HNSW ็ดขๅผ โ ๅฟซ้่ฟไผผๆฃ็ดข
|
| 35 |
+
(ๅฑๆฌกๅๅพ็ปๆ)
|
| 36 |
+
|
| 37 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 38 |
+
""")
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# ============================================================================
|
| 42 |
+
# Part 2: Embedding ๆจกๅ่ฏฆ่งฃ
|
| 43 |
+
# ============================================================================
|
| 44 |
+
print("\n" + "=" * 80)
|
| 45 |
+
print("๐ค Part 2: Embedding ๆจกๅ - HuggingFaceEmbeddings")
|
| 46 |
+
print("=" * 80)
|
| 47 |
+
|
| 48 |
+
print("""
|
| 49 |
+
ไฝ ็้กน็ฎ้
็ฝฎ๏ผ
|
| 50 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 51 |
+
|
| 52 |
+
self.embeddings = HuggingFaceEmbeddings(
|
| 53 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 54 |
+
model_kwargs={'device': device}, # CPU ๆ GPU
|
| 55 |
+
encode_kwargs={'normalize_embeddings': True} # ๅฝไธๅ
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
ๆจกๅ่ฏดๆ๏ผ
|
| 59 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 60 |
+
|
| 61 |
+
ๆจกๅๅ็งฐ: all-MiniLM-L6-v2
|
| 62 |
+
โโ ็ฑปๅ: Sentence-BERT (ๅ็ผ็ ๅจ)
|
| 63 |
+
โโ ๅๆฐ้: 22M (่ฝป้็บง)
|
| 64 |
+
โโ ่พๅบ็ปดๅบฆ: 384 ็ปดๅ้
|
| 65 |
+
โโ ่ฎญ็ปๆฐๆฎ: 10ไบฟ+ ๅฅๅญๅฏน
|
| 66 |
+
โโ ็น็น: ๅฟซ้ใๅ็กฎใ้ๅ่ฏญไนๆฃ็ดข
|
| 67 |
+
|
| 68 |
+
ๅทฅไฝๅ็๏ผ
|
| 69 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 70 |
+
|
| 71 |
+
่พๅ
ฅๆๆฌ: "ไบบๅทฅๆบ่ฝๆฏ่ฎก็ฎๆบ็งๅญฆ็ไธไธชๅๆฏ"
|
| 72 |
+
โ
|
| 73 |
+
Tokenization (ๅ่ฏ)
|
| 74 |
+
โ
|
| 75 |
+
Token IDs: [101, 782, 1435, 1819, 2510, 3221, ...]
|
| 76 |
+
โ
|
| 77 |
+
BERT Encoder (6 ๅฑ Transformer)
|
| 78 |
+
โ
|
| 79 |
+
Mean Pooling (ๆๆ token ๅ้็ๅนณๅ)
|
| 80 |
+
โ
|
| 81 |
+
384 ็ปดๅ้: [0.123, -0.456, 0.789, ...]
|
| 82 |
+
โ
|
| 83 |
+
L2 ๅฝไธๅ (normalize_embeddings=True)
|
| 84 |
+
โ
|
| 85 |
+
ๆ็ปๅ้: ||v|| = 1 (ๅไฝๅ้)
|
| 86 |
+
""")
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
# ============================================================================
|
| 90 |
+
# Part 3: ๅ้ๅ่ฟ็จๅๆญฅ่งฃๆ
|
| 91 |
+
# ============================================================================
|
| 92 |
+
print("\n" + "=" * 80)
|
| 93 |
+
print("๐ Part 3: ๅ้ๅ่ฟ็จ - ้ๆญฅ่งฃๆ")
|
| 94 |
+
print("=" * 80)
|
| 95 |
+
|
| 96 |
+
print("""
|
| 97 |
+
ๅ่ฎพๆไปฌๆ 3 ไธช chunks๏ผ
|
| 98 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 99 |
+
|
| 100 |
+
Chunk 1: "ไบบๅทฅๆบ่ฝๆฏ่ฎก็ฎๆบ็งๅญฆ็ไธไธชๅๆฏใๅฎ่ดๅไบ..."
|
| 101 |
+
Chunk 2: "ๆบๅจๅญฆไน ๆฏไบบๅทฅๆบ่ฝ็ๅญ้ขๅใๅฎไฝฟ่ฎก็ฎๆบ..."
|
| 102 |
+
Chunk 3: "ๆทฑๅบฆๅญฆไน ไฝฟ็จๅคๅฑ็ฅ็ป็ฝ็ปๆฅๅค็ๅคๆ็..."
|
| 103 |
+
|
| 104 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 105 |
+
|
| 106 |
+
ๅ้ๅ่ฟ็จ๏ผๆน้ๅค็๏ผ๏ผ
|
| 107 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 108 |
+
|
| 109 |
+
embeddings.embed_documents([chunk1, chunk2, chunk3])
|
| 110 |
+
โ
|
| 111 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 112 |
+
โ HuggingFace Embedding ๆจกๅ โ
|
| 113 |
+
โ (sentence-transformers/all-MiniLM-L6-v2) โ
|
| 114 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 115 |
+
โ
|
| 116 |
+
ๅ
้จๅค็๏ผๆฏไธช chunk๏ผ๏ผ
|
| 117 |
+
โ
|
| 118 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 119 |
+
โ Step 1: Tokenization โ
|
| 120 |
+
โ "ไบบๅทฅๆบ่ฝ..." โ [101, 782, 1435, ...] โ
|
| 121 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 122 |
+
โ
|
| 123 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 124 |
+
โ Step 2: ่ฝฌๆขไธบ Token Embeddings โ
|
| 125 |
+
โ Token IDs โ ๅๅงๅ้่กจ็คบ โ
|
| 126 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 127 |
+
โ
|
| 128 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 129 |
+
โ Step 3: BERT Encoder (6 ๅฑ) โ
|
| 130 |
+
โ Self-Attention + Feed Forward โ
|
| 131 |
+
โ ๆฏๅฑๆๅๆดๆทฑๅฑ็่ฏญไน โ
|
| 132 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 133 |
+
โ
|
| 134 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 135 |
+
โ Step 4: Mean Pooling โ
|
| 136 |
+
โ ๆๆ token ๅ้็ๅนณๅ โ ๅฅๅญๅ้ โ
|
| 137 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 138 |
+
โ
|
| 139 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 140 |
+
โ Step 5: L2 Normalization โ
|
| 141 |
+
โ ๅ้ๅฝไธๅๅฐๅไฝ้ฟๅบฆ โ
|
| 142 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 143 |
+
โ
|
| 144 |
+
่พๅบ๏ผ3 ไธชๅ้
|
| 145 |
+
โ
|
| 146 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 147 |
+
โ Vector 1: [0.123, -0.456, 0.789, ..., 0.321] (384็ปด) โ
|
| 148 |
+
โ Vector 2: [0.234, 0.567, -0.890, ..., 0.432] (384็ปด) โ
|
| 149 |
+
โ Vector 3: [-0.345, 0.678, 0.901, ..., -0.543] (384็ปด) โ
|
| 150 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 151 |
+
|
| 152 |
+
ๅ
ณ้ฎ็น๏ผ
|
| 153 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 154 |
+
โ
ๆฏไธช chunk โ 1 ไธชๅบๅฎ็ปดๅบฆ็ๅ้ (384็ปด)
|
| 155 |
+
โ
่ฏญไน็ธไผผ็ๆๆฌ โ ๅ้่ท็ฆป่ฟ
|
| 156 |
+
โ
ๅฝไธๅๅๅฏ็จไฝๅผฆ็ธไผผๅบฆๅฟซ้ๆฏ่พ
|
| 157 |
+
""")
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ============================================================================
|
| 161 |
+
# Part 4: Chroma ๆฐๆฎๅบๅญๅจ็ปๆ
|
| 162 |
+
# ============================================================================
|
| 163 |
+
print("\n" + "=" * 80)
|
| 164 |
+
print("๐พ Part 4: Chroma ๆฐๆฎๅบๅญๅจ็ปๆ")
|
| 165 |
+
print("=" * 80)
|
| 166 |
+
|
| 167 |
+
print("""
|
| 168 |
+
Chroma.from_documents() ๆง่ก็ๆไฝ๏ผ
|
| 169 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 170 |
+
|
| 171 |
+
Chroma.from_documents(
|
| 172 |
+
documents=doc_splits, # 20 ไธช chunks
|
| 173 |
+
collection_name="rag-chroma", # ้ๅๅ็งฐ
|
| 174 |
+
embedding=self.embeddings # Embedding ๅฝๆฐ
|
| 175 |
+
)
|
| 176 |
+
|
| 177 |
+
ๅ
้จๆต็จ๏ผ
|
| 178 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 179 |
+
|
| 180 |
+
Step 1: ๅๅปบ/ๆๅผ้ๅ
|
| 181 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 182 |
+
โ Collection: "rag-chroma" โ
|
| 183 |
+
โ ๅ
ๆฐๆฎ: embedding_dimension=384 โ
|
| 184 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 185 |
+
|
| 186 |
+
Step 2: ๆน้ๅ้ๅ
|
| 187 |
+
texts = [chunk.page_content for chunk in doc_splits]
|
| 188 |
+
vectors = embeddings.embed_documents(texts)
|
| 189 |
+
โ
|
| 190 |
+
|
| 191 |
+
Step 3: ๅญๅจๆฐๆฎ๏ผๆฏไธช chunk๏ผ
|
| 192 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 193 |
+
โ ID: "chunk_1" โ
|
| 194 |
+
โ โโ Vector: [0.123, -0.456, ..., 0.321] (384็ปด) โ
|
| 195 |
+
โ โโ Document: "ไบบๅทฅๆบ่ฝๆฏ่ฎก็ฎๆบ็งๅญฆ็ไธไธชๅๆฏ..." โ
|
| 196 |
+
โ โโ Metadata: { โ
|
| 197 |
+
โ "source": "https://...", โ
|
| 198 |
+
โ "chunk_index": 0, โ
|
| 199 |
+
โ "total_chunks": 20 โ
|
| 200 |
+
โ } โ
|
| 201 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 202 |
+
โ ID: "chunk_2" โ
|
| 203 |
+
โ โโ Vector: [0.234, 0.567, ..., 0.432] โ
|
| 204 |
+
โ โโ Document: "ๆบๅจๅญฆไน ๆฏไบบๅทฅๆบ่ฝ็ๅญ้ขๅ..." โ
|
| 205 |
+
โ โโ Metadata: {...} โ
|
| 206 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 207 |
+
โ ID: "chunk_3" โ
|
| 208 |
+
โ โโ Vector: [-0.345, 0.678, ..., -0.543] โ
|
| 209 |
+
โ โโ Document: "ๆทฑๅบฆๅญฆไน ไฝฟ็จๅคๅฑ็ฅ็ป็ฝ็ป..." โ
|
| 210 |
+
โ โโ Metadata: {...} โ
|
| 211 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 212 |
+
|
| 213 |
+
Step 4: ๆๅปบ HNSW ็ดขๅผ
|
| 214 |
+
ๅ้ โ HNSW ๅพ็ปๆ โ ๅฟซ้ๆฃ็ดข
|
| 215 |
+
(ๅฑๆฌกๅๅฏผ่ชๅฐไธ็ๅพ)
|
| 216 |
+
|
| 217 |
+
ๅญๅจไฝ็ฝฎ๏ผ
|
| 218 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 219 |
+
持久化路径 (需传入 persist_directory；默认仅存于内存，不写入磁盘): ./chroma/ (本地目录)
|
| 220 |
+
โโ collections/
|
| 221 |
+
โ โโ rag-chroma/
|
| 222 |
+
โ โโ data.parquet # ๅ้ๆฐๆฎ
|
| 223 |
+
โ โโ metadata.json # ๅ
ๆฐๆฎ
|
| 224 |
+
โ โโ index.bin # HNSW ็ดขๅผ
|
| 225 |
+
โโ chroma.sqlite3 # SQLite ๆฐๆฎๅบ
|
| 226 |
+
""")
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
# ============================================================================
|
| 230 |
+
# Part 5: HNSW ็ดขๅผๅทฅไฝๅ็
|
| 231 |
+
# ============================================================================
|
| 232 |
+
print("\n" + "=" * 80)
|
| 233 |
+
print("๐ Part 5: HNSW ็ดขๅผ - ๅฟซ้ๆฃ็ดข็็งๅฏ")
|
| 234 |
+
print("=" * 80)
|
| 235 |
+
|
| 236 |
+
print("""
|
| 237 |
+
HNSW = Hierarchical Navigable Small World
|
| 238 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 239 |
+
|
| 240 |
+
ไธบไปไน้่ฆ็ดขๅผ๏ผ
|
| 241 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 242 |
+
ๆดๅๆ็ดข: O(n) - ่ฎก็ฎๆฅ่ฏขๅ้ไธๆๆๅ้็่ท็ฆป
|
| 243 |
+
โโ 10000 ไธชๅ้ โ ้่ฆ่ฎก็ฎ 10000 ๆฌก่ท็ฆป
|
| 244 |
+
โโ ๅคชๆ
ข๏ผ
|
| 245 |
+
|
| 246 |
+
HNSW ็ดขๅผ: O(log n) - ๅฑๆฌกๅๅพ็ปๆๅฏผ่ช
|
| 247 |
+
โโ 10000 ไธชๅ้ โ ๅช้ๆฃๆฅ็บฆ 20-30 ไธช่็น
|
| 248 |
+
โโ ๅฟซ 100+ ๅ๏ผ
|
| 249 |
+
|
| 250 |
+
HNSW ็ปๆ๏ผ็ฎๅ็คบไพ๏ผ๏ผ
|
| 251 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 252 |
+
|
| 253 |
+
Layer 2 (ๆ็จ็)
|
| 254 |
+
Vโ โโโโโโโโ Vโ
โโโโโโโโ Vโโ
|
| 255 |
+
โ โ โ
|
| 256 |
+
|
| 257 |
+
Layer 1
|
| 258 |
+
Vโ โโ Vโ โโ Vโ
โโ Vโ โโ Vโโ
|
| 259 |
+
โ โ โ โ โ
|
| 260 |
+
|
| 261 |
+
Layer 0 (ๆๅฏ้)
|
| 262 |
+
Vโ โ Vโ โ Vโ โ Vโ โ Vโ
โ Vโ โ ... โ Vโโ
|
| 263 |
+
ๆๆๅ้้ฝๅจ่ฟไธๅฑ
|
| 264 |
+
|
| 265 |
+
ๆฃ็ดข่ฟ็จ๏ผ
|
| 266 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 267 |
+
|
| 268 |
+
ๆฅ่ฏขๅ้: Q = [0.2, -0.3, 0.5, ...]
|
| 269 |
+
|
| 270 |
+
Step 1: ไป Layer 2 ๅผๅง๏ผ็ฒ็ฅๆ็ดข๏ผ
|
| 271 |
+
ๅ
ฅๅฃ็น: Vโ
|
| 272 |
+
โ ่ฎก็ฎ dist(Q, Vโ), dist(Q, Vโ
), dist(Q, Vโโ)
|
| 273 |
+
โ Vโ
ๆ่ฟ โ ่ทณๅฐ Vโ
|
| 274 |
+
|
| 275 |
+
Step 2: ไธ้ๅฐ Layer 1๏ผไธญ็ญ็ฒพๅบฆ๏ผ
|
| 276 |
+
ไป Vโ
ๅผๅง
|
| 277 |
+
โ ๆฃๆฅ้ปๅฑ
Vโ, Vโ
|
| 278 |
+
โ Vโ ๆ่ฟ โ ่ทณๅฐ Vโ
|
| 279 |
+
|
| 280 |
+
Step 3: ไธ้ๅฐ Layer 0๏ผ้ซ็ฒพๅบฆ๏ผ
|
| 281 |
+
ไป Vโ ๅผๅง
|
| 282 |
+
โ ๆฃๆฅๆๆ้ปๅฑ
|
| 283 |
+
โ ๆพๅฐๆ่ฟ็ K ไธชๅ้
|
| 284 |
+
|
| 285 |
+
่ฟๅ็ปๆ: Top K ๆ็ธไผผ็ chunks
|
| 286 |
+
|
| 287 |
+
้ๅบฆๅฏนๆฏ๏ผ
|
| 288 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 289 |
+
ๆดๅๆ็ดข: 10000 ๆฌก่ท็ฆป่ฎก็ฎ โ 100ms
|
| 290 |
+
HNSW ็ดขๅผ: 20-30 ๆฌก่ท็ฆป่ฎก็ฎ โ 1ms โ ๅฟซ 100 ๅ๏ผ
|
| 291 |
+
""")
|
| 292 |
+
|
| 293 |
+
|
| 294 |
+
# ============================================================================
|
| 295 |
+
# Part 6: ๆฃ็ดข่ฟ็จ่ฏฆ่งฃ
|
| 296 |
+
# ============================================================================
|
| 297 |
+
print("\n" + "=" * 80)
|
| 298 |
+
print("๐ Part 6: ๆฃ็ดข่ฟ็จ - ไปๆฅ่ฏขๅฐ็ปๆ")
|
| 299 |
+
print("=" * 80)
|
| 300 |
+
|
| 301 |
+
print("""
|
| 302 |
+
็จๆทๆฅ่ฏข: "ไปไนๆฏๆบๅจๅญฆไน ๏ผ"
|
| 303 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 304 |
+
|
| 305 |
+
Step 1: ๆฅ่ฏขๅ้ๅ
|
| 306 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 307 |
+
"ไปไนๆฏๆบๅจๅญฆไน ๏ผ"
|
| 308 |
+
โ
|
| 309 |
+
embeddings.embed_query("ไปไนๆฏๆบๅจๅญฆไน ๏ผ")
|
| 310 |
+
โ
|
| 311 |
+
Query Vector: [0.345, -0.678, 0.234, ...] (384็ปด)
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
Step 2: HNSW ่ฟไผผๆ็ดข
|
| 315 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 316 |
+
vectorstore.similarity_search(
|
| 317 |
+
query="ไปไนๆฏๆบๅจๅญฆไน ๏ผ",
|
| 318 |
+
k=20 # ่ฟๅ Top 20
|
| 319 |
+
)
|
| 320 |
+
โ
|
| 321 |
+
Chroma ๅ
้จ:
|
| 322 |
+
1. ๆฅ่ฏขๅ้ๅ
|
| 323 |
+
2. HNSW ๅพๅฏผ่ช
|
| 324 |
+
3. ่ฎก็ฎไฝๅผฆ็ธไผผๅบฆ
|
| 325 |
+
โ
|
| 326 |
+
่ฟๅ Top 20 chunks:
|
| 327 |
+
โโโโโโโโโโโโฌโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 328 |
+
โ Chunk ID โ Score โ Content โ
|
| 329 |
+
โโโโโโโโโโโโผโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 330 |
+
โ chunk_5 โ 0.92 โ "ๆบๅจๅญฆไน ๆฏไบบๅทฅๆบ่ฝ็..." โ
|
| 331 |
+
โ chunk_2 โ 0.88 โ "ไบบๅทฅๆบ่ฝๅ
ๆฌๆบๅจๅญฆไน ..." โ
|
| 332 |
+
โ chunk_11 โ 0.85 โ "็็ฃๅญฆไน ๆฏๆบๅจๅญฆไน ..." โ
|
| 333 |
+
โ ... โ ... โ ... โ
|
| 334 |
+
โโโโโโโโโโโโดโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 335 |
+
|
| 336 |
+
|
| 337 |
+
Step 3: CrossEncoder ้ๆ๏ผไฝ ็้กน็ฎ็น่ฒ๏ผ
|
| 338 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 339 |
+
reranker.rerank(query, top_20_chunks, top_k=5)
|
| 340 |
+
โ
|
| 341 |
+
ๆฏไธช chunk ้ๆฐๆๅ๏ผๆทฑๅบฆไบคไบ๏ผ
|
| 342 |
+
โ
|
| 343 |
+
ๆ็ป Top 5:
|
| 344 |
+
โโโโโโโโโโโโฌโโโโโโโโโโฌโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 345 |
+
โ Chunk ID โ Score โ Content โ
|
| 346 |
+
โโโโโโโโโโโโผโโโโโโโโโโผโโโโโโโโโโโโโโโโโโโโโโโโโโโโโค
|
| 347 |
+
โ chunk_5 โ 8.45 โ "ๆบๅจๅญฆไน ๆฏไบบๅทฅๆบ่ฝ็..." โ
|
| 348 |
+
โ chunk_11 โ 7.89 โ "็็ฃๅญฆไน ๆฏๆบๅจๅญฆไน ..." โ
|
| 349 |
+
โ chunk_2 โ 7.23 โ "ไบบๅทฅๆบ่ฝๅ
ๆฌๆบๅจๅญฆไน ..." โ
|
| 350 |
+
โ chunk_14 โ 6.78 โ "ๆทฑๅบฆๅญฆไน ๆฏๆบๅจๅญฆไน ..." โ
|
| 351 |
+
โ chunk_8 โ 6.12 โ "ๅผบๅๅญฆไน ๅ
่ฎธ..." โ
|
| 352 |
+
โโโโโโโโโโโโดโโโโโโโโโโดโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 353 |
+
|
| 354 |
+
|
| 355 |
+
Step 4: ่ฟๅ็ป LLM
|
| 356 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 357 |
+
context = "\\n\\n".join([chunk.page_content for chunk in top_5])
|
| 358 |
+
โ
|
| 359 |
+
LLM ็ๆ็ญๆก
|
| 360 |
+
""")
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
# ============================================================================
|
| 364 |
+
# Part 7: ๅ
ณ้ฎๆๆฏ็ป่
|
| 365 |
+
# ============================================================================
|
| 366 |
+
print("\n" + "=" * 80)
|
| 367 |
+
print("โ๏ธ Part 7: ๅ
ณ้ฎๆๆฏ็ป่")
|
| 368 |
+
print("=" * 80)
|
| 369 |
+
|
| 370 |
+
print("""
|
| 371 |
+
1. ไธบไปไน่ฆๅฝไธๅๅ้๏ผ
|
| 372 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 373 |
+
encode_kwargs={'normalize_embeddings': True}
|
| 374 |
+
|
| 375 |
+
ๅๅงๅ้: [1.23, -4.56, 7.89, ...] # ้ฟๅบฆไธไธ
|
| 376 |
+
ๅฝไธๅๅ: [0.12, -0.45, 0.78, ...] # ้ฟๅบฆ = 1
|
| 377 |
+
|
| 378 |
+
ๅฅฝๅค:
|
| 379 |
+
โ
ไฝๅผฆ็ธไผผๅบฆ = ็น็งฏ๏ผ่ฎก็ฎๆดๅฟซ๏ผ
|
| 380 |
+
โ
ๆๆๅ้ๅจๅไธๅฐบๅบฆไธ
|
| 381 |
+
โ
้ฟๅ
้ฟๅบฆๅฝฑๅ็ธไผผๅบฆ่ฎก็ฎ
|
| 382 |
+
|
| 383 |
+
|
| 384 |
+
2. ไฝๅผฆ็ธไผผๅบฆ vs ๆฌงๆฐ่ท็ฆป
|
| 385 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 386 |
+
|
| 387 |
+
ไฝๅผฆ็ธไผผๅบฆ๏ผไฝ ็้กน็ฎไฝฟ็จ๏ผโญ:
|
| 388 |
+
similarity = vโ ยท vโ / (||vโ|| ร ||vโ||)
|
| 389 |
+
่ๅด: [-1, 1]๏ผ1 ่กจ็คบๅฎๅ
จ็ธๅ
|
| 390 |
+
็น็น: ๅ
ณๆณจๆนๅ๏ผๅฟฝ็ฅ้ฟๅบฆ
|
| 391 |
+
|
| 392 |
+
ๆฌงๆฐ่ท็ฆป:
|
| 393 |
+
distance = โฮฃ(vโแตข - vโแตข)ยฒ
|
| 394 |
+
่ๅด: [0, โ]๏ผ0 ่กจ็คบๅฎๅ
จ็ธๅ
|
| 395 |
+
็น็น: ๅ
ณๆณจ็ปๅฏนไฝ็ฝฎๅทฎๅผ
|
| 396 |
+
|
| 397 |
+
ๅฝไธๅๅ๏ผไธค่
็ญไปท๏ผ
|
| 398 |
+
|
| 399 |
+
|
| 400 |
+
3. ๆน้ๅค็ไผๅ
|
| 401 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 402 |
+
|
| 403 |
+
ไธๆจ่๏ผๆ
ข๏ผ:
|
| 404 |
+
for chunk in chunks:
|
| 405 |
+
vector = embed_documents([chunk]) # ๅ็ฌๅค็
|
| 406 |
+
|
| 407 |
+
ๆจ่๏ผๅฟซ 10 ๅ๏ผโญ:
|
| 408 |
+
vectors = embed_documents(chunks) # ๆน้ๅค็
|
| 409 |
+
โโ GPU ๅนถ่ก่ฎก็ฎ
|
| 410 |
+
โโ ๅๅฐๆจกๅๅ ่ฝฝๅผ้
|
| 411 |
+
|
| 412 |
+
|
| 413 |
+
4. ๅ
ๅญไผๅ
|
| 414 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 415 |
+
|
| 416 |
+
ๅ้็ปดๅบฆ้ๆฉ:
|
| 417 |
+
384 ็ปด (all-MiniLM-L6-v2) โ ไฝ ็้กน็ฎ โญ
|
| 418 |
+
โโ ๅนณ่กก๏ผๅ็กฎ็ vs ๅญๅจ
|
| 419 |
+
|
| 420 |
+
768 ็ปด (BERT-base)
|
| 421 |
+
โโ ๆดๅ็กฎไฝๅญๅจ็ฟปๅ
|
| 422 |
+
|
| 423 |
+
1024 ็ปด (large models)
|
| 424 |
+
โโ ๆๅ็กฎไฝๅญๅจ 3 ๅ
|
| 425 |
+
|
| 426 |
+
ๅญๅจ่ฎก็ฎ:
|
| 427 |
+
20 ไธช chunks ร 384 ็ปด ร 4 bytes = 30KB
|
| 428 |
+
1000 ไธช chunks ร 384 ็ปด ร 4 bytes = 1.5MB
|
| 429 |
+
โโ ้ๅธธ้ซๆ๏ผ
|
| 430 |
+
""")
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
# ============================================================================
|
| 434 |
+
# Part 8: ๅฎๆดไปฃ็ ๆต็จ
|
| 435 |
+
# ============================================================================
|
| 436 |
+
print("\n" + "=" * 80)
|
| 437 |
+
print("๐ป Part 8: ๅฎๆดไปฃ็ ๆต็จๆป็ป")
|
| 438 |
+
print("=" * 80)
|
| 439 |
+
|
| 440 |
+
print("""
|
| 441 |
+
ไฝ ็้กน็ฎๅฎๆดๆต็จ๏ผ
|
| 442 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 443 |
+
|
| 444 |
+
# 1. ๅๅงๅ Embedding ๆจกๅ
|
| 445 |
+
embeddings = HuggingFaceEmbeddings(
|
| 446 |
+
model_name="sentence-transformers/all-MiniLM-L6-v2",
|
| 447 |
+
model_kwargs={'device': 'cpu'},
|
| 448 |
+
encode_kwargs={'normalize_embeddings': True}
|
| 449 |
+
)
|
| 450 |
+
|
| 451 |
+
# 2. ๆๆกฃๅๅฒ
|
| 452 |
+
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
|
| 453 |
+
chunk_size=250,
|
| 454 |
+
chunk_overlap=50 # โ ไฝ ๅไฟฎๆน็
|
| 455 |
+
)
|
| 456 |
+
doc_splits = text_splitter.split_documents(docs)
|
| 457 |
+
|
| 458 |
+
# 3. ๅ้ๅ + ๅญๅจๅฐ Chroma
|
| 459 |
+
vectorstore = Chroma.from_documents(
|
| 460 |
+
documents=doc_splits, # ่พๅ
ฅ: 20 ไธช chunks
|
| 461 |
+
collection_name="rag-chroma",
|
| 462 |
+
embedding=embeddings # ๅ้ๅๅฝๆฐ
|
| 463 |
+
)
|
| 464 |
+
# โ ๅ
้จ่ชๅจๅฎๆ:
|
| 465 |
+
# - ๆน้ๅ้ๅ: chunks โ 384็ปดๅ้
|
| 466 |
+
# - ๅญๅจ: ๅ้ + ๅๆ + ๅ
ๆฐๆฎ
|
| 467 |
+
# - ๆๅปบ HNSW ็ดขๅผ
|
| 468 |
+
|
| 469 |
+
# 4. ๅๅปบๆฃ็ดขๅจ
|
| 470 |
+
retriever = vectorstore.as_retriever()
|
| 471 |
+
|
| 472 |
+
# 5. ๆฃ็ดข
|
| 473 |
+
docs = retriever.get_relevant_documents("ไปไนๆฏๆบๅจๅญฆไน ๏ผ")
|
| 474 |
+
# โ ๅ
้จๆต็จ:
|
| 475 |
+
# - ๆฅ่ฏขๅ้ๅ
|
| 476 |
+
# - HNSW ๅฟซ้ๆฃ็ดข
|
| 477 |
+
# - ่ฟๅ Top K chunks
|
| 478 |
+
|
| 479 |
+
# 6. CrossEncoder ้ๆ๏ผๅฏ้๏ผไฝ ็้กน็ฎๆ๏ผ
|
| 480 |
+
reranked = crossencoder.rerank(query, docs, top_k=5)
|
| 481 |
+
|
| 482 |
+
# 7. ๅ็ป LLM ็ๆ็ญๆก
|
| 483 |
+
answer = llm.generate(context=reranked, question=query)
|
| 484 |
+
|
| 485 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 486 |
+
""")
|
| 487 |
+
|
| 488 |
+
|
| 489 |
+
# ============================================================================
|
| 490 |
+
# Part 9: ๆง่ฝไผๅๅปบ่ฎฎ
|
| 491 |
+
# ============================================================================
|
| 492 |
+
print("\n" + "=" * 80)
|
| 493 |
+
print("๐ Part 9: ๆง่ฝไผๅๅปบ่ฎฎ")
|
| 494 |
+
print("=" * 80)
|
| 495 |
+
|
| 496 |
+
print("""
|
| 497 |
+
ๅฝๅ้
็ฝฎ่ฏๅ๏ผ
|
| 498 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 499 |
+
|
| 500 |
+
โ
Embedding ๆจกๅ: all-MiniLM-L6-v2 (่ฝป้้ซๆ) โญโญโญโญโญ
|
| 501 |
+
โ
ๅ้ๅฝไธๅ: True (ไฝๅผฆ็ธไผผๅบฆไผๅ) โญโญโญโญโญ
|
| 502 |
+
โ
็ดขๅผ็ฑปๅ: HNSW (ๅฟซ้ๆฃ็ดข) โญโญโญโญโญ
|
| 503 |
+
โ
Chunk overlap: 50 (ไฟๆไธไธๆ) โญโญโญโญโญ
|
| 504 |
+
โ
CrossEncoder ้ๆ (็ฒพๅๆๅบ) โญโญโญโญโญ
|
| 505 |
+
|
| 506 |
+
ๆป่ฏ: ๐ ็ไบง็บง้
็ฝฎ๏ผ
|
| 507 |
+
|
| 508 |
+
ๅฏ้ไผๅ๏ผๅฆ้่ฟไธๆญฅๆๅ๏ผ๏ผ
|
| 509 |
+
โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
|
| 510 |
+
|
| 511 |
+
1. GPU ๅ ้
|
| 512 |
+
model_kwargs={'device': 'cuda'} # ๅ้ๅ้ๅบฆ 10x โ
|
| 513 |
+
|
| 514 |
+
2. ๆดๅคง็ Embedding ๆจกๅ๏ผๅฆ้ๆด้ซๅ็กฎ็๏ผ
|
| 515 |
+
"BAAI/bge-large-en-v1.5" # 1024็ปด๏ผๅ็กฎ็ +5%
|
| 516 |
+
|
| 517 |
+
3. ๆน้ๅคงๅฐ่ฐๆด
|
| 518 |
+
batch_size=32 # ๅ ๅฟซๅ้ๅ
|
| 519 |
+
|
| 520 |
+
4. Chroma ๆไน
ๅ้
็ฝฎ
|
| 521 |
+
persist_directory="./chroma_db" # ้ฟๅ
้ๅคๅ้ๅ
|
| 522 |
+
""")
|
| 523 |
+
|
| 524 |
+
|
| 525 |
+
print("\n" + "=" * 80)
|
| 526 |
+
print("โ
่งฃๆๅฎๆ๏ผไฝ ็ฐๅจ็่งฃไบไปๅๅฒๅฐๅ้ๆฐๆฎๅบ็ๅฎๆดๆต็จ")
|
| 527 |
+
print("=" * 80)
|
| 528 |
+
print()
|