Update README.md
Browse files
README.md
CHANGED
|
@@ -27,5 +27,31 @@ just replace the pretrained model name and make sure you use Arabic text and spl
|
|
| 27 |
|
| 28 |
You can train a better model if you have access to adequate compute (you can fine-tune this model on more data; seed 42 was used to pick the 100K sample).
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
Model first announced: https://www.linkedin.com/posts/akhooli_this-is-probably-the-first-arabic-colbert-activity-7217969205197848576-l8Cy
|
|
|
|
| 27 |
|
| 28 |
You can train a better model if you have access to adequate compute (you can fine-tune this model on more data; seed 42 was used to pick the 100K sample).
|
| 29 |
|
| 30 |
+
# Training script
|
| 31 |
+
```
|
| 32 |
+
# Fine-tune an Arabic ColBERT model on a 100K sample of the mMARCO (Arabic)
# triplet dataset using RAGatouille's RAGTrainer.
from datasets import load_dataset
from ragatouille import RAGTrainer

sample_size = 100000

# Stream the corpus (no full download), shuffle with a fixed seed for
# reproducibility, and keep only the first `sample_size` examples.
stream = load_dataset('unicamp-dl/mmarco', 'arabic', split="train", trust_remote_code=True, streaming=True)
shuffled = stream.shuffle(seed=42, buffer_size=10_000)
sample = shuffled.take(sample_size)

# Each training example is a (query, positive passage, negative passage) triplet.
triplets = [(row["query"], row["positive"], row["negative"]) for row in sample]

trainer = RAGTrainer(model_name="Arabic-ColBERT-100k", pretrained_model_name="aubmindlab/bert-base-arabertv02", language_code="ar",)
# Triplets already contain negatives, so skip hard-negative mining.
trainer.prepare_training_data(raw_data=triplets, mine_hard_negatives=False)

trainer.train(
    batch_size=32,
    nbits=2,  # How many bits will the trained model use when compressing indexes
    maxsteps=100000,  # Maximum steps hard stop
    use_ib_negatives=True,  # Use in-batch negative to calculate loss
    dim=128,  # How many dimensions per embedding. 128 is the default and works well.
    learning_rate=5e-6,  # Learning rate, small values ([3e-6,3e-5] work best if the base model is BERT-like, 5e-6 is often the sweet spot)
    doc_maxlen=256,  # Maximum document length. Because of how ColBERT works, smaller chunks (128-256) work very well.
    use_relu=False,  # Disable ReLU -- doesn't improve performance
    warmup_steps="auto",  # Defaults to 10%
)
|
| 54 |
+
|
| 55 |
+
```
|
| 56 |
|
| 57 |
Model first announced: https://www.linkedin.com/posts/akhooli_this-is-probably-the-first-arabic-colbert-activity-7217969205197848576-l8Cy
|