Instructions to use ssdataanalysis/gemma-4-E4B-hebrew-first with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ssdataanalysis/gemma-4-E4B-hebrew-first with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("ssdataanalysis/gemma-4-E4B-hebrew-first", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
Switch to packing=True, batch=4, grad_acc=4, step-based checkpoints for speed
Browse files
train.py
CHANGED
|
@@ -87,8 +87,6 @@ output_dir = os.environ.get("OUTPUT_DIR", "ssdataanalysis/gemma-4-E4B-hebrew-fir
|
|
| 87 |
print(f"=== Training {model_id} -> {output_dir} ===")
|
| 88 |
|
| 89 |
train_dataset = prepare_dataset(hebrew_ratio=0.5, max_total=120000)
|
| 90 |
-
# No eval dataset to avoid OOM during evaluation on A10G 24GB
|
| 91 |
-
# We will rely on training loss and periodic checkpointing
|
| 92 |
|
| 93 |
print("Loading tokenizer...")
|
| 94 |
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
|
@@ -112,28 +110,30 @@ peft_config = LoraConfig(
|
|
| 112 |
exclude_modules=["vision_tower", "multi_modal_projector"],
|
| 113 |
)
|
| 114 |
|
|
|
|
| 115 |
training_args = SFTConfig(
|
| 116 |
output_dir=output_dir,
|
| 117 |
num_train_epochs=3,
|
| 118 |
-
per_device_train_batch_size=
|
| 119 |
-
gradient_accumulation_steps=
|
| 120 |
learning_rate=2e-4,
|
| 121 |
lr_scheduler_type="cosine",
|
| 122 |
warmup_steps=500,
|
| 123 |
weight_decay=0.01,
|
| 124 |
max_length=2048,
|
| 125 |
-
packing=
|
| 126 |
bf16=True,
|
| 127 |
logging_strategy="steps",
|
| 128 |
logging_steps=10,
|
| 129 |
logging_first_step=True,
|
| 130 |
eval_strategy="no",
|
| 131 |
-
save_strategy="
|
| 132 |
-
|
|
|
|
| 133 |
push_to_hub=True,
|
| 134 |
hub_model_id=output_dir,
|
| 135 |
report_to="trackio",
|
| 136 |
-
run_name=output_dir,
|
| 137 |
remove_unused_columns=False,
|
| 138 |
disable_tqdm=True,
|
| 139 |
dataset_num_proc=4,
|
|
|
|
| 87 |
print(f"=== Training {model_id} -> {output_dir} ===")
|
| 88 |
|
| 89 |
train_dataset = prepare_dataset(hebrew_ratio=0.5, max_total=120000)
|
|
|
|
|
|
|
| 90 |
|
| 91 |
print("Loading tokenizer...")
|
| 92 |
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
|
|
|
|
| 110 |
exclude_modules=["vision_tower", "multi_modal_projector"],
|
| 111 |
)
|
| 112 |
|
| 113 |
+
# Optimized: packing=True, larger batch, step-based checkpoints
|
| 114 |
training_args = SFTConfig(
|
| 115 |
output_dir=output_dir,
|
| 116 |
num_train_epochs=3,
|
| 117 |
+
per_device_train_batch_size=4,
|
| 118 |
+
gradient_accumulation_steps=4,
|
| 119 |
learning_rate=2e-4,
|
| 120 |
lr_scheduler_type="cosine",
|
| 121 |
warmup_steps=500,
|
| 122 |
weight_decay=0.01,
|
| 123 |
max_length=2048,
|
| 124 |
+
packing=True,
|
| 125 |
bf16=True,
|
| 126 |
logging_strategy="steps",
|
| 127 |
logging_steps=10,
|
| 128 |
logging_first_step=True,
|
| 129 |
eval_strategy="no",
|
| 130 |
+
save_strategy="steps",
|
| 131 |
+
save_steps=500,
|
| 132 |
+
save_total_limit=3,
|
| 133 |
push_to_hub=True,
|
| 134 |
hub_model_id=output_dir,
|
| 135 |
report_to="trackio",
|
| 136 |
+
run_name=output_dir + "-fast",
|
| 137 |
remove_unused_columns=False,
|
| 138 |
disable_tqdm=True,
|
| 139 |
dataset_num_proc=4,
|