Mr-FineTuner committed
Commit bd92abf · verified · 1 Parent(s): 466691d

Update README.md

Files changed (1)
  1. README.md +377 -0
README.md CHANGED
@@ -1,3 +1,4 @@
+ num_train_epochs=0.1,
 
  # Fine-Tuned LLaMA-3-8B CEFR Model
 
@@ -26,3 +27,379 @@ This is a fine-tuned version of `unsloth/llama-3-8b-instruct-bnb-4bit` for CEFR-
  ```
 
  Uploaded using `huggingface_hub`.
+
+
+ import unsloth
+ from unsloth import FastLanguageModel, is_bfloat16_supported
+ import torch
+ import pandas as pd
+ from datasets import Dataset
+ from sklearn.utils import resample
+ from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForCausalLM, AutoTokenizer
+ from trl import SFTTrainer
+ from sentence_transformers import SentenceTransformer
+ from imblearn.over_sampling import SMOTE
+ from imblearn.under_sampling import RandomUnderSampler
+ from imblearn.pipeline import Pipeline
+ import numpy as np
+ import wandb
+ import os
+ from huggingface_hub import create_repo, upload_folder
+
+ # Verify environment
+ print(f"PyTorch version: {torch.__version__}")
+ print(f"CUDA available: {torch.cuda.is_available()}")
+ if torch.cuda.is_available():
+     print(f"GPU: {torch.cuda.get_device_name(0)}")
+
+ # Cell 1: Load model and tokenizer
+ max_seq_length = 2048
+ dtype = None
+ load_in_4bit = True
+
+ try:
+     model, tokenizer = FastLanguageModel.from_pretrained(
+         model_name="unsloth/llama-3-8b-instruct-bnb-4bit",
+         max_seq_length=max_seq_length,
+         dtype=dtype,
+         load_in_4bit=load_in_4bit,
+         use_exact_model_name=True,
+         device_map="auto"
+     )
+     print("Model and tokenizer loaded successfully with Unsloth!")
+ except Exception as e:
+     print(f"Error loading model with Unsloth: {e}")
+     print("Falling back to transformers...")
+     model_name = "unsloth/llama-3-8b-instruct-bnb-4bit"
+     tokenizer = AutoTokenizer.from_pretrained(model_name)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_name,
+         load_in_4bit=True,
+         device_map="auto"
+     )
+     print("Model and tokenizer loaded with transformers!")
+
+ # Cell 2: Configure LoRA
+ try:
+     model = FastLanguageModel.get_peft_model(
+         model,
+         r=32,
+         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         lora_alpha=32,
+         lora_dropout=0.5,
+         bias="none",
+         use_gradient_checkpointing="unsloth",
+         random_state=3407,
+     )
+     print("LoRA configuration applied successfully!")
+ except Exception as e:
+     print(f"Error applying LoRA: {e}")
+     raise
+
+ # Cell 3: Load datasets
+ train_file = "train_merged_output.txt"
+ val_file = "dev_merged_output.txt"
+ test_file = "test_merged_output.txt"
+
+ cefr_mapping = {1: "A1", 2: "A2", 3: "B1", 4: "B2", 5: "C1", 6: "C2"}
+
+ def load_and_reformat(file_path):
+     try:
+         with open(file_path, "r") as f:
+             lines = f.readlines()
+         reformatted_data = []
+         for line in lines:
+             parts = line.strip().split("\t")
+             sentence = parts[0]
+             levels = parts[1:]
+             for level in levels:
+                 level_int = int(level)
+                 cefr_level = cefr_mapping.get(level_int, "Unknown")
+                 reformatted_data.append({"sentence": sentence, "level": cefr_level})
+         return pd.DataFrame(reformatted_data)
+     except Exception as e:
+         print(f"Error loading file {file_path}: {e}")
+         raise
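+
+ # Each input line is expected to hold a tab-separated sentence followed by one or more
+ # numeric CEFR levels; e.g. a hypothetical line "She goes to school every day.\t2"
+ # becomes {"sentence": "She goes to school every day.", "level": "A2"}.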
+
+ train_dataset = load_and_reformat(train_file)
+ val_dataset = load_and_reformat(val_file)
+ test_dataset = load_and_reformat(test_file)
+
+ print("Train dataset - Column names:", train_dataset.columns.tolist())
+ print("Train dataset - First 5 rows:\n", train_dataset.head())
+ print("Validation dataset - First 5 rows:\n", val_dataset.head())
+ print("Test dataset - First 5 rows:\n", test_dataset.head())
+
+ expected_columns = {"sentence", "level"}
+ for name, dataset in [("Train", train_dataset), ("Validation", val_dataset), ("Test", test_dataset)]:
+     if not expected_columns.issubset(dataset.columns):
+         missing = expected_columns - set(dataset.columns)
+         print(f"Warning: {name} dataset missing expected columns: {missing}")
+
+ # Cell 4: Rename columns
+ column_mapping = {"sentence": "sentence", "level": "level"}
+ train_dataset = train_dataset.rename(columns=column_mapping)
+ val_dataset = val_dataset.rename(columns=column_mapping)
+ test_dataset = test_dataset.rename(columns=column_mapping)
+
+ print("Train dataset - Renamed column names:", train_dataset.columns.tolist())
+ print("Train dataset - First row after renaming:\n", train_dataset.head(1))
+
+ # Cell 5: Convert to HF Dataset and format
+ train_dataset_hf = Dataset.from_pandas(train_dataset)
+ val_dataset_hf = Dataset.from_pandas(val_dataset)
+ test_dataset_hf = Dataset.from_pandas(test_dataset)
+
+ def format_func(example):
+     return {
+         "text": (
+             f"<|user|>\nGenerate a CEFR {example['level']} level sentence.<|end|>\n"
+             f"<|assistant|>\n{example['sentence']}<|end|>\n"
+         ),
+         "level": example['level']
+     }
+
+ train_dataset_transformed = train_dataset_hf.map(format_func)
+ val_dataset_transformed = val_dataset_hf.map(format_func)
+ test_dataset_transformed = test_dataset_hf.map(format_func)
+
+ train_dataset_transformed = train_dataset_transformed.remove_columns(['sentence'])
+ val_dataset_transformed = val_dataset_transformed.remove_columns(['sentence'])
+ test_dataset_transformed = test_dataset_transformed.remove_columns(['sentence'])
+
+ print("Train dataset columns after transformation:", train_dataset_transformed.column_names)
+ print("Example transformed text:", train_dataset_transformed[0]["text"])
+ print("Train CEFR distribution:\n", train_dataset["level"].value_counts())
+ print("Validation CEFR distribution:\n", val_dataset["level"].value_counts())
+ print("Test CEFR distribution:\n", test_dataset["level"].value_counts())
+
+ # Cell 6: Rebalance validation and test sets
+ train_proportions = {
+     'A1': 0.0346, 'A2': 0.1789, 'B1': 0.3454,
+     'B2': 0.3101, 'C1': 0.1239, 'C2': 0.0072
+ }
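+ # For example, with 1,000 rows rebalance_dataset below draws roughly 34 A1, 178 A2,
+ # 345 B1, 310 B2, 123 C1, and 7 C2 samples (int() truncates the per-level counts).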
+
+ def rebalance_dataset(df, total_samples, proportions, random_state=3407):
+     resampled_dfs = []
+     for level, proportion in proportions.items():
+         level_df = df[df['level'] == level]
+         n_samples = int(total_samples * proportion)
+         if len(level_df) > n_samples:
+             level_df_resampled = resample(level_df, n_samples=n_samples, random_state=random_state)
+         else:
+             level_df_resampled = resample(level_df, n_samples=n_samples, replace=True, random_state=random_state)
+         resampled_dfs.append(level_df_resampled)
+     return pd.concat(resampled_dfs).sample(frac=1, random_state=random_state).reset_index(drop=True)
+
+ val_df = val_dataset.copy()
+ new_val_df = rebalance_dataset(val_df, len(val_df), train_proportions)
+ new_val_dataset = Dataset.from_pandas(new_val_df)
+ new_val_dataset_transformed = new_val_dataset.map(format_func)
+ new_val_dataset_transformed = new_val_dataset_transformed.remove_columns(['sentence'])
+
+ test_df = test_dataset.copy()
+ new_test_df = rebalance_dataset(test_df, len(test_df), train_proportions)
+ new_test_dataset = Dataset.from_pandas(new_test_df)
+ new_test_dataset_transformed = new_test_dataset.map(format_func)
+ new_test_dataset_transformed = new_test_dataset_transformed.remove_columns(['sentence'])
+
+ print("New Validation CEFR distribution:\n", new_val_df["level"].value_counts(normalize=True))
+ print("New Test CEFR distribution:\n", new_test_df["level"].value_counts(normalize=True))
+
+ # Cell 7: Apply SMOTE and undersampling to balance training dataset
+ evaluator_model = SentenceTransformer("BAAI/bge-base-en-v1.5")
+
+ def apply_smote_to_dataset(df, target_proportions, random_state=3407):
+     print("Generating sentence embeddings...")
+     embeddings = evaluator_model.encode(df["sentence"].tolist(), show_progress_bar=True)
+
+     level_to_idx = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
+     labels = df["level"].map(level_to_idx).values
+
+     class_counts = df["level"].value_counts().to_dict()
+     print("Original class counts:", class_counts)
+
+     total_samples = len(df)
+     target_samples = {
+         level_to_idx[level]: max(int(total_samples * proportion), class_counts.get(level, 0))
+         for level, proportion in target_proportions.items()
+     }
+     print("Target sample counts:", target_samples)
+
+     pipeline = Pipeline([
+         ('oversample', SMOTE(sampling_strategy=target_samples, random_state=random_state)),
+         ('undersample', RandomUnderSampler(sampling_strategy=target_samples, random_state=random_state))
+     ])
+
+     print("Applying SMOTE and undersampling...")
+     X_resampled, y_resampled = pipeline.fit_resample(embeddings, labels)
+
+     idx_to_level = {v: k for k, v in level_to_idx.items()}
+     resampled_data = []
+     for embedding, label in zip(X_resampled, y_resampled):
+         # Find the closest original embedding
+         distances = np.linalg.norm(embeddings - embedding, axis=1)
+         closest_idx = np.argmin(distances)
+         sentence = df.iloc[closest_idx]["sentence"]
+         resampled_data.append({
+             "sentence": sentence,
+             "level": idx_to_level[label]
+         })
+
+     return pd.DataFrame(resampled_data)
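+
+ # Note: each resampled embedding (including SMOTE's synthetic points) is mapped back to
+ # the nearest original sentence, so oversampled classes repeat existing sentences rather
+ # than introducing new text.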
+
+ train_dataset_smote = apply_smote_to_dataset(train_dataset, train_proportions)
+ train_dataset_hf = Dataset.from_pandas(train_dataset_smote)
+ train_dataset_transformed = train_dataset_hf.map(format_func)
+ train_dataset_transformed = train_dataset_transformed.remove_columns(['sentence'])
+
+ print("SMOTE-balanced Train CEFR distribution:\n", train_dataset_smote["level"].value_counts(normalize=True))
+
+ # Cell 8: Training setup
+ wandb.init(project="Phi-3-CEFR-finetuning_v3",
+            config={
+                "model": "unsloth/llama-3-8b-instruct-bnb-4bit",
+                "strategy": "gradient_checkpointing",
+                "learning_rate": 2e-5,
+                "batch_size": 8,
+                "lora_dropout": 0.5,
+            })
+
+ trainer = SFTTrainer(
+     model=model,
+     tokenizer=tokenizer,
+     train_dataset=train_dataset_transformed.shuffle(seed=3407),
+     eval_dataset=new_val_dataset_transformed,
+     dataset_text_field="text",
+     max_seq_length=max_seq_length,
+     callbacks=[
+         EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01),
+     ],
+     args=TrainingArguments(
+         per_device_train_batch_size=8,
+         gradient_accumulation_steps=1,
+         warmup_ratio=0.1,
+         num_train_epochs=0.1,
+         learning_rate=2e-5,
+         fp16=not is_bfloat16_supported(),
+         bf16=is_bfloat16_supported(),
+         logging_steps=50,
+         optim="adamw_8bit",
+         weight_decay=0.3,
+         lr_scheduler_type="cosine",
+         eval_strategy="steps",
+         eval_steps=200,
+         save_strategy="steps",
+         save_steps=200,
+         output_dir="outputs",
+         load_best_model_at_end=True,
+         metric_for_best_model="eval_loss",
+         greater_is_better=False,
+         seed=3407,
+         report_to="wandb",
+         run_name="phi3-cefr-lora-v14",
+         gradient_checkpointing=True,
+     ),
+ )
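+ # num_train_epochs=0.1 stops training after roughly one tenth of a single pass over the
+ # shuffled training set.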
+
+ # Cell 9: Training and test evaluation
+ try:
+     trainer_stats = trainer.train()
+     print("Training completed successfully!")
+     print("Trainer stats:", trainer_stats)
+ except Exception as e:
+     print(f"Error during training: {e}")
+     raise
+
+ # Tokenize test dataset
+ def tokenize_function(example):
+     return tokenizer(example["text"], truncation=True, max_length=max_seq_length, padding=False)
+
+ new_test_dataset_tokenized = new_test_dataset_transformed.map(tokenize_function, batched=True)
+ new_test_dataset_tokenized = new_test_dataset_tokenized.remove_columns(['text'])
+ print("Test dataset structure:", new_test_dataset_tokenized[0])
+
+ # Evaluate on tokenized test dataset
+ try:
+     eval_results = trainer.evaluate(new_test_dataset_tokenized)
+     print("Test evaluation results:", eval_results)
+ except Exception as e:
+     print(f"Error during evaluation: {e}")
+     raise
+
+ # Cell 10: Save and upload the model to Hugging Face
+ # Save the fine-tuned model locally
+ output_dir = "./fine_tuned_model"
+ try:
+     model = model.merge_and_unload()  # Merge LoRA weights with base model
+     model.save_pretrained(output_dir)
+     tokenizer.save_pretrained(output_dir)
+     print(f"Model and tokenizer saved locally to {output_dir}")
+ except Exception as e:
+     print(f"Error saving model locally: {e}")
+     raise
+
+ # Create a new repository on Hugging Face
+ repo_id = "Mr-FineTuner/Test___01"
+ try:
+     create_repo(repo_id, private=False)  # Set private=True for a private repo
+     print(f"Repository {repo_id} created successfully!")
+ except Exception as e:
+     print(f"Error creating repository: {e}")
+
+ # Upload the model to Hugging Face
+ try:
+     upload_folder(
+         folder_path=output_dir,
+         repo_id=repo_id,
+         repo_type="model",
+         commit_message="Upload fine-tuned LLaMA-3-8B CEFR model"
+     )
+     print(f"Model uploaded successfully to https://huggingface.co/{repo_id}")
+ except Exception as e:
+     print(f"Error uploading model: {e}")
+     raise
+
+ # Create and upload a model card
+ model_card = """
+ # Fine-Tuned LLaMA-3-8B CEFR Model
+
+ This is a fine-tuned version of `unsloth/llama-3-8b-instruct-bnb-4bit` for CEFR-level sentence generation.
+
+ - **Base Model**: unsloth/llama-3-8b-instruct-bnb-4bit
+ - **Fine-Tuning**: LoRA with SMOTE-balanced dataset
+ - **Training Details**:
+   - Dataset: CEFR-level sentences with SMOTE and undersampling for balance
+   - LoRA Parameters: r=32, lora_alpha=32, lora_dropout=0.5
+   - Training Args: learning_rate=2e-5, batch_size=8, epochs=0.1, cosine scheduler
+   - Optimizer: adamw_8bit
+   - Early Stopping: Patience=3, threshold=0.01
+ - **Usage**:
+   ```python
+   from transformers import AutoModelForCausalLM, AutoTokenizer
+
+   model = AutoModelForCausalLM.from_pretrained("Mr-FineTuner/Test___01")
+   tokenizer = AutoTokenizer.from_pretrained("Mr-FineTuner/Test___01")
+
+   # Example inference
+   prompt = "<|user|>Generate a CEFR B1 level sentence.<|end|>"
+   inputs = tokenizer(prompt, return_tensors="pt")
+   outputs = model.generate(**inputs, max_length=50)
+   print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+   ```
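+
+   If a GPU is available, the same prompt can also be generated with sampling. This is a
+   minimal sketch; it assumes `accelerate` is installed so that `device_map="auto"` works:
+
+   ```python
+   from transformers import AutoModelForCausalLM, AutoTokenizer
+
+   model = AutoModelForCausalLM.from_pretrained("Mr-FineTuner/Test___01", device_map="auto")
+   tokenizer = AutoTokenizer.from_pretrained("Mr-FineTuner/Test___01")
+
+   prompt = "<|user|>Generate a CEFR B1 level sentence.<|end|>"
+   inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+   # max_new_tokens bounds only the generated continuation; sampling adds variety
+   outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)
+   print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+   ```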
390
+
391
+ Uploaded using `huggingface_hub`.
392
+ """
393
+ try:
394
+ with open(f"{output_dir}/README.md", "w") as f:
395
+ f.write(model_card)
396
+ upload_folder(
397
+ folder_path=output_dir,
398
+ repo_id=repo_id,
399
+ repo_type="model",
400
+ commit_message="Add model card"
401
+ )
402
+ print(f"Model card uploaded successfully!")
403
+ except Exception as e:
404
+ print(f"Error uploading model card: {e}")
405
+ raise