# Fine-Tuned LLaMA-3-8B CEFR Model

This is a fine-tuned version of `unsloth/llama-3-8b-instruct-bnb-4bit` for CEFR-level sentence generation.

- **Base Model**: unsloth/llama-3-8b-instruct-bnb-4bit
- **Fine-Tuning**: LoRA with a SMOTE-balanced dataset
- **Training Details**:
  - Dataset: CEFR-level sentences, balanced with SMOTE oversampling and undersampling
  - LoRA Parameters: r=32, lora_alpha=32, lora_dropout=0.5
  - Training Args: learning_rate=2e-5, batch_size=8, epochs=0.1, cosine scheduler
  - Optimizer: adamw_8bit
  - Early Stopping: patience=3, threshold=0.01
- **Usage**:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Mr-FineTuner/Test___01")
tokenizer = AutoTokenizer.from_pretrained("Mr-FineTuner/Test___01")

# Example inference, using the same prompt format as during fine-tuning
prompt = "<|user|>\nGenerate a CEFR B1 level sentence.<|end|>\n<|assistant|>\n"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
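Because the base checkpoint is 4-bit quantized, GPU memory can also be saved at inference time by loading the uploaded model in 4-bit. A minimal sketch, assuming `bitsandbytes` is installed and a CUDA GPU is available (the quantization settings below are illustrative, not values stored in the checkpoint):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantize the weights to 4-bit NF4 on load to reduce GPU memory usage
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(
    "Mr-FineTuner/Test___01",
    quantization_config=bnb_config,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("Mr-FineTuner/Test___01")
```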
Uploaded using `huggingface_hub`. The full training script used to produce and upload this model follows.

````python
import unsloth
from unsloth import FastLanguageModel, is_bfloat16_supported
import torch
import pandas as pd
from datasets import Dataset
from sklearn.utils import resample
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
from sentence_transformers import SentenceTransformer
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
import numpy as np
import wandb
import os
from huggingface_hub import create_repo, upload_folder

# Verify environment
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Cell 1: Load model and tokenizer
max_seq_length = 2048
dtype = None
load_in_4bit = True
try:
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/llama-3-8b-instruct-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        use_exact_model_name=True,
        device_map="auto"
    )
    print("Model and tokenizer loaded successfully with Unsloth!")
except Exception as e:
    print(f"Error loading model with Unsloth: {e}")
    print("Falling back to transformers...")
    model_name = "unsloth/llama-3-8b-instruct-bnb-4bit"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        load_in_4bit=True,
        device_map="auto"
    )
    print("Model and tokenizer loaded with transformers!")

# Cell 2: Configure LoRA
try:
    model = FastLanguageModel.get_peft_model(
        model,
        r=32,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=32,
        lora_dropout=0.5,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )
    print("LoRA configuration applied successfully!")
except Exception as e:
    print(f"Error applying LoRA: {e}")
    raise

# Cell 3: Load datasets
train_file = "train_merged_output.txt"
val_file = "dev_merged_output.txt"
test_file = "test_merged_output.txt"

cefr_mapping = {1: "A1", 2: "A2", 3: "B1", 4: "B2", 5: "C1", 6: "C2"}

def load_and_reformat(file_path):
    """Read a tab-separated file (a sentence followed by one or more numeric CEFR levels)
    and expand it into one (sentence, level) row per annotation."""
    try:
        with open(file_path, "r") as f:
            lines = f.readlines()
        reformatted_data = []
        for line in lines:
            parts = line.strip().split("\t")
            sentence = parts[0]
            levels = parts[1:]
            for level in levels:
                level_int = int(level)
                cefr_level = cefr_mapping.get(level_int, "Unknown")
                reformatted_data.append({"sentence": sentence, "level": cefr_level})
        return pd.DataFrame(reformatted_data)
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        raise

train_dataset = load_and_reformat(train_file)
val_dataset = load_and_reformat(val_file)
test_dataset = load_and_reformat(test_file)

print("Train dataset - Column names:", train_dataset.columns.tolist())
print("Train dataset - First 5 rows:\n", train_dataset.head())
print("Validation dataset - First 5 rows:\n", val_dataset.head())
print("Test dataset - First 5 rows:\n", test_dataset.head())

expected_columns = {"sentence", "level"}
for name, dataset in [("Train", train_dataset), ("Validation", val_dataset), ("Test", test_dataset)]:
    if not expected_columns.issubset(dataset.columns):
        missing = expected_columns - set(dataset.columns)
        print(f"Warning: {name} dataset missing expected columns: {missing}")

# Cell 4: Rename columns (the mapping is an identity, so no columns actually change)
column_mapping = {"sentence": "sentence", "level": "level"}
train_dataset = train_dataset.rename(columns=column_mapping)
val_dataset = val_dataset.rename(columns=column_mapping)
test_dataset = test_dataset.rename(columns=column_mapping)

print("Train dataset - Renamed column names:", train_dataset.columns.tolist())
print("Train dataset - First row after renaming:\n", train_dataset.head(1))

# Cell 5: Convert to HF Dataset and format
train_dataset_hf = Dataset.from_pandas(train_dataset)
val_dataset_hf = Dataset.from_pandas(val_dataset)
test_dataset_hf = Dataset.from_pandas(test_dataset)

def format_func(example):
    return {
        "text": (
            f"<|user|>\nGenerate a CEFR {example['level']} level sentence.<|end|>\n"
            f"<|assistant|>\n{example['sentence']}<|end|>\n"
        ),
        "level": example['level']
    }

train_dataset_transformed = train_dataset_hf.map(format_func)
val_dataset_transformed = val_dataset_hf.map(format_func)
test_dataset_transformed = test_dataset_hf.map(format_func)

train_dataset_transformed = train_dataset_transformed.remove_columns(['sentence'])
val_dataset_transformed = val_dataset_transformed.remove_columns(['sentence'])
test_dataset_transformed = test_dataset_transformed.remove_columns(['sentence'])

print("Train dataset columns after transformation:", train_dataset_transformed.column_names)
print("Example transformed text:", train_dataset_transformed[0]["text"])
print("Train CEFR distribution:\n", train_dataset["level"].value_counts())
print("Validation CEFR distribution:\n", val_dataset["level"].value_counts())
print("Test CEFR distribution:\n", test_dataset["level"].value_counts())

# Cell 6: Rebalance validation and test sets to match the training CEFR proportions
train_proportions = {
    'A1': 0.0346, 'A2': 0.1789, 'B1': 0.3454,
    'B2': 0.3101, 'C1': 0.1239, 'C2': 0.0072
}

def rebalance_dataset(df, total_samples, proportions, random_state=3407):
    resampled_dfs = []
    for level, proportion in proportions.items():
        level_df = df[df['level'] == level]
        if len(level_df) == 0:
            continue  # level absent from this split; nothing to resample
        n_samples = int(total_samples * proportion)
        if len(level_df) > n_samples:
            # downsample without replacement
            level_df_resampled = resample(level_df, n_samples=n_samples, replace=False, random_state=random_state)
        else:
            # upsample with replacement
            level_df_resampled = resample(level_df, n_samples=n_samples, replace=True, random_state=random_state)
        resampled_dfs.append(level_df_resampled)
    return pd.concat(resampled_dfs).sample(frac=1, random_state=random_state).reset_index(drop=True)

val_df = val_dataset.copy()
new_val_df = rebalance_dataset(val_df, len(val_df), train_proportions)
new_val_dataset = Dataset.from_pandas(new_val_df)
new_val_dataset_transformed = new_val_dataset.map(format_func)
new_val_dataset_transformed = new_val_dataset_transformed.remove_columns(['sentence'])

test_df = test_dataset.copy()
new_test_df = rebalance_dataset(test_df, len(test_df), train_proportions)
new_test_dataset = Dataset.from_pandas(new_test_df)
new_test_dataset_transformed = new_test_dataset.map(format_func)
new_test_dataset_transformed = new_test_dataset_transformed.remove_columns(['sentence'])

print("New Validation CEFR distribution:\n", new_val_df["level"].value_counts(normalize=True))
print("New Test CEFR distribution:\n", new_test_df["level"].value_counts(normalize=True))

# Cell 7: Apply SMOTE and undersampling to balance the training dataset
evaluator_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

def apply_smote_to_dataset(df, target_proportions, random_state=3407):
    print("Generating sentence embeddings...")
    embeddings = evaluator_model.encode(df["sentence"].tolist(), show_progress_bar=True)
    level_to_idx = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
    labels = df["level"].map(level_to_idx).values
    class_counts = df["level"].value_counts().to_dict()
    print("Original class counts:", class_counts)
    total_samples = len(df)
    target_samples = {
        level_to_idx[level]: max(int(total_samples * proportion), class_counts.get(level, 0))
        for level, proportion in target_proportions.items()
    }
    print("Target sample counts:", target_samples)
    pipeline = Pipeline([
        ('oversample', SMOTE(sampling_strategy=target_samples, random_state=random_state)),
        ('undersample', RandomUnderSampler(sampling_strategy=target_samples, random_state=random_state))
    ])
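    # Note: because each target count is clamped to at least the current class count,
    # SMOTE only oversamples under-represented levels up to their targets; the
    # RandomUnderSampler step then finds every class already at its target count,
    # so it effectively passes the data through unchanged.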
| print("Applying SMOTE and undersampling...") | |
| X_resampled, y_resampled = pipeline.fit_resample(embeddings, labels) | |
| idx_to_level = {v: k for k, v in level_to_idx.items()} | |
| resampled_data = [] | |
| for embedding, label in zip(X_resampled, y_resampled): | |
| # Find the closest original embedding | |
| distances = np.linalg.norm(embeddings - embedding, axis=1) | |
| closest_idx = np.argmin(distances) | |
| sentence = df.iloc[closest_idx]["sentence"] | |
| resampled_data.append({ | |
| "sentence": sentence, | |
| "level": idx_to_level[label] | |
| }) | |
| return pd.DataFrame(resampled_data) | |
| train_dataset_smote = apply_smote_to_dataset(train_dataset, train_proportions) | |
| train_dataset_hf = Dataset.from_pandas(train_dataset_smote) | |
| train_dataset_transformed = train_dataset_hf.map(format_func) | |
| train_dataset_transformed = train_dataset_transformed.remove_columns(['sentence']) | |
| print("SMOTE-balanced Train CEFR distribution:\n", train_dataset_smote["level"].value_counts(normalize=True)) | |

# Cell 8: Training setup
wandb.init(
    project="Phi-3-CEFR-finetuning_v3",
    config={
        "model": "unsloth/llama-3-8b-instruct-bnb-4bit",
        "strategy": "gradient_checkpointing",
        "learning_rate": 2e-5,
        "batch_size": 8,
        "lora_dropout": 0.5,
    },
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset_transformed.shuffle(seed=3407),
    eval_dataset=new_val_dataset_transformed,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01),
    ],
    args=TrainingArguments(
        per_device_train_batch_size=8,
        gradient_accumulation_steps=1,
        warmup_ratio=0.1,
        num_train_epochs=0.1,  # partial run: roughly 10% of one epoch
        learning_rate=2e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        optim="adamw_8bit",
        weight_decay=0.3,
        lr_scheduler_type="cosine",
        eval_strategy="steps",
        eval_steps=200,
        save_strategy="steps",
        save_steps=200,
        output_dir="outputs",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
        seed=3407,
        report_to="wandb",
        run_name="phi3-cefr-lora-v14",
        gradient_checkpointing=True,
    ),
)

# Cell 9: Training and test evaluation
try:
    trainer_stats = trainer.train()
    print("Training completed successfully!")
    print("Trainer stats:", trainer_stats)
except Exception as e:
    print(f"Error during training: {e}")
    raise

# Tokenize the test dataset
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, max_length=max_seq_length, padding=False)

new_test_dataset_tokenized = new_test_dataset_transformed.map(tokenize_function, batched=True)
new_test_dataset_tokenized = new_test_dataset_tokenized.remove_columns(['text'])
print("Test dataset structure:", new_test_dataset_tokenized[0])

# Evaluate on the tokenized test dataset
try:
    eval_results = trainer.evaluate(new_test_dataset_tokenized)
    print("Test evaluation results:", eval_results)
except Exception as e:
    print(f"Error during evaluation: {e}")
    raise
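
# Optional sanity check before saving (a minimal sketch; the decoding settings are
# illustrative): generate one sentence per CEFR level with the training prompt format.
model.eval()
for level in ["A1", "A2", "B1", "B2", "C1", "C2"]:
    prompt = f"<|user|>\nGenerate a CEFR {level} level sentence.<|end|>\n<|assistant|>\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=40)
    generated = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"{level}: {generated}")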

# Cell 10: Save and upload the model to Hugging Face
# Save the fine-tuned model locally
output_dir = "./fine_tuned_model"
try:
    model = model.merge_and_unload()  # Merge the LoRA weights into the base model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
    print(f"Model and tokenizer saved locally to {output_dir}")
except Exception as e:
    print(f"Error saving model locally: {e}")
    raise

# Create a new repository on Hugging Face
repo_id = "Mr-FineTuner/Test___01"
try:
    create_repo(repo_id, private=False, exist_ok=True)  # Set private=True for a private repo
    print(f"Repository {repo_id} created successfully!")
except Exception as e:
    print(f"Error creating repository: {e}")

# Upload the model to Hugging Face
try:
    upload_folder(
        folder_path=output_dir,
        repo_id=repo_id,
        repo_type="model",
        commit_message="Upload fine-tuned LLaMA-3-8B CEFR model"
    )
    print(f"Model uploaded successfully to https://huggingface.co/{repo_id}")
except Exception as e:
    print(f"Error uploading model: {e}")
    raise

# Create and upload a model card
# (raw string so the \n escapes in the usage example below stay literal in the README)
model_card = r"""
# Fine-Tuned LLaMA-3-8B CEFR Model

This is a fine-tuned version of `unsloth/llama-3-8b-instruct-bnb-4bit` for CEFR-level sentence generation.

- **Base Model**: unsloth/llama-3-8b-instruct-bnb-4bit
- **Fine-Tuning**: LoRA with a SMOTE-balanced dataset
- **Training Details**:
  - Dataset: CEFR-level sentences, balanced with SMOTE oversampling and undersampling
  - LoRA Parameters: r=32, lora_alpha=32, lora_dropout=0.5
  - Training Args: learning_rate=2e-5, batch_size=8, epochs=0.1, cosine scheduler
  - Optimizer: adamw_8bit
  - Early Stopping: patience=3, threshold=0.01
- **Usage**:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Mr-FineTuner/Test___01")
tokenizer = AutoTokenizer.from_pretrained("Mr-FineTuner/Test___01")

# Example inference, using the same prompt format as during fine-tuning
prompt = "<|user|>\nGenerate a CEFR B1 level sentence.<|end|>\n<|assistant|>\n"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

Uploaded using `huggingface_hub`.
"""
try:
    with open(f"{output_dir}/README.md", "w") as f:
        f.write(model_card)
    upload_folder(
        folder_path=output_dir,
        repo_id=repo_id,
        repo_type="model",
        commit_message="Add model card"
    )
    print("Model card uploaded successfully!")
except Exception as e:
    print(f"Error uploading model card: {e}")
    raise
````