Esha committed on
Commit ·
4be4d35
1
Parent(s): b541b22
Fine-tune script, evaluation pipeline, and dataset updates
Browse files- configs/train_config.yaml +1 -1
- data/processed/test.json +3 -0
- demo_app/app.py +23 -3
- src/model/evaluate.py +44 -0
- src/model/train.py +23 -7
configs/train_config.yaml
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
model_name: "
|
| 2 |
learning_rate: 5e-5
|
| 3 |
batch_size: 16
|
| 4 |
num_epochs: 3
|
|
|
|
| 1 |
+
model_name: "gpt2"
|
| 2 |
learning_rate: 5e-5
|
| 3 |
batch_size: 16
|
| 4 |
num_epochs: 3
|
data/processed/test.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"text": "This is a test sentence to evaluate the model."}
|
| 2 |
+
{"text": "Another example test input for language generation."}
|
| 3 |
+
{"text": "Testing the fine-tuned model with unseen data."}
|
demo_app/app.py
CHANGED
|
@@ -4,14 +4,34 @@ from transformers import pipeline
|
|
| 4 |
st.title("Fine-tuned LLM Demo")
|
| 5 |
|
| 6 |
# Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
|
| 7 |
-
generator = pipeline('text-generation', model='
|
| 8 |
|
|
|
|
|
|
|
| 9 |
prompt = st.text_input("Enter prompt:")
|
| 10 |
|
| 11 |
-
#
|
| 12 |
num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)
|
| 13 |
|
|
|
|
|
|
|
|
|
|
| 14 |
if prompt:
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
for i, result in enumerate(results):
|
| 17 |
st.write(f"Output {i+1}: {result['generated_text']}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
st.title("Fine-tuned LLM Demo")

# Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
generator = pipeline('text-generation', model='promptsbyesha/llm-finetuned-model')


# Prompt input
prompt = st.text_input("Enter prompt:")

# Slider to select number of responses
num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)

# Additional context input (optional)
extra_context = st.text_area("Additional context (optional):", "")

if prompt:
    # Step 1: Display original prompt and additional context
    st.markdown(f"**Step 1: Prompt:** {prompt}")
    if extra_context.strip():
        st.markdown(f"**Step 1b: Context:** {extra_context}")

    # Step 2: Combine prompt and context for model inference.
    # do_sample=True is required: greedy decoding raises an error when
    # num_return_sequences > 1.
    full_prompt = prompt + " " + extra_context if extra_context.strip() else prompt
    results = generator(
        full_prompt,
        max_length=100,
        num_return_sequences=num_responses,
        do_sample=True,
    )

    # Step 3: Display generated outputs
    for i, result in enumerate(results):
        st.write(f"Output {i+1}: {result['generated_text']}")

    # Step 4: Example post-processing (uppercase conversion as placeholder)
    processed_outputs = [result['generated_text'].upper() for result in results]
    st.markdown("**Step 4: Post-processed Outputs:**")
    for i, output in enumerate(processed_outputs):
        st.write(f"Processed Output {i+1}: {output}")
src/model/evaluate.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
| 2 |
+
from datasets import load_dataset
|
| 3 |
+
import yaml
|
| 4 |
+
|
def load_config(config_path):
    """Load a YAML configuration file and return its contents as a dict.

    Args:
        config_path: Path to the YAML file (e.g. "configs/train_config.yaml").

    Returns:
        The parsed configuration (typically a dict of hyperparameters).
    """
    # yaml is already imported at module level; the redundant
    # function-local `import yaml` from the original is dropped.
    with open(config_path, "r") as f:
        return yaml.safe_load(f)
| 9 |
+
|
def main():
    """Evaluate the fine-tuned causal LM on the held-out test set.

    Loads the model saved by the training script (config["output_dir"]),
    tokenizes data/processed/test.json, and prints Trainer.evaluate()
    metrics (eval_loss, runtime, etc.).
    """
    config = load_config("configs/train_config.yaml")

    model_dir = config["output_dir"]  # Use fine-tuned model output directory
    test_dataset_path = "data/processed/test.json"  # Adjust path to your test data

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    # Consistent with train.py: GPT-2-style tokenizers ship without a pad
    # token, and padding="max_length" below fails without one.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    dataset = load_dataset("json", data_files={"test": test_dataset_path})

    def tokenize_function(examples):
        # Labels mirror input_ids so the Trainer computes causal-LM loss.
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./eval_output",
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")

if __name__ == "__main__":
    main()
src/model/train.py
CHANGED
|
@@ -2,11 +2,11 @@ import torch
|
|
| 2 |
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
|
| 3 |
from datasets import load_dataset
|
| 4 |
import os
|
| 5 |
-
import
|
| 6 |
|
| 7 |
def load_config(config_path):
|
| 8 |
with open(config_path, "r") as f:
|
| 9 |
-
return
|
| 10 |
|
| 11 |
def main():
|
| 12 |
config = load_config("configs/train_config.yaml")
|
|
@@ -14,23 +14,38 @@ def main():
|
|
| 14 |
model_name = config["model_name"]
|
| 15 |
dataset_path = config["dataset_path"]
|
| 16 |
output_dir = config["output_dir"]
|
| 17 |
-
learning_rate = config["learning_rate"]
|
| 18 |
batch_size = config["batch_size"]
|
| 19 |
num_epochs = config["num_epochs"]
|
| 20 |
|
| 21 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
| 23 |
|
| 24 |
# Example dataset loading; replace with your data loading pipeline
|
| 25 |
dataset = load_dataset("json", data_files={"train": dataset_path})
|
|
|
|
| 26 |
def tokenize_function(examples):
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
tokenized_dataset = dataset.map(tokenize_function, batched=True)
|
| 29 |
|
| 30 |
training_args = TrainingArguments(
|
| 31 |
output_dir=output_dir,
|
| 32 |
overwrite_output_dir=True,
|
| 33 |
-
evaluation_strategy
|
| 34 |
learning_rate=learning_rate,
|
| 35 |
per_device_train_batch_size=batch_size,
|
| 36 |
num_train_epochs=num_epochs,
|
|
@@ -47,7 +62,8 @@ def main():
|
|
| 47 |
)
|
| 48 |
|
| 49 |
trainer.train()
|
| 50 |
-
trainer.save_model(output_dir)
|
|
|
|
| 51 |
|
| 52 |
if __name__ == "__main__":
|
| 53 |
-
main()
|
|
|
|
| 2 |
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
|
| 3 |
from datasets import load_dataset
|
| 4 |
import os
|
| 5 |
+
import yaml
|
| 6 |
|
def load_config(config_path):
    """Parse the YAML training configuration file and return it as a dict."""
    with open(config_path, "r") as cfg_file:
        parsed = yaml.safe_load(cfg_file)
    return parsed
| 10 |
|
| 11 |
def main():
|
| 12 |
config = load_config("configs/train_config.yaml")
|
|
|
|
| 14 |
model_name = config["model_name"]
|
| 15 |
dataset_path = config["dataset_path"]
|
| 16 |
output_dir = config["output_dir"]
|
| 17 |
+
learning_rate = float(config["learning_rate"]) # Convert to float
|
| 18 |
batch_size = config["batch_size"]
|
| 19 |
num_epochs = config["num_epochs"]
|
| 20 |
|
| 21 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 22 |
+
|
| 23 |
+
# Fix for tokenizers without a pad_token (e.g., GPT-2)
|
| 24 |
+
if tokenizer.pad_token is None:
|
| 25 |
+
tokenizer.pad_token = tokenizer.eos_token
|
| 26 |
+
|
| 27 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
| 28 |
|
| 29 |
# Example dataset loading; replace with your data loading pipeline
|
| 30 |
dataset = load_dataset("json", data_files={"train": dataset_path})
|
| 31 |
+
|
| 32 |
def tokenize_function(examples):
|
| 33 |
+
tokenized = tokenizer(
|
| 34 |
+
examples['text'],
|
| 35 |
+
truncation=True,
|
| 36 |
+
padding="max_length",
|
| 37 |
+
max_length=512
|
| 38 |
+
)
|
| 39 |
+
# Add labels identical to input_ids for causal LM loss computation
|
| 40 |
+
tokenized["labels"] = tokenized["input_ids"].copy()
|
| 41 |
+
return tokenized
|
| 42 |
+
|
| 43 |
tokenized_dataset = dataset.map(tokenize_function, batched=True)
|
| 44 |
|
| 45 |
training_args = TrainingArguments(
|
| 46 |
output_dir=output_dir,
|
| 47 |
overwrite_output_dir=True,
|
| 48 |
+
# Removed evaluation_strategy to avoid error
|
| 49 |
learning_rate=learning_rate,
|
| 50 |
per_device_train_batch_size=batch_size,
|
| 51 |
num_train_epochs=num_epochs,
|
|
|
|
| 62 |
)
|
| 63 |
|
| 64 |
trainer.train()
|
| 65 |
+
trainer.save_model(output_dir) # Saves model files like pytorch_model.bin, config.json
|
| 66 |
+
tokenizer.save_pretrained(output_dir) # Saves tokenizer files like tokenizer_config.json, vocab files
|
| 67 |
|
| 68 |
if __name__ == "__main__":
|
| 69 |
+
main()
|