Esha committed on
Commit
4be4d35
·
1 Parent(s): b541b22

Fine-tune script, evaluation pipeline, and dataset updates

Browse files
configs/train_config.yaml CHANGED
@@ -1,4 +1,4 @@
1
- model_name: "llama-base"
2
  learning_rate: 5e-5
3
  batch_size: 16
4
  num_epochs: 3
 
1
model_name: "gpt2"
learning_rate: 5e-5
batch_size: 16
num_epochs: 3
# Keys below are read by src/model/train.py (config["dataset_path"],
# config["output_dir"]) and src/model/evaluate.py (config["output_dir"]);
# without them both scripts crash with a KeyError at startup.
dataset_path: "data/processed/train.json"  # TODO(review): confirm actual training-data path
output_dir: "models/llm-finetuned"         # TODO(review): confirm desired checkpoint directory
data/processed/test.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {"text": "This is a test sentence to evaluate the model."}
2
+ {"text": "Another example test input for language generation."}
3
+ {"text": "Testing the fine-tuned model with unseen data."}
demo_app/app.py CHANGED
@@ -4,14 +4,34 @@ from transformers import pipeline
4
  st.title("Fine-tuned LLM Demo")
5
 
6
  # Load a model pipeline (replace 'gpt2' with your actual model repo id if needed)
7
- generator = pipeline('text-generation', model='gpt2')
8
 
 
 
9
  prompt = st.text_input("Enter prompt:")
10
 
11
- # Add a slider to select number of responses
12
  num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)
13
 
 
 
 
14
  if prompt:
15
- results = generator(prompt, max_length=100, num_return_sequences=num_responses)
 
 
 
 
 
 
 
 
 
16
  for i, result in enumerate(results):
17
  st.write(f"Output {i+1}: {result['generated_text']}")
 
 
 
 
 
 
 
4
st.title("Fine-tuned LLM Demo")

# Load the fine-tuned model pipeline from the Hugging Face Hub.
generator = pipeline('text-generation', model='promptsbyesha/llm-finetuned-model')


# Prompt input
prompt = st.text_input("Enter prompt:")

# Slider to select number of responses
num_responses = st.slider("Number of responses", min_value=1, max_value=5, value=1)

# Additional context input (optional)
extra_context = st.text_area("Additional context (optional):", "")

if prompt:
    # Step 1: Display original prompt and additional context
    st.markdown(f"**Step 1: Prompt:** {prompt}")
    if extra_context.strip():
        st.markdown(f"**Step 1b: Context:** {extra_context}")

    # Step 2: Combine prompt and context for model inference
    full_prompt = prompt + " " + extra_context if extra_context.strip() else prompt
    # Greedy decoding can only ever produce one sequence; transformers raises
    # ValueError for num_return_sequences > 1 unless sampling is enabled, so
    # turn sampling on whenever the slider asks for multiple responses.
    results = generator(
        full_prompt,
        max_length=100,
        num_return_sequences=num_responses,
        do_sample=num_responses > 1,
    )

    # Step 3: Display generated outputs
    for i, result in enumerate(results):
        st.write(f"Output {i+1}: {result['generated_text']}")

    # Step 4: Example post-processing (uppercase conversion as placeholder)
    processed_outputs = [result['generated_text'].upper() for result in results]
    st.markdown("**Step 4: Post-processed Outputs:**")
    for i, output in enumerate(processed_outputs):
        st.write(f"Processed Output {i+1}: {output}")
src/model/evaluate.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
2
+ from datasets import load_dataset
3
+ import yaml
4
+
5
def load_config(config_path):
    """Load a YAML configuration file.

    Args:
        config_path: Path to the YAML file to read.

    Returns:
        The parsed configuration (typically a dict of training settings).
    """
    # `yaml` is already imported at module level; the previous redundant
    # function-local `import yaml` has been removed.
    with open(config_path, "r") as f:
        return yaml.safe_load(f)
9
+
10
def main():
    """Evaluate the fine-tuned causal LM on the held-out test set.

    Loads the model and tokenizer from the training output directory named in
    configs/train_config.yaml, tokenizes data/processed/test.json with
    labels = input_ids (causal-LM loss), and prints the Trainer's evaluation
    metrics (e.g. eval_loss).
    """
    config = load_config("configs/train_config.yaml")

    model_dir = config["output_dir"]  # Fine-tuned model output directory
    test_dataset_path = "data/processed/test.json"  # Held-out test data

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    model = AutoModelForCausalLM.from_pretrained(model_dir)

    # GPT-2-style tokenizers ship without a pad token; padding="max_length"
    # below would fail without this fallback. Mirrors the identical guard in
    # src/model/train.py for consistency.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    dataset = load_dataset("json", data_files={"test": test_dataset_path})

    def tokenize_function(examples):
        # Labels duplicate input_ids so the Trainer computes causal-LM loss.
        tokenized = tokenizer(
            examples["text"],
            truncation=True,
            padding="max_length",
            max_length=512,
        )
        tokenized["labels"] = tokenized["input_ids"].copy()
        return tokenized

    tokenized_dataset = dataset.map(tokenize_function, batched=True)

    training_args = TrainingArguments(
        output_dir="./eval_output",
        per_device_eval_batch_size=8,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
    )

    eval_result = trainer.evaluate()
    print(f"Evaluation results: {eval_result}")


if __name__ == "__main__":
    main()
src/model/train.py CHANGED
@@ -2,11 +2,11 @@ import torch
2
  from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
3
  from datasets import load_dataset
4
  import os
5
- import json
6
 
7
  def load_config(config_path):
8
  with open(config_path, "r") as f:
9
- return json.load(f)
10
 
11
  def main():
12
  config = load_config("configs/train_config.yaml")
@@ -14,23 +14,38 @@ def main():
14
  model_name = config["model_name"]
15
  dataset_path = config["dataset_path"]
16
  output_dir = config["output_dir"]
17
- learning_rate = config["learning_rate"]
18
  batch_size = config["batch_size"]
19
  num_epochs = config["num_epochs"]
20
 
21
  tokenizer = AutoTokenizer.from_pretrained(model_name)
 
 
 
 
 
22
  model = AutoModelForCausalLM.from_pretrained(model_name)
23
 
24
  # Example dataset loading; replace with your data loading pipeline
25
  dataset = load_dataset("json", data_files={"train": dataset_path})
 
26
  def tokenize_function(examples):
27
- return tokenizer(examples['text'], truncation=True, padding="max_length", max_length=512)
 
 
 
 
 
 
 
 
 
28
  tokenized_dataset = dataset.map(tokenize_function, batched=True)
29
 
30
  training_args = TrainingArguments(
31
  output_dir=output_dir,
32
  overwrite_output_dir=True,
33
- evaluation_strategy="epoch",
34
  learning_rate=learning_rate,
35
  per_device_train_batch_size=batch_size,
36
  num_train_epochs=num_epochs,
@@ -47,7 +62,8 @@ def main():
47
  )
48
 
49
  trainer.train()
50
- trainer.save_model(output_dir)
 
51
 
52
  if __name__ == "__main__":
53
- main()
 
2
  from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
3
  from datasets import load_dataset
4
  import os
5
+ import yaml
6
 
7
def load_config(config_path):
    """Read the YAML training configuration and return its parsed contents."""
    with open(config_path, "r") as cfg_file:
        return yaml.safe_load(cfg_file)
10
 
11
  def main():
12
  config = load_config("configs/train_config.yaml")
 
14
  model_name = config["model_name"]
15
  dataset_path = config["dataset_path"]
16
  output_dir = config["output_dir"]
17
+ learning_rate = float(config["learning_rate"]) # Convert to float
18
  batch_size = config["batch_size"]
19
  num_epochs = config["num_epochs"]
20
 
21
  tokenizer = AutoTokenizer.from_pretrained(model_name)
22
+
23
+ # Fix for tokenizers without a pad_token (e.g., GPT-2)
24
+ if tokenizer.pad_token is None:
25
+ tokenizer.pad_token = tokenizer.eos_token
26
+
27
  model = AutoModelForCausalLM.from_pretrained(model_name)
28
 
29
  # Example dataset loading; replace with your data loading pipeline
30
  dataset = load_dataset("json", data_files={"train": dataset_path})
31
+
32
  def tokenize_function(examples):
33
+ tokenized = tokenizer(
34
+ examples['text'],
35
+ truncation=True,
36
+ padding="max_length",
37
+ max_length=512
38
+ )
39
+ # Add labels identical to input_ids for causal LM loss computation
40
+ tokenized["labels"] = tokenized["input_ids"].copy()
41
+ return tokenized
42
+
43
  tokenized_dataset = dataset.map(tokenize_function, batched=True)
44
 
45
  training_args = TrainingArguments(
46
  output_dir=output_dir,
47
  overwrite_output_dir=True,
48
+ # Removed evaluation_strategy to avoid error
49
  learning_rate=learning_rate,
50
  per_device_train_batch_size=batch_size,
51
  num_train_epochs=num_epochs,
 
62
  )
63
 
64
  trainer.train()
65
+ trainer.save_model(output_dir) # Saves model files like pytorch_model.bin, config.json
66
+ tokenizer.save_pretrained(output_dir) # Saves tokenizer files like tokenizer_config.json, vocab files
67
 
68
  if __name__ == "__main__":
69
+ main()