Update app.py
Browse files
app.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
|
|
| 1 |
from datasets import load_dataset
|
| 2 |
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
| 3 |
|
| 4 |
# Load the dataset
|
| 5 |
dataset = load_dataset("json", data_files="dataset.jsonl")
|
| 6 |
|
| 7 |
-
# Load the model and tokenizer
|
| 8 |
model_name = "Salesforce/codegen-2B-multi"
|
| 9 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
| 10 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
@@ -21,13 +22,14 @@ training_args = TrainingArguments(
|
|
| 21 |
overwrite_output_dir=True,
|
| 22 |
evaluation_strategy="epoch",
|
| 23 |
learning_rate=5e-5,
|
| 24 |
-
per_device_train_batch_size=
|
| 25 |
num_train_epochs=3,
|
| 26 |
save_strategy="epoch",
|
| 27 |
logging_dir="./logs",
|
|
|
|
| 28 |
)
|
| 29 |
|
| 30 |
-
#
|
| 31 |
trainer = Trainer(
|
| 32 |
model=model,
|
| 33 |
args=training_args,
|
|
@@ -35,7 +37,29 @@ trainer = Trainer(
|
|
| 35 |
eval_dataset=tokenized_dataset["train"],
|
| 36 |
)
|
| 37 |
|
|
|
|
| 38 |
trainer.train()
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
from datasets import load_dataset
|
| 3 |
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
|
| 4 |
|
| 5 |
# Load the dataset
|
| 6 |
dataset = load_dataset("json", data_files="dataset.jsonl")
|
| 7 |
|
| 8 |
+
# Load the pre-trained model and tokenizer
|
| 9 |
model_name = "Salesforce/codegen-2B-multi"
|
| 10 |
model = AutoModelForCausalLM.from_pretrained(model_name)
|
| 11 |
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
|
|
|
| 22 |
overwrite_output_dir=True,
|
| 23 |
evaluation_strategy="epoch",
|
| 24 |
learning_rate=5e-5,
|
| 25 |
+
per_device_train_batch_size=2,
|
| 26 |
num_train_epochs=3,
|
| 27 |
save_strategy="epoch",
|
| 28 |
logging_dir="./logs",
|
| 29 |
+
logging_strategy="epoch",
|
| 30 |
)
|
| 31 |
|
| 32 |
+
# Trainer setup
|
| 33 |
trainer = Trainer(
|
| 34 |
model=model,
|
| 35 |
args=training_args,
|
|
|
|
| 37 |
eval_dataset=tokenized_dataset["train"],
|
| 38 |
)
|
| 39 |
|
| 40 |
+
# Train the model
|
| 41 |
trainer.train()
|
| 42 |
+
|
| 43 |
+
# Save the fine-tuned model
|
| 44 |
+
trainer.save_model("./fine_tuned_model")
|
| 45 |
+
tokenizer.save_pretrained("./fine_tuned_model")
|
| 46 |
+
|
| 47 |
+
# Load the fine-tuned model for inference
|
| 48 |
+
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
|
| 49 |
+
fine_tuned_tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_model")
|
| 50 |
+
|
| 51 |
+
# Generate Cypress code from a prompt using the fine-tuned model
|
| 52 |
+
def generate_cypress_code(prompt):
    """Generate Cypress test code for a prompt with the fine-tuned model.

    Args:
        prompt: Natural-language description of the test to generate.

    Returns:
        The decoded text of the first generated sequence with special
        tokens stripped, or an empty string when the prompt is empty or
        whitespace-only.
    """
    # An empty prompt tokenizes to zero tokens, which generate() cannot
    # continue from; answer with an empty result instead of erroring in the UI.
    if not prompt or not prompt.strip():
        return ""

    inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
    outputs = fine_tuned_model.generate(
        inputs["input_ids"],
        # Pass the mask explicitly so generate() does not have to infer it.
        attention_mask=inputs["attention_mask"],
        # Budget only the generated tokens; max_length would also count the
        # prompt, leaving little or no room for output on long prompts.
        max_new_tokens=150,
        num_return_sequences=1,
        # Set padding explicitly rather than relying on generate()'s
        # fallback to EOS when the tokenizer has no pad token configured.
        pad_token_id=fine_tuned_tokenizer.eos_token_id,
    )
    return fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 56 |
+
|
| 57 |
+
# Define and launch the Gradio interface
|
| 58 |
+
interface = gr.Interface(
|
| 59 |
+
fn=generate_cypress_code,
|
| 60 |
+
inputs="text",
|
| 61 |
+
outputs="text",
|
| 62 |
+
title="Cypress Test Generator",
|
| 63 |
+
description="Enter a description of the test you want to generate Cypress code for.",
|
| 64 |
+
)
|
| 65 |
+
interface.launch()
|