Upload 2 files
Browse files- debug_t5_model.py +94 -0
- update_model_safetensor_cmd.py +78 -0
debug_t5_model.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Revised script to use the fine-tuned T5 model for generating command descriptions
# Includes debugging output and adjusted generation parameters
# Model directory: C:\app\dataset\new_cmd_model
# Prerequisites: Ensure transformers, torch, and sentencepiece are installed

import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define model and tokenizer path
model_path = r"C:\app\dataset\new_cmd_model"

# Check if model directory exists before attempting the (slow) model load
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model directory '{model_path}' not found. Please verify the path.")

# Load the fine-tuned model and tokenizer
try:
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    # Use a concrete exception type and chain the cause so the original
    # traceback (e.g. missing safetensors, bad config) is preserved.
    raise RuntimeError(f"Error loading model or tokenizer: {e}") from e
| 25 |
+
# Function to generate a description for a given command and source
|
| 26 |
+
def generate_command_description(command_name, source, max_length=150):
|
| 27 |
+
# Format the input prompt as used during training
|
| 28 |
+
prompt = f"Describe the command: {command_name} in {source}"
|
| 29 |
+
print(f"Input prompt: {prompt}") # Debug: Show the prompt being used
|
| 30 |
+
|
| 31 |
+
# Tokenize the input
|
| 32 |
+
inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)
|
| 33 |
+
|
| 34 |
+
# Move inputs to GPU if available
|
| 35 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 36 |
+
model.to(device)
|
| 37 |
+
inputs = {key: value.to(device) for key, value in inputs.items()}
|
| 38 |
+
print(f"Using device: {device}") # Debug: Show device being used
|
| 39 |
+
|
| 40 |
+
# Generate output with adjusted parameters
|
| 41 |
+
try:
|
| 42 |
+
outputs = model.generate(
|
| 43 |
+
inputs["input_ids"],
|
| 44 |
+
max_length=max_length, # Increased for longer descriptions
|
| 45 |
+
num_beams=5, # Increased beams for better quality
|
| 46 |
+
length_penalty=1.2, # Slightly favor longer outputs
|
| 47 |
+
early_stopping=True,
|
| 48 |
+
no_repeat_ngram_size=2, # Prevent repetitive phrases
|
| 49 |
+
do_sample=False # Use beam search, not sampling
|
| 50 |
+
)
|
| 51 |
+
# Decode and return the generated text
|
| 52 |
+
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 53 |
+
if not generated_text.strip():
|
| 54 |
+
return "Warning: Empty description generated. Check model training or prompt format."
|
| 55 |
+
return generated_text
|
| 56 |
+
except Exception as e:
|
| 57 |
+
return f"Error generating description: {str(e)}"
|
# Example usage: Generate descriptions for a few commands
test_commands = [
    ("ls", "linux"),
    ("dir", "cmd"),
    ("chmod", "macos"),
    ("MsgBox", "vbscript"),
]

divider = "-" * 50
print("\nGenerated Command Descriptions:")
print(divider)
for cmd, src in test_commands:
    result = generate_command_description(cmd, src)
    print(f"Command: {cmd} ({src})")
    print(f"Description: {result}")
    print(divider)

# Interactive mode: Allow user to input a command and source
print("\nInteractive Mode: Enter a command and source to get its description.")
print("Valid sources: cmd, linux, macos, vbscript")
print("Type 'exit' to quit.\n")

valid_sources = ["cmd", "linux", "macos", "vbscript"]
while True:
    cmd = input("Enter command name (or 'exit' to quit): ").strip()
    if cmd.lower() == "exit":
        break
    src = input("Enter source (e.g., cmd, linux, macos, vbscript): ").strip().lower()
    # Reject anything outside the supported platforms before generating
    if src not in valid_sources:
        print(f"Invalid source. Please use one of: {', '.join(valid_sources)}")
        continue
    result = generate_command_description(cmd, src)
    print(f"\nCommand: {cmd} ({src})")
    print(f"Description: {result}")
    print(divider)

print("Exiting interactive mode.")
update_model_safetensor_cmd.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Start from the pretrained t5-small checkpoint and its matching tokenizer.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the command corpus from CSV as a single "train" split.
dataset = load_dataset("csv", data_files={"train": "all_commands.csv"})

# Hold out 20% for evaluation, and expose the held-out split under the
# "validation" key that the Trainer configuration below expects.
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset["validation"] = dataset["test"]
def preprocess_function(examples):
    """Tokenize a batch of command records into T5 encoder/decoder features.

    Builds the same "Describe the command: {name} in {source}" prompt used at
    inference time, tokenizes prompts and target descriptions with fixed-length
    padding, and masks label padding so it is ignored by the loss.

    Args:
        examples: Batched dataset columns with "name", "source", "description".

    Returns:
        Dict with "input_ids", "attention_mask", and "labels".
    """
    inputs = [f"Describe the command: {name} in {source}" for name, source in zip(examples["name"], examples["source"])]
    targets = examples["description"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # Replace pad token ids in the labels with -100 so CrossEntropyLoss ignores
    # padded positions; otherwise the model is trained to emit runs of padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]

    return model_inputs
# Tokenize every split, dropping the raw text columns the model cannot consume.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


training_args = TrainingArguments(
    output_dir="./new_cmd_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # fp16 mixed precision is only supported on CUDA; enabling it
    # unconditionally makes the Trainer raise on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train the model
trainer.train()

# Persist the fine-tuned weights and tokenizer files together so the
# directory can be reloaded with from_pretrained().
model.save_pretrained("./new_cmd_model")
tokenizer.save_pretrained("./new_cmd_model")

print("Fine-tuning complete. Model saved to './new_cmd_model'.")

# Smoke test: reload from disk to verify the saved artifacts are usable.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./new_cmd_model")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./new_cmd_model")

# Example prompt
prompt = "Describe the command: ls in linux"

# Tokenize and generate
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
outputs = fine_tuned_model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
generated_text = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Example generated description:", generated_text)