Upload 2 files
Browse files- debug_t5_model.py +94 -0
- update_model_safetensor_cmd.py +78 -0
debug_t5_model.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Revised script to use the fine-tuned T5 model for generating command descriptions
# Includes debugging output and adjusted generation parameters
# Model directory: C:\app\dataset\new_cmd_model
# Prerequisites: Ensure transformers, torch, and sentencepiece are installed

import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define model and tokenizer path
model_path = r"C:\app\dataset\new_cmd_model"

# Check if model directory exists before attempting the (slow) model load
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model directory '{model_path}' not found. Please verify the path.")

# Load the fine-tuned model and tokenizer
try:
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    # Use a concrete exception type and chain the cause so the original
    # traceback (e.g. missing safetensors, bad config) is preserved.
    raise RuntimeError(f"Error loading model or tokenizer: {e}") from e
| 25 |
+
# Function to generate a description for a given command and source
|
| 26 |
+
def generate_command_description(command_name, source, max_length=150):
|
| 27 |
+
# Format the input prompt as used during training
|
| 28 |
+
prompt = f"Describe the command: {command_name} in {source}"
|
| 29 |
+
print(f"Input prompt: {prompt}") # Debug: Show the prompt being used
|
| 30 |
+
|
| 31 |
+
# Tokenize the input
|
| 32 |
+
inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)
|
| 33 |
+
|
| 34 |
+
# Move inputs to GPU if available
|
| 35 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 36 |
+
model.to(device)
|
| 37 |
+
inputs = {key: value.to(device) for key, value in inputs.items()}
|
| 38 |
+
print(f"Using device: {device}") # Debug: Show device being used
|
| 39 |
+
|
| 40 |
+
# Generate output with adjusted parameters
|
| 41 |
+
try:
|
| 42 |
+
outputs = model.generate(
|
| 43 |
+
inputs["input_ids"],
|
| 44 |
+
max_length=max_length, # Increased for longer descriptions
|
| 45 |
+
num_beams=5, # Increased beams for better quality
|
| 46 |
+
length_penalty=1.2, # Slightly favor longer outputs
|
| 47 |
+
early_stopping=True,
|
| 48 |
+
no_repeat_ngram_size=2, # Prevent repetitive phrases
|
| 49 |
+
do_sample=False # Use beam search, not sampling
|
| 50 |
+
)
|
| 51 |
+
# Decode and return the generated text
|
| 52 |
+
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 53 |
+
if not generated_text.strip():
|
| 54 |
+
return "Warning: Empty description generated. Check model training or prompt format."
|
| 55 |
+
return generated_text
|
| 56 |
+
except Exception as e:
|
| 57 |
+
return f"Error generating description: {str(e)}"
|
# Example usage: Generate descriptions for a few commands
test_commands = [
    ("ls", "linux"),
    ("dir", "cmd"),
    ("chmod", "macos"),
    ("MsgBox", "vbscript"),
]

divider = "-" * 50
print("\nGenerated Command Descriptions:")
print(divider)
for cmd, src in test_commands:
    result = generate_command_description(cmd, src)
    print(f"Command: {cmd} ({src})")
    print(f"Description: {result}")
    print(divider)

# Interactive mode: Allow user to input a command and source
print("\nInteractive Mode: Enter a command and source to get its description.")
print("Valid sources: cmd, linux, macos, vbscript")
print("Type 'exit' to quit.\n")

valid_sources = ["cmd", "linux", "macos", "vbscript"]
while True:
    cmd = input("Enter command name (or 'exit' to quit): ").strip()
    if cmd.lower() == "exit":
        break
    src = input("Enter source (e.g., cmd, linux, macos, vbscript): ").strip().lower()
    # Reject anything outside the supported platforms before generating
    if src not in valid_sources:
        print(f"Invalid source. Please use one of: {', '.join(valid_sources)}")
        continue
    result = generate_command_description(cmd, src)
    print(f"\nCommand: {cmd} ({src})")
    print(f"Description: {result}")
    print(divider)

print("Exiting interactive mode.")
update_model_safetensor_cmd.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|

from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Start from the pretrained t5-small checkpoint and its matching tokenizer.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the command corpus from CSV as a single "train" split.
dataset = load_dataset("csv", data_files={"train": "all_commands.csv"})

# Hold out 20% for evaluation, and expose the held-out split under the
# "validation" key that the Trainer configuration below expects.
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset["validation"] = dataset["test"]
def preprocess_function(examples):
    """Tokenize a batch of command records into T5 encoder/decoder features.

    Builds the same "Describe the command: {name} in {source}" prompt used at
    inference time, tokenizes prompts and target descriptions with fixed-length
    padding, and masks label padding so it is ignored by the loss.

    Args:
        examples: Batched dataset columns with "name", "source", "description".

    Returns:
        Dict with "input_ids", "attention_mask", and "labels".
    """
    inputs = [f"Describe the command: {name} in {source}" for name, source in zip(examples["name"], examples["source"])]
    targets = examples["description"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")

    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # Replace pad token ids in the labels with -100 so CrossEntropyLoss ignores
    # padded positions; otherwise the model is trained to emit runs of padding.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]

    return model_inputs
# Tokenize every split, dropping the raw text columns the model cannot consume.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)


training_args = TrainingArguments(
    output_dir="./new_cmd_model",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # fp16 mixed precision is only supported on CUDA; enabling it
    # unconditionally makes the Trainer raise on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train the model
trainer.train()

# Persist the fine-tuned weights and tokenizer files together so the
# directory can be reloaded with from_pretrained().
model.save_pretrained("./new_cmd_model")
tokenizer.save_pretrained("./new_cmd_model")

print("Fine-tuning complete. Model saved to './new_cmd_model'.")

# Smoke test: reload from disk to verify the saved artifacts are usable.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./new_cmd_model")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./new_cmd_model")

# Example prompt
prompt = "Describe the command: ls in linux"

# Tokenize and generate
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
outputs = fine_tuned_model.generate(inputs["input_ids"], max_length=100, num_beams=4, early_stopping=True)
generated_text = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Example generated description:", generated_text)