File size: 3,994 Bytes
df03681
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
# Revised script to use the fine-tuned T5 model for generating command descriptions
# Includes debugging output and adjusted generation parameters
# Model directory: C:\app\dataset\new_cmd_model
# Prerequisites: Ensure transformers, torch, and sentencepiece are installed

import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define model and tokenizer path
model_path = r"C:\app\dataset\new_cmd_model"

# Fail fast with a clear message if the fine-tuned checkpoint is missing.
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model directory '{model_path}' not found. Please verify the path.")

# Load the fine-tuned model and tokenizer.
try:
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    # Raise a concrete exception type and chain the original cause (PEP 3134)
    # instead of re-raising a bare `Exception`, which hides the error class
    # and makes targeted handling upstream impossible.
    raise RuntimeError(f"Error loading model or tokenizer: {e}") from e

# Function to generate a description for a given command and source
def generate_command_description(command_name, source, max_length=150):
    """Generate a description for *command_name* using the fine-tuned T5 model.

    Args:
        command_name: Command to describe (e.g. "ls", "MsgBox").
        source: Platform/source the command belongs to (e.g. "linux", "vbscript").
        max_length: Maximum token length of the generated description.

    Returns:
        The decoded description string; if generation yields empty text or
        raises, a human-readable warning/error string is returned instead
        (best-effort so the calling loops keep running).
    """
    # Format the input prompt as used during training — the model only
    # responds well to the exact template it was fine-tuned on.
    prompt = f"Describe the command: {command_name} in {source}"
    print(f"Input prompt: {prompt}")  # Debug: show the prompt being used

    # Tokenize the input (truncated to the training-time encoder length).
    inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)

    # Move model and inputs to GPU if available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    print(f"Using device: {device}")  # Debug: show device being used

    try:
        # Inference only: disable gradient tracking to cut memory/compute.
        # Pass attention_mask explicitly — generating from input_ids alone
        # makes the model infer the mask from pad tokens, which transformers
        # warns about and which can mis-handle padded inputs.
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_length=max_length,      # Increased for longer descriptions
                num_beams=5,                # Increased beams for better quality
                length_penalty=1.2,         # Slightly favor longer outputs
                early_stopping=True,
                no_repeat_ngram_size=2,     # Prevent repetitive phrases
                do_sample=False,            # Use beam search, not sampling
            )
        # Decode and return the generated text.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if not generated_text.strip():
            return "Warning: Empty description generated. Check model training or prompt format."
        return generated_text
    except Exception as e:
        # Best-effort: surface the failure as a string rather than crashing
        # the batch/interactive loops that call this function.
        return f"Error generating description: {str(e)}"

# Example usage: run the generator over a handful of representative commands,
# one from each supported source.
test_commands = [
    ("ls", "linux"),
    ("dir", "cmd"),
    ("chmod", "macos"),
    ("MsgBox", "vbscript"),
]

print("\nGenerated Command Descriptions:")
print("-" * 50)
for cmd, src in test_commands:
    # Generate first so the function's debug output precedes the summary lines.
    desc = generate_command_description(cmd, src)
    print(f"Command: {cmd} ({src})")
    print(f"Description: {desc}")
    print("-" * 50)

# Interactive mode: repeatedly prompt for a command/source pair and print
# the model's description, until the user types 'exit'.
print("\nInteractive Mode: Enter a command and source to get its description.")
print("Valid sources: cmd, linux, macos, vbscript")
print("Type 'exit' to quit.\n")
valid_sources = ("cmd", "linux", "macos", "vbscript")
while (command_name := input("Enter command name (or 'exit' to quit): ").strip()).lower() != "exit":
    source = input("Enter source (e.g., cmd, linux, macos, vbscript): ").strip().lower()
    # Reject unknown sources up front — the model was only trained on these.
    if source not in valid_sources:
        print(f"Invalid source. Please use one of: {', '.join(valid_sources)}")
        continue
    # Generate first so the function's debug output precedes the summary lines.
    description = generate_command_description(command_name, source)
    print(f"\nCommand: {command_name} ({source})")
    print(f"Description: {description}")
    print("-" * 50)

print("Exiting interactive mode.")