ankitkushwaha90 commited on
Commit
df03681
·
verified ·
1 Parent(s): 7fe3a6c

Upload 2 files

Browse files
Files changed (2) hide show
  1. debug_t5_model.py +94 -0
  2. update_model_safetensor_cmd.py +78 -0
debug_t5_model.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Revised script to use the fine-tuned T5 model for generating command descriptions
# Includes debugging output and adjusted generation parameters
# Model directory: C:\app\dataset\new_cmd_model
# Prerequisites: Ensure transformers, torch, and sentencepiece are installed

import os
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# Define model and tokenizer path
model_path = r"C:\app\dataset\new_cmd_model"

# Fail fast with a clear message if the model directory is missing
if not os.path.exists(model_path):
    raise FileNotFoundError(f"Model directory '{model_path}' not found. Please verify the path.")

# Load the fine-tuned model and tokenizer
try:
    model = T5ForConditionalGeneration.from_pretrained(model_path)
    tokenizer = T5Tokenizer.from_pretrained(model_path)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    # Use a concrete exception type and chain the original cause instead of
    # raising the bare Exception base class (which callers cannot catch narrowly
    # and which discards the traceback context).
    raise RuntimeError(f"Error loading model or tokenizer: {str(e)}") from e
24
+
25
# Function to generate a description for a given command and source
def generate_command_description(command_name, source, max_length=150):
    """Generate a natural-language description for a command via the fine-tuned T5.

    Args:
        command_name: Command to describe (e.g. "ls").
        source: Platform/source label (e.g. "linux", "cmd", "macos", "vbscript").
        max_length: Maximum token length of the generated output.

    Returns:
        The generated description string, or a warning/error message string when
        generation yields empty text or raises an exception.
    """
    # Format the input prompt as used during training
    prompt = f"Describe the command: {command_name} in {source}"
    print(f"Input prompt: {prompt}")  # Debug: Show the prompt being used

    # Tokenize the input
    inputs = tokenizer(prompt, return_tensors="pt", max_length=128, truncation=True)

    # Move model and inputs to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    print(f"Using device: {device}")  # Debug: Show device being used

    # Generate output with adjusted parameters
    try:
        # no_grad: inference only — avoids building an autograd graph.
        # attention_mask is passed explicitly so generate() does not have to
        # infer it (avoids the pad-token warning and any pad/eos ambiguity).
        with torch.no_grad():
            outputs = model.generate(
                inputs["input_ids"],
                attention_mask=inputs.get("attention_mask"),
                max_length=max_length,      # Increased for longer descriptions
                num_beams=5,                # Increased beams for better quality
                length_penalty=1.2,         # Slightly favor longer outputs
                early_stopping=True,
                no_repeat_ngram_size=2,     # Prevent repetitive phrases
                do_sample=False,            # Use beam search, not sampling
            )
        # Decode and return the generated text
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if not generated_text.strip():
            return "Warning: Empty description generated. Check model training or prompt format."
        return generated_text
    except Exception as e:
        return f"Error generating description: {str(e)}"
58
+
59
# Batch demo: run the generator over a fixed set of sample commands.
test_commands = [
    ("ls", "linux"),
    ("dir", "cmd"),
    ("chmod", "macos"),
    ("MsgBox", "vbscript"),
]

print("\nGenerated Command Descriptions:")
print("-" * 50)
for cmd_name, cmd_source in test_commands:
    # Generate first so the function's debug output precedes the result lines.
    result = generate_command_description(cmd_name, cmd_source)
    print(f"Command: {cmd_name} ({cmd_source})")
    print(f"Description: {result}")
    print("-" * 50)

# Interactive mode: keep prompting until the user types 'exit'.
print("\nInteractive Mode: Enter a command and source to get its description.")
print("Valid sources: cmd, linux, macos, vbscript")
print("Type 'exit' to quit.\n")

valid_sources = ["cmd", "linux", "macos", "vbscript"]
while True:
    cmd_name = input("Enter command name (or 'exit' to quit): ").strip()
    if cmd_name.lower() == "exit":
        break
    cmd_source = input("Enter source (e.g., cmd, linux, macos, vbscript): ").strip().lower()
    # Reject unknown platforms before calling the model.
    if cmd_source not in valid_sources:
        print(f"Invalid source. Please use one of: {', '.join(valid_sources)}")
        continue
    result = generate_command_description(cmd_name, cmd_source)
    print(f"\nCommand: {cmd_name} ({cmd_source})")
    print(f"Description: {result}")
    print("-" * 50)

print("Exiting interactive mode.")
update_model_safetensor_cmd.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from transformers import T5ForConditionalGeneration, T5Tokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch

# Base checkpoint to fine-tune.
model_name = "t5-small"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# Load the training CSV (expects at least: name, source, description columns,
# as consumed by preprocess_function below).
dataset = load_dataset("csv", data_files={"train": "all_commands.csv"})

# 80/20 split; Trainer convention uses a "validation" key.
dataset = dataset["train"].train_test_split(test_size=0.2)
dataset["validation"] = dataset["test"]  # Rename 'test' to 'validation' for Trainer
15
+
16
+
17
def preprocess_function(examples):
    """Tokenize a batch of (name, source, description) rows for T5 fine-tuning.

    Builds the same "Describe the command: <name> in <source>" prompt used at
    inference time, tokenizes prompts and target descriptions, and masks label
    padding so the loss ignores padded positions.

    Args:
        examples: Batched dataset columns with "name", "source", "description".

    Returns:
        Tokenizer output dict with "input_ids", "attention_mask", and "labels".
    """
    inputs = [
        f"Describe the command: {name} in {source}"
        for name, source in zip(examples["name"], examples["source"])
    ]
    targets = examples["description"]

    model_inputs = tokenizer(inputs, max_length=128, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=256, truncation=True, padding="max_length")

    # Bug fix: with padding="max_length", pad token ids in the labels would be
    # scored by the cross-entropy loss, teaching the model to emit padding.
    # Replace them with -100, the ignore_index used by transformers' loss.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [(tok if tok != pad_id else -100) for tok in seq]
        for seq in labels["input_ids"]
    ]
    return model_inputs
29
+
30
+
31
# Tokenize both splits; drop the raw text columns so Trainer only sees tensors.
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

training_args = TrainingArguments(
    output_dir="./new_cmd_model",
    # NOTE(review): this kwarg was renamed to `eval_strategy` in newer
    # transformers releases — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Bug fix: fp16 mixed precision is only supported on CUDA; hard-coding
    # True makes the script crash on CPU-only machines.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

# Train the model
trainer.train()

# Persist the fine-tuned weights and tokenizer files.
model.save_pretrained("./new_cmd_model")
tokenizer.save_pretrained("./new_cmd_model")

print("Fine-tuning complete. Model saved to './new_cmd_model'.")

# Smoke test: reload the saved model from disk and generate one description.
fine_tuned_model = T5ForConditionalGeneration.from_pretrained("./new_cmd_model")
fine_tuned_tokenizer = T5Tokenizer.from_pretrained("./new_cmd_model")

# Example prompt
prompt = "Describe the command: ls in linux"

# Tokenize and generate; pass attention_mask explicitly so generate() does not
# warn about or mis-infer the padding mask.
inputs = fine_tuned_tokenizer(prompt, return_tensors="pt")
outputs = fine_tuned_model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=100,
    num_beams=4,
    early_stopping=True,
)
generated_text = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Example generated description:", generated_text)