SushantGautam committed
Commit 8a8e78a · verified · 1 Parent(s): fe97a99

Create script.py

Files changed (1)
  1. script.py +95 -0
script.py ADDED
@@ -0,0 +1,95 @@
+ import subprocess
+ import sys
+
+ # Install third-party dependencies before importing them, so a fresh
+ # environment does not fail at import time.
+ print("Installing required packages...")
+ process = subprocess.Popen(
+     [sys.executable, "-m", "pip", "install", "bitsandbytes", "transformers", "accelerate", "peft", "torch"],
+     stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True
+ )
+ for line in process.stdout:
+     print(line, end='')
+ process.wait()
+
+ import ast
+ import re
+
+ import torch
+ from peft import PeftModel
+ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+ # Global variables to cache the model and tokenizer (to avoid multiple loads)
+ _model = None
+ _tokenizer = None
+
+
+ def load_model():
+     global _model, _tokenizer
+     if _model is None or _tokenizer is None:
+         print("Loading model and tokenizer...")
+         # Enable 8-bit quantization to reduce memory usage
+         bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+
+         # Load tokenizer
+         _tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")
+
+         # Load the base model with quantization
+         base_model = AutoModelForCausalLM.from_pretrained(
+             "Qwen/Qwen2.5-Coder-7B-Instruct",
+             quantization_config=bnb_config,
+             device_map="auto",
+         )
+
+         # Attach the fine-tuned adapter
+         _model = PeftModel.from_pretrained(
+             base_model,
+             "SushantGautam/vulnerability_ativ0.1",
+             device_map="auto",
+         )
+     else:
+         print("Model and tokenizer already loaded.")
+     return _model, _tokenizer
+
+
+ def extract_dict(text):
+     # Pull the first ```python ... ``` block out of the model's response
+     # and parse it as a Python literal; return None if no block is found.
+     match = re.search(r"```python\n(.*?)\n```", text, re.DOTALL)
+     return ast.literal_eval(match.group(1)) if match else None
+
+
+ def generate(prompt):
+     model, tokenizer = load_model()
+
+     messages = [
+         {"role": "system", "content": "You are a cybersecurity expert specializing in CWE vulnerabilities in codes. Your responses must be accompanied by a python JSON."},
+         {"role": "user", "content": prompt},
+     ]
+
+     # Render the chat messages into the model's prompt format
+     text = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+
+     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+     generated_ids = model.generate(
+         **model_inputs,
+         max_new_tokens=4000,
+     )
+     # Keep only the newly generated tokens, dropping the prompt tokens
+     generated_ids = [
+         output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+     ]
+
+     response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+     try:
+         response_formatted = extract_dict(response)
+     except (ValueError, SyntaxError):
+         # The extracted block was not a valid Python literal
+         response_formatted = "XXX"
+
+     return {"Generated Answer": response, "Extracted Dict": response_formatted}