import re
import shutil
import subprocess

import torch
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
# Load the fine-tuned sequence classifier and its label mappings.
model_path = "Model-V0.5.3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
labels = model.config.id2label    # id -> label name; the label set includes "AI" (used below)
label2id = model.config.label2id  # label name -> id
# Regexes for stripping comments and collapsing preprocessor-style lines.
preprocessor_pattern = re.compile(r'^\s*#.*$', re.MULTILINE)           # #include, #define, ...
block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)            # /* ... */ (non-greedy)
using_pattern = re.compile(r'^\s*using\s+[^\n;]+;', re.MULTILINE)      # using declarations
typedef_pattern = re.compile(r'^\s*typedef\s+[^\n;]+;', re.MULTILINE)  # typedef declarations
line_comment_pattern = re.compile(r'//.*')                             # // to end of line
def remove_comments(code):
    """Drop C/C++ block and line comments from the source."""
    code = block_comment_pattern.sub('', code)
    code = line_comment_pattern.sub('', code)
    return code
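
# Example (illustrative): remove_comments("/* header */\nint x;") -> "\nint x;"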
def replace_preprocessor(code):
    """Collapse preprocessor directives, using-declarations, and typedefs
    into a single <PREPROCESSOR> token."""
    code = preprocessor_pattern.sub('<PREPROCESSOR>', code)
    code = using_pattern.sub('<PREPROCESSOR>', code)
    code = typedef_pattern.sub('<PREPROCESSOR>', code)
    return code
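
# Example (illustrative):
#   replace_preprocessor("#include <vector>\nusing std::vector;")
#   -> "<PREPROCESSOR>\n<PREPROCESSOR>"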
def strip_lines(text, max_blank_lines=0):
    """Remove runs of blank lines, keeping at most `max_blank_lines` in a row."""
    lines = text.splitlines()
    kept = []
    consec = 0
    for line in lines:
        if line.strip() == "":
            consec += 1
        else:
            consec = 0
        if consec <= max_blank_lines:
            kept.append(line)
    return '\n'.join(kept)
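
# Examples (illustrative):
#   strip_lines("a\n\n\nb")                    -> "a\nb"
#   strip_lines("a\n\n\nb", max_blank_lines=1) -> "a\n\nb"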
# Move an opening brace onto the same line as the parameter list, with no space.
space_braces_function_pattern = re.compile(r'(\([^\)]*\))\s*\{')
multiline_function_pattern = re.compile(r'(\([^\)]*\))\s*\n\s*\{')
def normalize_braces(code):
    """Normalize brace placement so both K&R and Allman styles look the same."""
    code = multiline_function_pattern.sub(r'\1{', code)
    code = space_braces_function_pattern.sub(r'\1{', code)
    return code
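
# Example (illustrative): normalize_braces("int main(void)\n{") -> "int main(void){"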
def format_cpp(code: str, style: str = "Google") -> str:
    """Run the source through clang-format (optional; see preprocess below)."""
    if not shutil.which("clang-format"):
        raise EnvironmentError("clang-format is not installed or not in PATH.")
    result = subprocess.run(
        ["clang-format", f"--style={style}"],
        input=code.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    return result.stdout.decode()
def preprocess(code):
    """Normalize raw C/C++ source before tokenization."""
    # code = format_cpp(code)  # optional clang-format pass, disabled by default
    code = remove_comments(code)
    code = replace_preprocessor(code)
    code = normalize_braces(code)
    code = strip_lines(code)
    return code
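
# End-to-end example (illustrative):
#   preprocess('#include <iostream>\n\nint main() {\n  // greet\n  return 0;\n}')
#   -> '<PREPROCESSOR>\nint main(){\n  return 0;\n}'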
def evaluate(source):
    """Classify a snippet; returns (predicted label, AI probability as a percentage string)."""
    source = preprocess(source)
    inputs = tokenizer(
        source,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    # Run inference on CPU.
    model.cpu()
    model.eval()
    inputs = {k: v.cpu() for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
    pred_id = int(probs.argmax())
    return labels[pred_id], f"{probs[label2id['AI']] * 100:.2f} %"
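
# Minimal usage sketch; the snippet below is illustrative and not from any dataset.
if __name__ == "__main__":
    sample = '''
#include <iostream>
int main() {
    std::cout << "Hello, world!" << std::endl;  // greet the user
    return 0;
}
'''
    label, ai_score = evaluate(sample)
    print(f"Label: {label} | AI score: {ai_score}")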