"""Classify C/C++ source code as human- or AI-written.

Pipeline: normalise the input source (strip comments, preprocessor
directives, ``using``/``typedef`` lines, blank lines; normalise brace
placement), tokenize it, and run one forward pass through a fine-tuned
sequence-classification checkpoint loaded from ``Model-V0.5.3``.
"""

import re
import shutil
import subprocess

import pandas as pd
import torch
from datasets import ClassLabel, Dataset, DatasetDict, Features, Sequence, Value
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Local checkpoint directory of the fine-tuned classifier.
model_path = "Model-V0.5.3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)

# id -> label-name and label-name -> id mappings baked into the checkpoint.
# NOTE(review): the code below indexes label2id with the key 'AI' — assumes
# the checkpoint was trained with an 'AI' label; confirm against the config.
labels = model.config.id2label
label2id = model.config.label2id

# Pre-compiled normalisation patterns (compiled once, reused per call).
preprocessor_pattern = re.compile(r'^\s*#.*$', re.MULTILINE)      # #include, #define, ...
block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)       # /* ... */ (non-greedy)
using_pattern = re.compile(r'^\s*using\s+[^\n;]+;', re.MULTILINE)
typedef_pattern = re.compile(r'^\s*typedef\s+[^\n;]+;', re.MULTILINE)
line_comment_pattern = re.compile(r'//.*')                        # // to end of line


def remove_comments(code):
    """Return *code* with C-style block (/* */) and line (//) comments removed."""
    code = block_comment_pattern.sub('', code)
    code = line_comment_pattern.sub('', code)
    return code


def replace_preprocessor(code):
    """Return *code* with preprocessor directives, ``using`` and ``typedef``
    declarations removed (each matched line is replaced with the empty string)."""
    code = preprocessor_pattern.sub('', code)
    code = using_pattern.sub('', code)
    code = typedef_pattern.sub('', code)
    return code


def strip_lines(text, max_blank_lines=0):
    """Collapse runs of blank lines in *text*.

    Keeps at most *max_blank_lines* consecutive blank lines (default: drop
    all blank lines). Returns the surviving lines joined with '\\n'.
    """
    text += '\n'
    lines = text.splitlines()
    kept = []
    consec = 0  # length of the current run of blank lines
    for line in lines:
        if line.strip() == "":
            consec += 1
        else:
            consec = 0
        if consec <= max_blank_lines:
            kept.append(line)
    return '\n'.join(kept)


# Brace-placement patterns: "f(...) {" or "f(...)\n{" -> "f(...){"
space_braces_function_pattern = re.compile(r'(\([^\)]*\))\s*\{')
multiline_function_pattern = re.compile(r'(\([^\)]*\))\s*\n\s*\{')


def normalize_braces(code):
    """Attach an opening brace directly to the preceding parenthesised
    parameter list, e.g. ``f(int x)\\n{`` -> ``f(int x){``."""
    code = multiline_function_pattern.sub(r'\1{', code)
    code = space_braces_function_pattern.sub(r'\1{', code)
    return code


def format_cpp(code: str, style: str = "Google") -> str:
    """Format *code* with clang-format using *style*.

    Raises:
        EnvironmentError: if clang-format is not on PATH.
        subprocess.CalledProcessError: if clang-format exits non-zero.
    """
    if not shutil.which("clang-format"):
        raise EnvironmentError("clang-format is not installed or not in PATH.")
    result = subprocess.run(
        ["clang-format", f"--style={style}"],
        input=code.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    # BUG FIX: the original had `return` and `result.stdout.decode()` split
    # across lines, which returned None and left the decode unreachable.
    return result.stdout.decode()


def preprocess(code):
    """Apply the full normalisation pipeline used before tokenisation."""
    # code = format_cpp(code)  # optional clang-format pass (requires binary)
    code = remove_comments(code)
    code = replace_preprocessor(code)
    code = normalize_braces(code)
    code = strip_lines(code)
    return code


def eval(source):  # noqa: A001 — shadows builtin `eval`; kept for caller compatibility
    """Classify *source* and return ``(predicted_label, ai_score_str)``.

    The second element is the probability of the 'AI' class formatted as a
    percentage string, e.g. ``"97.31 %"``.
    """
    source = preprocess(source)
    inputs = tokenizer(
        source,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt',
    )
    model.cpu()
    model.eval()
    inputs = {k: v.cpu() for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=-1).detach().cpu().numpy()[0]
    # Coerce the numpy integer to a plain int so the id2label dict lookup
    # cannot trip on key type.
    pred_id = int(probs.argmax())
    return labels[pred_id], f"{probs[label2id['AI']]*100:.2f} %"