| | from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AutoModel
|
| | from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value
|
| | import pandas as pd
|
| | import re
|
| | import subprocess
|
| | import shutil
|
| | import torch
|
| |
|
# Path to the fine-tuned sequence-classification checkpoint (local directory).
model_path = "Model-V0.5.3"

# Load tokenizer, config, and classification model from the same checkpoint
# so vocabulary and label mapping stay consistent.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)

# Label maps taken from the model config: id -> label name and label name -> id.
# NOTE(review): downstream code indexes label2id['AI'] — assumes an 'AI' class
# exists in this checkpoint's config; verify against the saved config.json.
labels = model.config.id2label
label2id = model.config.label2id
|
| |
|
# Regexes used to normalize C/C++ source before tokenization.
# (The duplicate mid-file `import re` was removed; `re` is imported at the top
# of the file.)
preprocessor_pattern = re.compile(r'^\s*#.*$', re.MULTILINE)           # whole-line # directives
block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)            # /* ... */, non-greedy, spans lines
using_pattern = re.compile(r'^\s*using\s+[^\n;]+;', re.MULTILINE)      # single-line using declarations
typedef_pattern = re.compile(r'^\s*typedef\s+[^\n;]+;', re.MULTILINE)  # single-line typedefs
line_comment_pattern = re.compile(r'//.*')                             # // line comments


def remove_comments(code):
    """Strip all /* ... */ block comments and // line comments from *code*.

    Block comments are removed first so a ``//`` inside a block comment
    does not leave residue.
    """
    code = block_comment_pattern.sub('', code)
    code = line_comment_pattern.sub('', code)
    return code


def replace_preprocessor(code):
    """Replace preprocessor directives, using-declarations, and single-line
    typedefs with the literal token ``<PREPROCESSOR>``.

    The replacement collapses author-specific header/alias choices into one
    token so the classifier does not key on them.
    """
    code = preprocessor_pattern.sub('<PREPROCESSOR>', code)
    code = using_pattern.sub('<PREPROCESSOR>', code)
    code = typedef_pattern.sub('<PREPROCESSOR>', code)
    return code
|
| |
|
def strip_lines(text, max_blank_lines=0):
    """Collapse runs of blank lines in *text*.

    At most *max_blank_lines* consecutive blank lines are kept (default 0,
    i.e. all blank lines are dropped). Returns the joined result without a
    trailing newline.
    """
    blanks_seen = 0
    kept_lines = []
    # A trailing '\n' is appended before splitting, mirroring the original
    # behavior for inputs that already end with blank lines.
    for raw in (text + '\n').splitlines():
        blanks_seen = blanks_seen + 1 if not raw.strip() else 0
        if blanks_seen <= max_blank_lines:
            kept_lines.append(raw)
    return '\n'.join(kept_lines)
|
| |
|
# '(...) {' on the same line, and '(...)' followed by a '{' on the next line.
space_braces_function_pattern = re.compile(r'(\([^\)]*\))\s*\{')
multiline_function_pattern = re.compile(r'(\([^\)]*\))\s*\n\s*\{')


def normalize_braces(code):
    """Rewrite brace placement so the opening '{' sits directly after the
    closing parenthesis of a parameter list, erasing the author's preferred
    brace style (K&R vs. Allman)."""
    # Newline-separated braces first, then same-line spaced braces — the
    # same order the original transformations ran in.
    for brace_pattern in (multiline_function_pattern, space_braces_function_pattern):
        code = brace_pattern.sub(r'\1{', code)
    return code
|
| |
|
def format_cpp(code: str, style: str = "Google") -> str:
    """Format C/C++ *code* by piping it through clang-format.

    Parameters
    ----------
    code : str
        Source text to format.
    style : str
        clang-format style name (default ``"Google"``).

    Returns
    -------
    str
        The formatted source text.

    Raises
    ------
    EnvironmentError
        If ``clang-format`` is not found on PATH.
    subprocess.CalledProcessError
        If ``clang-format`` exits non-zero (``check=True``).
    """
    if not shutil.which("clang-format"):
        raise EnvironmentError("clang-format is not installed or not in PATH.")

    # encoding="utf-8" lets subprocess do the str<->bytes conversion,
    # replacing the original manual code.encode()/stdout.decode() round-trip
    # with the same UTF-8 semantics.
    result = subprocess.run(
        ["clang-format", f"--style={style}"],
        input=code,
        capture_output=True,
        encoding="utf-8",
        check=True,
    )
    return result.stdout
|
| |
|
def preprocess(code):
    """Run the full normalization pipeline over C/C++ source text.

    Order matters: comments are removed before preprocessor lines are
    tokenized, brace style is normalized, and finally blank lines are
    stripped (default: all of them).
    """
    pipeline = (remove_comments, replace_preprocessor, normalize_braces, strip_lines)
    for transform in pipeline:
        code = transform(code)
    return code
|
| |
|
| |
|
def eval(source):
    """Classify a source-code string with the loaded model.

    Returns a tuple of (predicted label name, probability of the 'AI' class
    formatted as a percentage string).

    NOTE(review): shadows the builtin ``eval``; the name is kept because it
    is this module's public entry point. Assumes the checkpoint defines an
    'AI' label — confirm against the model config.
    """
    cleaned = preprocess(source)
    encoded = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )

    # Inference runs on CPU with gradients disabled.
    model.cpu()
    model.eval()
    encoded = {name: tensor.cpu() for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**encoded)

    # Softmax over the logits of the single (batch size 1) example.
    scores = torch.softmax(output.logits, dim=-1).detach().cpu().numpy()[0]
    predicted = scores.argmax()

    return labels[predicted], f"{scores[label2id['AI']]*100:.2f} %"
|
| |
|