File size: 3,161 Bytes
e6e78f1
 
 
 
 
 
 
 
cd68abb
e6e78f1
 
 
 
 
 
 
 
 
cd68abb
 
e6e78f1
 
 
 
 
 
 
 
 
cd68abb
 
 
e6e78f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd68abb
e6e78f1
 
 
 
cd68abb
e6e78f1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig, AutoModel
from datasets import Dataset, DatasetDict, Features, Sequence, ClassLabel, Value
import pandas as pd
import re
import subprocess
import shutil
import torch

# Checkpoint identifier handed to each from_pretrained call below.
model_path = "Model-V0.5.3"
# Tokenizer, config and sequence-classification model are all loaded from the
# same checkpoint; this happens at import time.
tokenizer = AutoTokenizer.from_pretrained(model_path)
config = AutoConfig.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path, config=config)
# Label lookups taken from the model config: id -> name and name -> id.
# eval() below indexes label2id with the key 'AI'.
labels = model.config.id2label
label2id = model.config.label2id

import re
# A '#' that is the first non-whitespace character on a line, through the end
# of that line. Because \s also matches newlines, the match can include blank
# lines immediately preceding the '#'.
preprocessor_pattern = re.compile(r'^\s*#.*$', re.MULTILINE)
# '/* ... */' spans, matched non-greedily; DOTALL lets '.' cross newlines.
block_comment_pattern = re.compile(r'/\*.*?\*/', re.DOTALL)
# 'using' at the start of a line followed by text up to the first ';' on that
# same line (the body excludes newlines and semicolons).
using_pattern = re.compile(r'^\s*using\s+[^\n;]+;', re.MULTILINE)
# Same shape as using_pattern, for 'typedef' statements.
typedef_pattern = re.compile(r'^\s*typedef\s+[^\n;]+;', re.MULTILINE)
# '//' through the end of the line.
# NOTE(review): this also matches '//' inside string literals (e.g. URLs) -
# confirm that is acceptable for the inputs this is used on.
line_comment_pattern = re.compile(r'//.*')

def remove_comments(code):
    """Return ``code`` with comment text removed.

    Block comments (``block_comment_pattern``) are deleted first, then
    line comments (``line_comment_pattern``); each match is replaced by
    the empty string.
    """
    # Order matters: block comments go first, then '//' comments.
    for comment_pattern in (block_comment_pattern, line_comment_pattern):
        code = comment_pattern.sub('', code)
    return code
    
def replace_preprocessor(code):
    """Replace directive-like statements with the ``<PREPROCESSOR>`` token.

    Matches of ``preprocessor_pattern``, ``using_pattern`` and
    ``typedef_pattern`` are substituted, in that order, by the literal
    placeholder string.
    """
    placeholder = '<PREPROCESSOR>'
    ordered_patterns = (preprocessor_pattern, using_pattern, typedef_pattern)
    for directive_pattern in ordered_patterns:
        code = directive_pattern.sub(placeholder, code)
    return code
    
def strip_lines(text, max_blank_lines=0):
    """Collapse runs of blank lines in ``text``.

    A newline is appended to ``text`` before it is split into lines. A line
    counts as blank when it is empty or whitespace-only. Within each run of
    consecutive blank lines only the first ``max_blank_lines`` are kept; the
    rest are dropped. Non-blank lines are kept unchanged (no trimming).

    Args:
        text: The text to filter.
        max_blank_lines: Maximum number of consecutive blank lines to keep.

    Returns:
        The surviving lines joined with ``'\\n'``.
    """
    blank_run = 0
    survivors = []
    for current in (text + '\n').splitlines():
        # Track the length of the blank run that ends at this line.
        blank_run = blank_run + 1 if not current.strip() else 0
        if blank_run > max_blank_lines:
            continue
        survivors.append(current)
    return '\n'.join(survivors)

# A parenthesised group (no ')' inside) followed by optional whitespace and '{'.
space_braces_function_pattern = re.compile(r'(\([^\)]*\))\s*\{')
# Same, but the whitespace before '{' must contain at least one newline.
multiline_function_pattern = re.compile(r'(\([^\)]*\))\s*\n\s*\{')


def normalize_braces(code):
    """Attach an opening brace directly to the preceding ``(...)`` group.

    Any whitespace (including newlines) between a closing parenthesis and
    the following ``{`` is removed, so ``f()\\n{`` and ``f() {`` both become
    ``f(){``. Braces not preceded by a parenthesised group are untouched.
    """
    # The newline-separated form is handled first, then the general form.
    for brace_pattern in (multiline_function_pattern, space_braces_function_pattern):
        code = brace_pattern.sub(r'\1{', code)
    return code

def format_cpp(code: str, style: str = "Google") -> str:
    """Run ``code`` through the ``clang-format`` executable and return its output.

    The source is sent on the process's standard input and the formatted
    text is read from its standard output.

    Args:
        code: Source text to format.
        style: Value passed to clang-format's ``--style`` option.

    Returns:
        The decoded standard output of clang-format.

    Raises:
        EnvironmentError: If ``clang-format`` cannot be found on PATH.
        subprocess.CalledProcessError: If clang-format exits with a
            non-zero status (``check=True``).
    """
    executable = "clang-format"
    if shutil.which(executable) is None:
        raise EnvironmentError("clang-format is not installed or not in PATH.")

    command = [executable, f"--style={style}"]
    completed = subprocess.run(
        command,
        input=code.encode(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        check=True,
    )
    formatted = completed.stdout.decode()
    return formatted

def preprocess(code):
    """Normalise a source string before it is tokenized.

    Applies, in order: ``remove_comments``, ``replace_preprocessor``,
    ``normalize_braces`` and ``strip_lines`` (with its default arguments).
    ``format_cpp`` is not part of the pipeline.
    """
    stages = (
        remove_comments,
        replace_preprocessor,
        normalize_braces,
        strip_lines,
    )
    for stage in stages:
        code = stage(code)
    return code


def eval(source):
    """Classify a source string with the module-level model.

    The text is passed through ``preprocess``, tokenized (truncated and
    padded to 512 tokens), and scored on the CPU with gradients disabled.

    Note: this function's name shadows the builtin ``eval`` within this
    module.

    Args:
        source: Raw source text to classify.

    Returns:
        A 2-tuple: the ``labels`` entry for the highest-probability class,
        and the probability of the ``'AI'`` class formatted as a percentage
        string with two decimals (e.g. ``"12.34 %"``).
    """
    cleaned = preprocess(source)
    encoded = tokenizer(
        cleaned,
        truncation=True,
        padding='max_length',
        max_length=512,
        return_tensors='pt'
    )

    # Inference runs on the CPU with the model switched to evaluation mode.
    model.cpu()
    model.eval()
    cpu_inputs = {name: tensor.cpu() for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = model(**cpu_inputs).logits

    # Softmax over the class dimension; [0] selects the single input's row.
    probabilities = torch.softmax(logits, dim=-1).detach().cpu().numpy()[0]
    best_id = probabilities.argmax()
    ai_percent = probabilities[label2id['AI']] * 100
    return labels[best_id], f"{ai_percent:.2f} %"