CodeBERT_CodeReviewer

Runtime error

App Files Files Community

Not-Grim-Refer commited on Jul 25, 2023

Commit

78cc03b

1 Parent(s): 43c5345

Update app.py

Browse files

Files changed (1) hide show

app.py +76 -282

app.py CHANGED Viewed

@@ -1,287 +1,81 @@
-import gradio as gr
 import requests
-from torch import nn
-from torch.nn import CrossEntropyLoss
-from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForSeq2SeqLM, T5Config
 import torch
 MAX_SOURCE_LENGTH = 512
-class ReviewerModel(T5ForConditionalGeneration):
-    def __init__(self, config):
-        super().__init__(config)
-        self.cls_head = nn.Linear(self.config.d_model, 2, bias=True)
-        self.init()
-    def init(self):
-        nn.init.xavier_uniform_(self.lm_head.weight)
-        factor = self.config.initializer_factor
-        self.cls_head.weight.data.normal_(mean=0.0, \
-                                          std=factor * ((self.config.d_model) ** -0.5))
-        self.cls_head.bias.data.zero_()
-    def forward(
-            self, *argv, **kwargs
-    ):
-        r"""
-        Doc from Huggingface transformers:
-        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
-            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[-100, 0, ...,
-            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
-            labels in ``[0, ..., config.vocab_size]``
-        Returns:
-        Examples::
-            >>> from transformers import T5Tokenizer, T5ForConditionalGeneration
-            >>> tokenizer = T5Tokenizer.from_pretrained('t5-small')
-            >>> model = T5ForConditionalGeneration.from_pretrained('t5-small')
-            >>> # training
-            >>> input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-            >>> labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-            >>> outputs = model(input_ids=input_ids, labels=labels)
-            >>> loss = outputs.loss
-            >>> logits = outputs.logits
-            >>> # inference
-            >>> input_ids = tokenizer("summarize: studies have shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
-            >>> outputs = model.generate(input_ids)
-            >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-            >>> # studies have shown that owning a dog is good for you.
-        """
-        if "cls" in kwargs:
-            assert (
-                    "input_ids" in kwargs and \
-                    "labels" in kwargs and \
-                    "attention_mask" in kwargs
-            )
-            return self.cls(
-                input_ids=kwargs["input_ids"],
-                labels=kwargs["labels"],
-                attention_mask=kwargs["attention_mask"],
-            )
-        if "input_labels" in kwargs:
-            assert (
-                    "input_ids" in kwargs and \
-                    "input_labels" in kwargs and \
-                    "decoder_input_ids" in kwargs and \
-                    "attention_mask" in kwargs and \
-                    "decoder_attention_mask" in kwargs
-            ), "Please give these arg keys."
-            input_ids = kwargs["input_ids"]
-            input_labels = kwargs["input_labels"]
-            decoder_input_ids = kwargs["decoder_input_ids"]
-            attention_mask = kwargs["attention_mask"]
-            decoder_attention_mask = kwargs["decoder_attention_mask"]
-            if "encoder_loss" not in kwargs:
-                encoder_loss = True
-            else:
-                encoder_loss = kwargs["encoder_loss"]
-            return self.review_forward(input_ids, input_labels, decoder_input_ids, attention_mask,
-                                       decoder_attention_mask, encoder_loss)
-        return super().forward(*argv, **kwargs)
-    def cls(
-            self,
-            input_ids,
-            labels,
-            attention_mask,
-    ):
-        encoder_outputs = self.encoder( \
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            output_attentions=False,
-            return_dict=False
-        )
-        hidden_states = encoder_outputs[0]
-        first_hidden = hidden_states[:, 0, :]
-        first_hidden = nn.Dropout(0.3)(first_hidden)
-        logits = self.cls_head(first_hidden)
-        loss_fct = CrossEntropyLoss()
-        if labels != None:
-            loss = loss_fct(logits, labels)
-            return loss
-        return logits
-    def review_forward(
-            self,
-            input_ids,
-            input_labels,
-            decoder_input_ids,
-            attention_mask,
-            decoder_attention_mask,
-            encoder_loss=True
-    ):
-        encoder_outputs = self.encoder( \
-            input_ids=input_ids,
-            attention_mask=attention_mask,
-            output_attentions=False,
-            return_dict=False
-        )
-        hidden_states = encoder_outputs[0]
-        decoder_inputs = self._shift_right(decoder_input_ids)
-        # Decode
-        decoder_outputs = self.decoder(
-            input_ids=decoder_inputs,
-            attention_mask=decoder_attention_mask,
-            encoder_hidden_states=hidden_states,
-            encoder_attention_mask=attention_mask,
-            output_attentions=False,
-            return_dict=False
-        )
-        sequence_output = decoder_outputs[0]
-        if self.config.tie_word_embeddings:  # this is True default
-            sequence_output = sequence_output * (self.model_dim ** -0.5)
-        if encoder_loss:
-            # print(self.encoder.get_input_embeddings().weight.shape)
-            cls_logits = nn.functional.linear(hidden_states, self.encoder.get_input_embeddings().weight)
-            # cls_logits = self.cls_head(hidden_states)
-        lm_logits = self.lm_head(sequence_output)
-        if decoder_input_ids is not None:
-            lm_loss_fct = CrossEntropyLoss(ignore_index=0)  # Warning: PAD_ID should be 0
-            loss = lm_loss_fct(lm_logits.view(-1, lm_logits.size(-1)), decoder_input_ids.view(-1))
-            if encoder_loss and input_labels is not None:
-                cls_loss_fct = CrossEntropyLoss(ignore_index=-100)
-                loss += cls_loss_fct(cls_logits.view(-1, cls_logits.size(-1)), input_labels.view(-1))
-            return loss
-        return cls_logits, lm_logits
-def prepare_models():
-    tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")
-    tokenizer.special_dict = {
-        f"<e{i}>": tokenizer.get_vocab()[f"<e{i}>"] for i in range(99, -1, -1)
-    }
-    tokenizer.mask_id = tokenizer.get_vocab()["<mask>"]
-    tokenizer.bos_id = tokenizer.get_vocab()["<s>"]
-    tokenizer.pad_id = tokenizer.get_vocab()["<pad>"]
-    tokenizer.eos_id = tokenizer.get_vocab()["</s>"]
-    tokenizer.msg_id = tokenizer.get_vocab()["<msg>"]
-    tokenizer.keep_id = tokenizer.get_vocab()["<keep>"]
-    tokenizer.add_id = tokenizer.get_vocab()["<add>"]
-    tokenizer.del_id = tokenizer.get_vocab()["<del>"]
-    tokenizer.start_id = tokenizer.get_vocab()["<start>"]
-    tokenizer.end_id = tokenizer.get_vocab()["<end>"]
-    config = T5Config.from_pretrained("microsoft/codereviewer")
-    model = ReviewerModel.from_pretrained("microsoft/codereviewer", config=config)
-    model.eval()
-    return tokenizer, model
-def pad_assert(tokenizer, source_ids):
-    source_ids = source_ids[:MAX_SOURCE_LENGTH - 2]
-    source_ids = [tokenizer.bos_id] + source_ids + [tokenizer.eos_id]
-    pad_len = MAX_SOURCE_LENGTH - len(source_ids)
-    source_ids += [tokenizer.pad_id] * pad_len
-    assert len(source_ids) == MAX_SOURCE_LENGTH, "Not equal length."
-    return source_ids
-def encode_diff(tokenizer, diff, msg, source):
-    difflines = diff.split("\n")[1:]  # remove start @@
-    difflines = [line for line in difflines if len(line.strip()) > 0]
-    map_dic = {"-": 0, "+": 1, " ": 2}
-    def f(s):
-        if s in map_dic:
-            return map_dic[s]
-        else:
-            return 2
-    labels = [f(line[0]) for line in difflines]
-    difflines = [line[1:].strip() for line in difflines]
-    inputstr = "<s>" + source + "</s>"
-    inputstr += "<msg>" + msg
-    for label, line in zip(labels, difflines):
-        if label == 1:
-            inputstr += "<add>" + line
-        elif label == 0:
-            inputstr += "<del>" + line
-        else:
-            inputstr += "<keep>" + line
-    source_ids = tokenizer.encode(inputstr, max_length=MAX_SOURCE_LENGTH, truncation=True)[1:-1]
-    source_ids = pad_assert(tokenizer, source_ids)
-    return source_ids
-class FileDiffs(object):
-    def __init__(self, diff_string):
-        diff_array = diff_string.split("\n")
-        self.file_name = diff_array[0]
-        self.file_path = self.file_name.split("a/", 1)[1].rsplit("b/", 1)[0]
-        self.diffs = list()
-        for line in diff_array[4:]:
-            if line.startswith("@@"):
-                self.diffs.append(str())
-            self.diffs[-1] += "\n" + line
-def review_commit(user="p4vv37", repository="ueflow", commit="610a8c7b02b946bc9e5e26e6dacbba0e2abba259"):
-    tokenizer, model = prepare_models()
-    # Get diff and commit metadata from GitHub API
-    commit_metadata = requests.get(F"https://api.github.com/repos/{user}/{repository}/commits/{commit}").json()
-    msg = commit_metadata["commit"]["message"]
-    diff_data = requests.get(F"https://api.github.com/repos/{user}/{repository}/commits/{commit}",
-                             headers={"Accept": "application/vnd.github.diff"})
-    code_diff = diff_data.text
-    # Parse diff into FileDiffs objects
-    files_diffs = list()
-    for file in code_diff.split("diff --git"):
-        if len(file) > 0:
-            fd = FileDiffs(file)
-            files_diffs.append(fd)
-    # Generate comments for each diff
-    output = ""
-    for fd in files_diffs:
-        output += F"File:{fd.file_path}\n"
-        source = requests.get(F"https://raw.githubusercontent.com/{user}/{repository}/^{commit}/{fd.file_path}").text
-        for diff in fd.diffs:
-            inputs = torch.tensor([encode_diff(tokenizer, diff, msg, source)], dtype=torch.long).to("cpu")
-            inputs_mask = inputs.ne(tokenizer.pad_id)
-            logits = model(
-                input_ids=inputs,
-                cls=True,
-                attention_mask=inputs_mask,
-                labels=None,
-                use_cache=True,
-                num_beams=5,
-                early_stopping=True,
-                max_length=100
-            )
-            needs_review = torch.argmax(logits, dim=-1).cpu().numpy()[0]
-            if not needs_review:
-                continue
-            preds = model.generate(inputs,
-                                   attention_mask=inputs_mask,
-                                   use_cache=True,
-                                   num_beams=5,
-                                   early_stopping=True,
-                                   max_length=100,
-                                   num_return_sequences=2
-                                   )
-            preds = list(preds.cpu().numpy())
-            pred_nls = [tokenizer.decode(_id[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
-                        for _id in preds]
-            output += diff + "\n#######\nComment:\n#######\n" + pred_nls[0] + "\n#######\n"
-    return output
-description = "An interface for running " \
-              "\"Microsoft CodeBERT CodeReviewer: Pre-Training for Automating Code Review Activities.\" " \
-              "(microsoft/codereviewer) on GitHub commits."
-examples = [
-    ["p4vv37", "ueflow", "610a8c7b02b946bc9e5e26e6dacbba0e2abba259"],
-    ["microsoft", "vscode", "378b0d711f6b82ac59b47fb246906043a6fb995a"],
-]
-iface = gr.Interface(fn=review_commit,
-                     description=description,
-                     inputs=["text", "text", "text"],
-                     outputs="text",
-                     examples=examples)
-iface.launch()

+# Import necessary modules
+import gradio as gr
 import requests
+from transformers import AutoTokenizer, T5ForConditionalGeneration, T5Config
 import torch
+# Define maximum sequence length
 MAX_SOURCE_LENGTH = 512
+# Load tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")
+tokenizer.add_special_tokens({'additional_special_tokens': ['<e99>', '<e98>',..., '<e0>', '<msg>', '<add>', '<del>', '<keep>']})
+config = T5Config.from_pretrained("microsoft/codereviewer")
+model = T5ForConditionalGeneration.from_pretrained("microsoft/codereviewer", config=config)
+model.eval()
+def pad_to_max_length(source_ids):
+  source_ids = source_ids[:MAX_SOURCE_LENGTH-2]
+  source_ids = [tokenizer.bos_token_id] + source_ids + [tokenizer.eos_token_id]
+  pad_len = MAX_SOURCE_LENGTH - len(source_ids)
+  source_ids += [tokenizer.pad_token_id] * pad_len
+  assert len(source_ids) == MAX_SOURCE_LENGTH
+  return source_ids
+def encode_diff(diff, msg, source):
+  lines = diff.split('\n')[1:]
+  lines = [line for line in lines if line.strip()]
+  labels = [0 if line[0] == '-' else 1 if line[0] == '+' else 2 for line in lines]
+  lines = [line[1:].strip() for line in lines]
+  tokens = [tokenizer.bos_token] + tokenizer.tokenize(source) + [tokenizer.eos_token]
+  tokens += tokenizer.tokenize(msg)
+  for label, line in zip(labels, lines):
+     if label == 1:
+        tokens += ['<add>'] + tokenizer.tokenize(line)
+     elif label == 0:
+        tokens += ['<del>'] + tokenizer.tokenize(line)
+     else:
+        tokens += ['<keep>'] + tokenizer.tokenize(line)
+  return pad_to_max_length(tokenizer.convert_tokens_to_ids(tokens))
+def get_diffs_and_msg(user, repo, commit):
+  commit_data = requests.get(f'https://api.github.com/repos/{user}/{repo}/commits/{commit}').json()
+  msg = commit_data['commit']['message']
+  diff_response = requests.get(f'https://api.github.com/repos/{user}/{repo}/commits/{commit}',
+                               headers={'Accept': 'application/vnd.github.diff'})
+  diffs = diff_response.text
+  return diffs, msg
+def generate_comments(user, repo, commit):
+  diffs, msg = get_diffs_and_msg(user, repo, commit)
+  file_diffs = []
+  for diff in diffs.split('diff --git')[1:]:
+    lines = diff.split('\n')
+    file_name = lines[0].split(' a/')[1].split(' b/')[0]
+    file_diffs.append({'name': file_name, 'diff': diff})
+  output = ''
+  for fd in file_diffs:
+    source = requests.get(f'https://raw.githubusercontent.com/{user}/{repo}/{commit}/{fd["name"]}').text
+    encoded = encode_diff(fd['diff'], msg, source)
+    input_ids = torch.tensor([encoded]).to(model.device)
+    attention_mask = input_ids.ne(tokenizer.pad_token_id).to(model.device)
+    output_sequences = model.generate(
+      input_ids=input_ids,
+      attention_mask=attention_mask,
+      max_length=100,
+      num_beams=5,
+      num_return_sequences=2,
+      early_stopping=True
+    )
+    comments = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_sequences]
+    output += f'File: {fd["name"]}\n{fd["diff"]}\n\nComments:\n{comments[0]}\n\n'
+  return output