Spaces:
Sleeping
Sleeping
import argparse
import csv
import os
import random
from timeit import default_timer as timer
from typing import Callable, Dict, List, Tuple, Union

import datasets
import matplotlib as plt  # NOTE(review): alias suggests matplotlib.pyplot was intended — unused here, confirm
import numpy as np
import pandas as pd
import streamlit as st
import torch
import transformers
from huggingface_hub import login
from streamlit_option_menu import option_menu
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from tqdm import tqdm
def load_tokenizer(tokenizer_name: str) -> object:
    """
    Load a Hugging Face tokenizer by name.

    Args:
        tokenizer_name: hub id of the tokenizer to download.

    Returns:
        The loaded tokenizer instance.
    """
    # Bug fix: the original ignored ``tokenizer_name`` and always loaded the
    # hard-coded "Salesforce/codet5p-770m" checkpoint.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer
def load_model(model_name: str) -> object:
    """
    Load a T5 encoder backbone in bfloat16.

    Args:
        model_name: hub id of the model checkpoint to download.

    Returns:
        The encoder-only model (decoder weights are ignored on load).
    """
    print(f'Loading model {model_name}...')
    # bfloat16 halves memory versus fp32 and matches the classifier head's dtype.
    model_kwargs = {"torch_dtype": torch.bfloat16}
    # Loading an encoder-only module from a full T5 checkpoint would otherwise
    # warn about every decoder weight.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    # Bug fix: the original ignored ``model_name`` and hard-coded the checkpoint.
    model_encoder = transformers.T5EncoderModel.from_pretrained(model_name, **model_kwargs)
    print("---MODEL LOADED---")
    return model_encoder
class stylometer_classifier(torch.nn.Module):
    """
    Binary stylometry classifier: a pretrained encoder plus a small MLP head
    that labels a code snippet as human- or LLM-written.
    """

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        """
        Args:
            pretrained_encoder: encoder whose first output is the last hidden
                state of shape (batch, seq, dimensionality).
            dimensionality: size of the encoder's output embeddings.
            threshold: sigmoid score at or above which the snippet is labelled
                human-written. Default 0.07 preserves the original behavior.
        """
        super(stylometer_classifier, self).__init__()
        self.modelBase = pretrained_encoder
        # Head runs in bfloat16 to match the encoder's dtype.
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Classify one batch of token ids.

        Returns:
            dict with "my_class" (human/LLM label string) and "prob"
            (the raw sigmoid score tensor).

        NOTE(review): the ``if score >= threshold`` truth test only works for
        a single-element batch — confirm callers never pass batch > 1.
        """
        encoder_out = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_out[0]
        # Only the first (CLS-position) token representation feeds the head.
        cls_output = hidden_state[:, 0]
        pooled = self.pre_classifier(cls_output)
        activated = self.activation(pooled)
        dropped = self.dropout(activated)
        score = torch.sigmoid(self.classifier(dropped))
        # Bug fix: the original had an unreachable ``return output`` after
        # this if/else; it has been removed.
        if score >= self.threshold:
            return {"my_class": "It's a Human!", "prob": score}
        return {"my_class": "It's an LLM!", "prob": score}
def adapt_model(model: object, dim: int = 1024) -> object:
    """
    Wrap ``model`` with the stylometer classification head.

    Args:
        model: the pretrained encoder backbone.
        dim: dimensionality of the encoder's output embeddings.

    Returns:
        A ``stylometer_classifier`` wrapping ``model``.
    """
    return stylometer_classifier(model, dimensionality=dim)
def main():
    """
    Streamlit entry point.

    Loads the CodeT5+ encoder, attaches the stylometer classification head,
    restores fine-tuned weights from ``checkpoint.bin``, and serves a text
    box that classifies pasted code as human- or LLM-written.
    """
    print("----starting enviroment----")
    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"
    # NOTE(review): DEVICE is never used below — inference is pinned to CPU
    # via ``map_location`` instead; confirm whether GPU support was intended.
    DEVICE = "cpu"
    # Load the tokenizer matching the encoder checkpoint.
    tokenizer = load_tokenizer(model_name)
    print("tokenizer loaded!")
    # Load the encoder backbone.
    model = load_model(model_name)
    # Attach the classification head; dim matches the encoder embedding size.
    model = adapt_model(model, dim=model.shared.embedding_dim)
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model = model.eval()
    st.title("Human-AI stylometer - Multilingual")
    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')
    text = st.text_area("insert your code here")
    button = st.button("send")
    if button or text:
        # Renamed from ``input`` to avoid shadowing the builtin.
        encoded = tokenizer([text])
        # no_grad: pure inference, no autograd graph needed.
        with torch.no_grad():
            out = model(torch.tensor(encoded.input_ids), torch.tensor(encoded.attention_mask))
        st.write(out["my_class"])