Spaces:
Sleeping
Sleeping
import argparse
import csv
import os
import random
from timeit import default_timer as timer
from typing import Callable, Dict, List, Tuple, Union

import datasets
import matplotlib as plt  # NOTE(review): alias suggests matplotlib.pyplot was intended — unused here, confirm
import numpy as np
import pandas as pd
import streamlit as st
import torch
import transformers
from huggingface_hub import login
from streamlit_option_menu import option_menu
from torch.optim import lr_scheduler
from torch.utils.data import Dataset
from tqdm import tqdm
def load_tokenizer(tokenizer_name: str) -> object:
    """
    Load a Hugging Face tokenizer by name.

    Args:
        tokenizer_name: hub id of the tokenizer to download.

    Returns:
        The loaded tokenizer instance.
    """
    # Bug fix: the original ignored ``tokenizer_name`` and always loaded the
    # hard-coded "Salesforce/codet5p-770m" checkpoint.
    tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name)
    return tokenizer
def load_model(model_name: str) -> object:
    """
    Load a T5 encoder backbone in bfloat16.

    Args:
        model_name: hub id of the model checkpoint to download.

    Returns:
        The encoder-only model (decoder weights are ignored on load).
    """
    print(f'Loading model {model_name}...')
    # bfloat16 halves memory versus fp32 and matches the classifier head's dtype.
    model_kwargs = {"torch_dtype": torch.bfloat16}
    # Loading an encoder-only module from a full T5 checkpoint would otherwise
    # warn about every decoder weight.
    transformers.T5EncoderModel._keys_to_ignore_on_load_unexpected = ["decoder.*"]
    # Bug fix: the original ignored ``model_name`` and hard-coded the checkpoint.
    model_encoder = transformers.T5EncoderModel.from_pretrained(model_name, **model_kwargs)
    print("---MODEL LOADED---")
    return model_encoder
class stylometer_classifier(torch.nn.Module):
    """
    Binary stylometry classifier: a pretrained encoder plus a small MLP head
    that labels a code snippet as human- or LLM-written.
    """

    def __init__(self, pretrained_encoder, dimensionality, threshold=0.07):
        """
        Args:
            pretrained_encoder: encoder whose first output is the last hidden
                state of shape (batch, seq, dimensionality).
            dimensionality: size of the encoder's output embeddings.
            threshold: sigmoid score at or above which the snippet is labelled
                human-written. Default 0.07 preserves the original behavior.
        """
        super(stylometer_classifier, self).__init__()
        self.modelBase = pretrained_encoder
        # Head runs in bfloat16 to match the encoder's dtype.
        self.pre_classifier = torch.nn.Linear(dimensionality, 768, dtype=torch.bfloat16)
        self.activation = torch.nn.ReLU()
        self.dropout = torch.nn.Dropout(0.2)
        self.classifier = torch.nn.Linear(768, 1, dtype=torch.bfloat16)
        self.threshold = threshold

    def forward(self, input_ids, padding_mask):
        """
        Classify one batch of token ids.

        Returns:
            dict with "my_class" (human/LLM label string) and "prob"
            (the raw sigmoid score tensor).

        NOTE(review): the ``if score >= threshold`` truth test only works for
        a single-element batch — confirm callers never pass batch > 1.
        """
        encoder_out = self.modelBase(input_ids=input_ids, attention_mask=padding_mask)
        hidden_state = encoder_out[0]
        # Only the first (CLS-position) token representation feeds the head.
        cls_output = hidden_state[:, 0]
        pooled = self.pre_classifier(cls_output)
        activated = self.activation(pooled)
        dropped = self.dropout(activated)
        score = torch.sigmoid(self.classifier(dropped))
        # Bug fix: the original had an unreachable ``return output`` after
        # this if/else; it has been removed.
        if score >= self.threshold:
            return {"my_class": "It's a Human!", "prob": score}
        return {"my_class": "It's an LLM!", "prob": score}
def adapt_model(model: object, dim: int = 1024) -> object:
    """
    Wrap ``model`` with the stylometer classification head.

    Args:
        model: the pretrained encoder backbone.
        dim: dimensionality of the encoder's output embeddings.

    Returns:
        A ``stylometer_classifier`` wrapping ``model``.
    """
    return stylometer_classifier(model, dimensionality=dim)
def main():
    """
    Streamlit entry point.

    Loads the CodeT5+ encoder, attaches the stylometer classification head,
    restores fine-tuned weights from ``checkpoint.bin``, and serves a text
    box that classifies pasted code as human- or LLM-written.
    """
    print("----starting enviroment----")
    model_name = "Salesforce/codet5p-770m"
    checkpoint = "checkpoint.bin"
    # NOTE(review): DEVICE is never used below — inference is pinned to CPU
    # via ``map_location`` instead; confirm whether GPU support was intended.
    DEVICE = "cpu"
    # Load the tokenizer matching the encoder checkpoint.
    tokenizer = load_tokenizer(model_name)
    print("tokenizer loaded!")
    # Load the encoder backbone.
    model = load_model(model_name)
    # Attach the classification head; dim matches the encoder embedding size.
    model = adapt_model(model, dim=model.shared.embedding_dim)
    model.load_state_dict(torch.load(checkpoint, map_location='cpu'))
    model = model.eval()
    st.title("Human-AI stylometer - Multilingual")
    st.caption('From the paper: Is This You, LLM? Recognizing AI-written Programs with Multilingual Code Stylometry')
    text = st.text_area("insert your code here")
    button = st.button("send")
    if button or text:
        # Renamed from ``input`` to avoid shadowing the builtin.
        encoded = tokenizer([text])
        # no_grad: pure inference, no autograd graph needed.
        with torch.no_grad():
            out = model(torch.tensor(encoded.input_ids), torch.tensor(encoded.attention_mask))
        st.write(out["my_class"])