import gradio as gr
import spacy
import re
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Validation thresholds
MAX_TEXT_LENGTH = 2048
MIN_GEN_LENGTH = 1
MAX_GEN_LENGTH = 2048
MIN_AI_PERCENTAGE = 50
# Sanity bounds on the perplexity of the generated continuations
MIN_PERPLEXITY = 1
MAX_PERPLEXITY = 2048

# Load the spaCy English pipeline, downloading it only if it is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


def detect_ai_content(text, max_gen_length, temperature, model_name, model_size,
                      num_return_sequences, min_ai_percentage):
    # Normalize whitespace first, but keep punctuation until after sentence
    # segmentation: spaCy relies on it to find sentence boundaries.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(cleaned_text)
    if not re.search(r'\S', cleaned_text) or len(list(doc.sents)) < 2:
        return {"error": "Input text must contain at least two sentences."}

    # Strip punctuation only once the sentence check has passed
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    if len(cleaned_text) > MAX_TEXT_LENGTH:
        return {"error": f"Input text must be no longer than {MAX_TEXT_LENGTH} characters."}

    if not (0 <= min_ai_percentage <= 100):
        return {"error": "Minimum threshold for AI percentage must be between 0 and 100."}

    # Resolve the checkpoint name, e.g. "gpt2" + "medium" -> "gpt2-medium"
    model_name = f"{model_name}-{model_size}"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
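    # Note: from_pretrained caches downloaded weights locally (by default under
    # ~/.cache/huggingface), so the download cost is paid once per checkpoint.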

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # GPT-2 checkpoints define no pad token; reuse EOS so generate() can pad
    eos_token_id = tokenizer.eos_token_id

    input_ids = tokenizer.encode(cleaned_text, add_special_tokens=True, return_tensors='pt').to(device)

    # Sample num_return_sequences continuations of the input text
    output_sequences = []
    for _ in range(num_return_sequences):
        output_sequence = model.generate(
            input_ids=input_ids,
            max_length=max_gen_length + len(input_ids[0]),
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=eos_token_id
        )[0]
        output_sequences.append(output_sequence)
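    # The loop above could equivalently be a single generate() call with
    # num_return_sequences=num_return_sequences, iterating over the returned
    # batch of sequences.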

    generated_texts = []
    perplexities = []
    ai_percentages = []

    for output_sequence in output_sequences:
        # Decode only the continuation, dropping the prompt tokens
        generated_text = tokenizer.decode(output_sequence.tolist()[len(input_ids[0]):], skip_special_tokens=True)
        if not generated_text.strip():
            # An empty continuation would crash the loss computation below
            continue

        # Length of the continuation relative to the input, as a percentage
        ai_percentage = round(len(generated_text) / len(cleaned_text) * 100, 2)
        ai_percentages.append(ai_percentage)

        # Perplexity of the continuation under the same model
        generated_input_ids = tokenizer.encode(generated_text, add_special_tokens=False, return_tensors='pt').to(device)
        with torch.no_grad():
            loss = model(generated_input_ids, labels=generated_input_ids).loss.item()
        perplexity = np.exp(loss)
        perplexities.append(perplexity)
        generated_texts.append(generated_text)
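    # Perplexity is exp(mean token cross-entropy); lower values mean the model
    # finds the text more predictable, a common heuristic signal in AI-text
    # detection.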

    # Keep only continuations that differ from the input and from each other.
    # Note: en_core_web_sm ships without static word vectors, so Doc.similarity
    # falls back to a coarse approximation and spaCy emits a warning; an
    # en_core_web_md or en_core_web_lg model would give more meaningful scores.
    clean_doc = nlp(cleaned_text)
    unique_generated_texts = []

    for generated_text in generated_texts:
        gen_doc = nlp(generated_text)
        similarity = clean_doc.similarity(gen_doc)

        if similarity < 0.8:
            is_unique = True
            for unique_text in unique_generated_texts:
                unique_doc = nlp(unique_text)
                if unique_doc.similarity(gen_doc) >= 0.8:
                    is_unique = False
                    break
            if is_unique:
                unique_generated_texts.append(generated_text)

    # Burstiness: squared type/token ratio of the input, with punctuation and
    # stop words excluded
    doc = nlp(cleaned_text)
    all_tokens = []
    for sent in doc.sents:
        all_tokens += [token.text.lower() for token in sent if not token.is_punct and not token.is_stop]

    num_unique_tokens = len(set(all_tokens))
    total_tokens = len(all_tokens)
    burstiness_score = (num_unique_tokens / total_tokens) ** 2 if total_tokens else 0.0
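    # Worked example: 8 distinct tokens out of 10 total gives (8/10)**2 = 0.64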

    avg_perplexity = np.mean(perplexities)
    avg_ai_percentage = round(np.mean(ai_percentages), 2)

    if avg_ai_percentage < min_ai_percentage:
        return {"error": f"The generated text has an AI percentage of {avg_ai_percentage}%, which is below the minimum threshold of {min_ai_percentage}%."}
    if avg_perplexity < MIN_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is below the minimum threshold of {MIN_PERPLEXITY}."}
    if avg_perplexity > MAX_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is above the maximum threshold of {MAX_PERPLEXITY}."}

    return {
        "generated_texts": unique_generated_texts,
        "burstiness_score": round(burstiness_score, 2),
        "avg_ai_percentage": avg_ai_percentage,
        "avg_perplexity": round(avg_perplexity, 2)
    }


def bai_chat(text, max_gen_length=256, temperature=0.7, model_name="gpt2",
             model_size="medium", num_return_sequences=5,
             min_ai_percentage=MIN_AI_PERCENTAGE):
    result = detect_ai_content(text, max_gen_length, temperature, model_name,
                               model_size, num_return_sequences, min_ai_percentage)

    if "error" in result:
        return result["error"]

    generated_texts = "\n\n".join(result["generated_texts"])
    return (f"Burstiness Score: {result['burstiness_score']}\n"
            f"Average AI Percentage: {result['avg_ai_percentage']}%\n"
            f"Average Perplexity Score: {result['avg_perplexity']}\n\n"
            f"{generated_texts}")

# The gr.inputs / gr.outputs namespaces are deprecated; use the top-level
# components instead, with the prompt string as a placeholder rather than a
# positional argument.
gr.Interface(
    fn=bai_chat,
    inputs=gr.Textbox(placeholder="Enter your text here...", lines=8),
    outputs=gr.Textbox(label="Generated Texts")
).launch()
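# Usage sketch (assuming this file is saved as app.py, a hypothetical name):
# run `python app.py` and open the local URL Gradio prints
# (http://127.0.0.1:7860 by default); pass share=True to launch() for a
# temporary public link.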