# imseldrith's picture
# Update main.py
# 72fe39e
import gradio as gr
import spacy
import re
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# Limits used when validating requests and scores in detect_ai_content:
#   MAX_TEXT_LENGTH   - maximum number of characters in the cleaned input text.
#   MIN_GEN_LENGTH /
#   MAX_GEN_LENGTH    - lower / upper bound applied to the average perplexity.
#   MIN_AI_PERCENTAGE - default minimum AI percentage.
MAX_TEXT_LENGTH = 2048
MIN_GEN_LENGTH = 1
MAX_GEN_LENGTH = 2048
MIN_AI_PERCENTAGE = 50

# Load the small English spaCy pipeline. spacy.load raises OSError when the
# package is not installed, so the download only happens on that path instead
# of re-fetching the model on every start-up.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# Cache of loaded (tokenizer, model) pairs keyed by (model id, device), so that
# repeated requests do not reload the weights on every call.
_MODEL_CACHE = {}


def _load_model(model_id, device):
    """Return the cached ``(tokenizer, model)`` pair for *model_id* on *device*."""
    cache_key = (model_id, device)
    if cache_key not in _MODEL_CACHE:
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(model_id)
        model.to(device)
        _MODEL_CACHE[cache_key] = (tokenizer, model)
    return _MODEL_CACHE[cache_key]


def detect_ai_content(
    text,
    max_gen_length,
    temperature,
    model_name,
    model_size,
    num_return_sequences,
    min_ai_percentage,
):
    """Generate continuations of *text* and score them.

    The input is cleaned (whitespace collapsed, punctuation removed), used as a
    prompt for a causal language model, and each sampled continuation is scored
    for perplexity under that same model. A vocabulary-diversity ("burstiness")
    score is computed for the cleaned input.

    Args:
        text: Raw input text; must contain at least two sentences.
        max_gen_length: Number of tokens allowed on top of the prompt length.
        temperature: Sampling temperature passed to ``model.generate``.
        model_name: Model family; combined with *model_size* as
            ``"{model_name}-{model_size}"`` to form the model identifier.
        model_size: Model size suffix (see *model_name*).
        num_return_sequences: Number of continuations to sample.
        min_ai_percentage: Threshold (0-100) that the average AI percentage
            must reach.

    Returns:
        On success, a dict with the keys ``generated_texts`` (continuations
        that are dissimilar to the input and to each other),
        ``burstiness_score``, ``avg_ai_percentage`` and ``avg_perplexity``.
        On any validation or threshold failure, a dict with a single
        ``error`` key holding a human-readable message.
    """
    # Collapse whitespace first. Punctuation is still present in this version,
    # which is what sentence segmentation needs to find sentence boundaries.
    normalized_text = re.sub(r'\s+', ' ', text).strip()
    cleaned_text = re.sub(r'[^\w\s]', '', normalized_text)

    # Count sentences on the punctuated text: counting them after punctuation
    # has been stripped makes the two-sentence requirement unreliable.
    if not re.search(r'\S', cleaned_text) or len(list(nlp(normalized_text).sents)) < 2:
        return {"error": "Input text must contain at least two sentences."}
    if len(cleaned_text) > MAX_TEXT_LENGTH:
        return {"error": f"Input text must be no longer than {MAX_TEXT_LENGTH} characters."}
    if not (0 <= min_ai_percentage <= 100):
        return {"error": "Minimum threshold for AI percentage must be between 0 and 100."}

    # Load (or reuse) the requested model on the best available device.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer, model = _load_model(f"{model_name}-{model_size}", device)
    eos_token_id = tokenizer.eos_token_id

    input_ids = tokenizer.encode(cleaned_text, add_special_tokens=True, return_tensors='pt').to(device)
    prompt_length = len(input_ids[0])

    # Sample the continuations one at a time.
    output_sequences = [
        model.generate(
            input_ids=input_ids,
            max_length=max_gen_length + prompt_length,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            do_sample=True,
            num_return_sequences=1,
            # Passed explicitly so generation has a padding token even for
            # tokenizers that define none.
            pad_token_id=eos_token_id,
        )[0]
        for _ in range(num_return_sequences)
    ]

    generated_texts = []
    perplexities = []
    ai_percentages = []
    for output_sequence in output_sequences:
        # Drop the prompt tokens so only the newly generated part is decoded.
        generated_text = tokenizer.decode(output_sequence.tolist()[prompt_length:], skip_special_tokens=True)

        # "AI percentage" is the length of the continuation relative to the
        # length of the cleaned input, in percent.
        ai_percentages.append(round(len(generated_text) / len(cleaned_text) * 100, 2))

        generated_input_ids = tokenizer.encode(generated_text, add_special_tokens=False, return_tensors='pt').to(device)
        # A causal-LM loss predicts each token from the ones before it, so a
        # sample with fewer than two tokens has nothing to score. Skip it
        # rather than crash or average in a NaN.
        if generated_input_ids.shape[-1] < 2:
            continue
        with torch.no_grad():
            loss = model(generated_input_ids, labels=generated_input_ids).loss.item()
        perplexities.append(np.exp(loss))
        generated_texts.append(generated_text)

    if not perplexities:
        return {"error": "The model did not generate enough text to compute a perplexity score."}

    # Keep only continuations that are dissimilar (< 0.8) both to the input and
    # to every continuation already kept. Parsed docs are stored alongside the
    # kept texts so they are not re-parsed for each comparison.
    # NOTE(review): if `nlp` is a pipeline without word vectors, Doc.similarity
    # may be a weak signal - confirm the 0.8 cut-off is meaningful.
    clean_doc = nlp(cleaned_text)
    unique_generated_texts = []
    unique_docs = []
    for generated_text in generated_texts:
        gen_doc = nlp(generated_text)
        if clean_doc.similarity(gen_doc) >= 0.8:
            continue
        if any(kept_doc.similarity(gen_doc) >= 0.8 for kept_doc in unique_docs):
            continue
        unique_generated_texts.append(generated_text)
        unique_docs.append(gen_doc)

    # Burstiness: squared ratio of distinct to total content tokens
    # (lower-cased, excluding punctuation and stop words) in the input.
    all_tokens = [
        token.text.lower()
        for token in clean_doc
        if not token.is_punct and not token.is_stop
    ]
    num_unique_tokens = len(set(all_tokens))
    total_tokens = len(all_tokens)
    if total_tokens:
        burstiness_score = (num_unique_tokens * num_unique_tokens) / (total_tokens * total_tokens)
    else:
        # Every token was a stop word; avoid dividing by zero.
        burstiness_score = 0.0

    avg_perplexity = np.mean(perplexities)
    avg_ai_percentage = round(np.mean(ai_percentages), 2)

    if avg_ai_percentage < min_ai_percentage:
        return {"error": f"The generated text has an AI percentage of {avg_ai_percentage}%, which is below the minimum threshold of {min_ai_percentage}%."}
    # NOTE(review): the perplexity bounds reuse MIN_GEN_LENGTH / MAX_GEN_LENGTH,
    # whose names suggest generation-length limits - confirm this is intended.
    if avg_perplexity < MIN_GEN_LENGTH:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is below the minimum threshold of {MIN_GEN_LENGTH}."}
    if avg_perplexity > MAX_GEN_LENGTH:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is above the maximum threshold of {MAX_GEN_LENGTH}."}

    return {
        "generated_texts": unique_generated_texts,
        "burstiness_score": round(burstiness_score, 2),
        "avg_ai_percentage": avg_ai_percentage,
        "avg_perplexity": round(avg_perplexity, 2),
    }
# Callback used by the Gradio interface: runs the detection and renders the
# outcome as a single block of text.
def bai_chat(text, max_gen_length=256, temperature=0.7, model_name="gpt2", model_size="medium", num_return_sequences=5, min_ai_percentage=50):
    """Run ``detect_ai_content`` on *text* and format its result for display.

    Returns the error message when the detection reports one; otherwise a
    report with the three scores followed by the generated texts, separated
    by blank lines.
    """
    report = detect_ai_content(
        text,
        max_gen_length,
        temperature,
        model_name,
        model_size,
        num_return_sequences,
        min_ai_percentage,
    )

    # Guard clause: surface the error text directly.
    if "error" in report:
        return report["error"]

    scores = (
        f"Burstiness Score: {report['burstiness_score']}\n"
        f"Average AI Percentage: {report['avg_ai_percentage']}%\n"
        f"Average Perplexity Score: {report['avg_perplexity']}"
    )
    samples = "\n\n".join(report["generated_texts"])
    return f"{scores}\n\n{samples}"
# Build the Gradio UI around bai_chat and start serving it. Only the text box
# is exposed, so every other bai_chat parameter keeps its default value.
gr.Interface(
    fn=bai_chat,
    # The hint is passed as ``placeholder``: the first positional parameter of
    # ``gr.inputs.Textbox`` is ``lines`` (a row count), not the hint text.
    inputs=[gr.inputs.Textbox(placeholder="Enter your text here...")],
    outputs=[gr.outputs.Textbox(label="Generated Texts")],
).launch()