| import gradio as gr |
| import spacy |
| import re |
| import numpy as np |
| import torch |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
|
|
| |
# Maximum number of characters accepted for the cleaned input text.
MAX_TEXT_LENGTH = 2048
# Lower and upper bound of the "max generated length" slider.
MIN_GEN_LENGTH = 1
MAX_GEN_LENGTH = 2048
# Initial value of the "minimum AI percentage" slider.
MIN_AI_PERCENTAGE = 50


# Load the small English spaCy pipeline. Only download it when it is not
# installed yet, so that a normal start-up does not hit the network (and does
# not re-run pip) every time the module is imported.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spaCy raises OSError when the requested model package cannot be found.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
|
|
| |
| |
def _resolve_model_id(model_name, model_size):
    """Combine a model name and a size label into a checkpoint identifier.

    A name that already contains ``-`` or ``/`` (for example ``gpt2-medium``
    or ``EleutherAI/gpt-neo-1.3B``) is treated as a complete checkpoint id and
    returned unchanged; appending the size to it would produce an id such as
    ``gpt2-medium-medium``. The bare ``gpt2`` name with size ``small`` maps to
    ``gpt2`` itself. Every other combination keeps the ``name-size`` form.
    """
    if not model_size or "/" in model_name or "-" in model_name:
        return model_name
    if model_name == "gpt2" and model_size == "small":
        return model_name
    return f"{model_name}-{model_size}"


def _burstiness(doc):
    """Return the ratio of distinct to total alphabetic, non-stop-word tokens.

    Returns 0.0 when the document has no such tokens, instead of dividing by
    zero.
    """
    tokens = [token.text.lower() for token in doc if not token.is_stop and token.is_alpha]
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


def detect_ai_content(text, max_gen_length, temperature, model_name, model_size, num_return_sequences, min_ai_percentage):
    """Generate continuations of ``text`` with a causal LM and score them.

    The input is cleaned (whitespace collapsed, punctuation removed), used as
    a prompt for ``num_return_sequences`` sampled continuations, and each
    continuation is scored by its length relative to the input ("AI
    percentage") and by its perplexity under the same model. Continuations
    that are too similar to the input or to each other (spaCy similarity of
    0.8 or more) are dropped, as are those below ``min_ai_percentage`` or with
    a perplexity of 5 or less.

    Args:
        text: Raw input text.
        max_gen_length: Maximum number of tokens to generate per sequence.
        temperature: Sampling temperature passed to ``generate``.
        model_name: Model name, combined with ``model_size`` to pick the
            checkpoint (see ``_resolve_model_id``).
        model_size: Size label such as ``"medium"``.
        num_return_sequences: Number of continuations to sample.
        min_ai_percentage: Minimum AI percentage (0-100) a continuation needs.

    Returns:
        On failure, ``{"error": message}``. On success, a dict with the keys
        ``cleaned_text``, ``generated_texts``, ``perplexities``,
        ``ai_percentages``, ``burstiness`` and ``avg_perplexity``; all values
        are strings or lists of strings. The three lists are ordered by
        descending ``ai_percentage * perplexity`` and line up index by index.
    """
    # Keep a whitespace-normalised copy that still has its punctuation: the
    # sentence count below is unreliable once the punctuation is gone.
    normalized_text = re.sub(r'\s+', ' ', text or '').strip()
    cleaned_text = re.sub(r'[^\w\s]', '', normalized_text)

    # Cheap checks run first so that no spaCy or model work is done for input
    # that is going to be rejected anyway.
    if not re.search(r'\S', cleaned_text):
        return {"error": "Input text must contain at least two sentences."}

    if not (0 <= min_ai_percentage <= 100):
        return {"error": "Minimum threshold for AI percentage must be between 0 and 100."}

    if len(cleaned_text) > MAX_TEXT_LENGTH:
        return {"error": f"Input text must be no longer than {MAX_TEXT_LENGTH} characters."}

    if len(list(nlp(normalized_text).sents)) < 2:
        return {"error": "Input text must contain at least two sentences."}

    # UI sliders may deliver floats; range() and max_length need integers.
    num_return_sequences = int(num_return_sequences)
    max_gen_length = int(max_gen_length)

    model_id = _resolve_model_id(model_name, model_size)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id)

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    eos_token_id = tokenizer.eos_token_id

    input_ids = tokenizer.encode(cleaned_text, add_special_tokens=True, return_tensors='pt').to(device)
    prompt_length = input_ids.shape[1]

    # Do not ask for more tokens than fit into the model's context window
    # (when the model config exposes that limit).
    context_limit = getattr(model.config, "max_position_embeddings", None)
    if context_limit is not None:
        max_gen_length = min(max_gen_length, context_limit - prompt_length)
    if max_gen_length < 1:
        return {"error": "No text can be generated: the generation length is zero "
                         "or the input fills the model's context window."}

    # Each candidate is kept as (generated_text, ai_percentage, perplexity) so
    # that the scores can never be paired with the wrong text later on.
    candidates = []
    for _ in range(num_return_sequences):
        output_sequence = model.generate(
            input_ids=input_ids,
            max_length=prompt_length + max_gen_length,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=eos_token_id,
        )[0]

        generated_text = tokenizer.decode(output_sequence.tolist()[prompt_length:], skip_special_tokens=True)
        if not generated_text.strip():
            # The model stopped immediately; there is nothing to score.
            continue

        ai_percentage = round(len(generated_text) / len(cleaned_text) * 100, 2)

        generated_input_ids = tokenizer.encode(generated_text, add_special_tokens=False, return_tensors='pt').to(device)
        if generated_input_ids.shape[1] < 2:
            # The language-model loss needs at least one next-token target.
            continue
        with torch.no_grad():
            loss = model(generated_input_ids, labels=generated_input_ids).loss.item()
        perplexity = float(np.exp(loss))

        candidates.append((generated_text, ai_percentage, perplexity))

    # Drop continuations that are near-copies of the input or of a
    # continuation that was already kept. Parsed docs are stored alongside the
    # kept entries so each text is parsed only once.
    clean_doc = nlp(cleaned_text)
    unique_candidates = []
    for generated_text, ai_percentage, perplexity in candidates:
        gen_doc = nlp(generated_text)
        if clean_doc.similarity(gen_doc) >= 0.8:
            continue
        if any(kept_doc.similarity(gen_doc) >= 0.8 for kept_doc, _, _, _ in unique_candidates):
            continue
        unique_candidates.append((gen_doc, generated_text, ai_percentage, perplexity))

    burstiness = _burstiness(clean_doc)

    valid_candidates = [
        (generated_text, ai_percentage, perplexity)
        for _, generated_text, ai_percentage, perplexity in unique_candidates
        if ai_percentage >= min_ai_percentage and perplexity > 5
    ]

    if not valid_candidates:
        return {"error": "No AI-generated content meeting the minimum threshold was found in the input text."}

    # Rank by combined score; texts and scores are sorted together so the
    # returned lists stay aligned.
    ranked = sorted(valid_candidates, key=lambda item: item[1] * item[2], reverse=True)
    ranked = ranked[:num_return_sequences]
    avg_perplexity = np.mean([perplexity for _, _, perplexity in valid_candidates])

    return {
        "cleaned_text": re.sub(r'\n+', '\n', cleaned_text),
        "generated_texts": [re.sub(r'\n+', '\n', generated_text) for generated_text, _, _ in ranked],
        "perplexities": [f"{perplexity:.2f}" for _, _, perplexity in ranked],
        "ai_percentages": [f"{ai_percentage:.2f}%" for _, ai_percentage, _ in ranked],
        "burstiness": f"{burstiness:.2f}",
        "avg_perplexity": f"{avg_perplexity:.2f}",
    }
|
|
| |
# Input widgets, in the order of detect_ai_content's parameters.
# Initial values are passed as `value=` and help texts as `info=`; the integer
# sliders use step=1 so the handler receives whole numbers.
input_textbox = gr.Textbox(label="Input Text", placeholder="Enter some text here...")
max_gen_length_slider = gr.Slider(minimum=MIN_GEN_LENGTH, maximum=MAX_GEN_LENGTH, step=1, value=256,
                                  label="Max Length for Generated Text")
temperature_slider = gr.Slider(minimum=0.1, maximum=2.0, step=0.1, value=1.0, label="Temperature",
                               info="Higher values make the model generate more random text.")
model_dropdown = gr.Dropdown(choices=["gpt2", "gpt2-medium", "gpt2-large", "gpt2-xl", "EleutherAI/gpt-neo-125M",
                                      "EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B"],
                             value="gpt2", label="GPT Model",
                             info="The pre-trained GPT model to use for text generation.")
model_size_dropdown = gr.Dropdown(choices=["small", "medium", "large", "xl"], value="medium", label="Model Size",
                                  info="The size of the pre-trained GPT model to use for text generation.")
num_return_sequences_slider = gr.Slider(minimum=1, maximum=10, step=1, value=1,
                                        label="Number of Generated Sequences to Return")
min_ai_percentage_slider = gr.Slider(minimum=0, maximum=100, value=MIN_AI_PERCENTAGE,
                                     label="Minimum Threshold for AI Percentage",
                                     info="The minimum percentage of AI-generated content that a generated sequence must have.")


# Custom stylesheet handed to the interface.
css_style = """
.gradio-input-wrapper {
margin-bottom: 10px;
}

.gradio-input-wrapper label {
font-size: 1.2rem;
font-weight: bold;
margin-bottom: 5px;
}

.gradio-slider {
height: 30px;
margin-bottom: 15px;
}

.gradio-slider .input-label-container {
display: flex;
justify-content: space-between;
align-items: center;
}

.gradio-dropdown {
margin-bottom: 20px;
}

.gradio-textbox textarea {
height: 150px;
font-size: 1.2rem;
}

.gradio-output {
background-color: #f5f5f5;
border-radius: 4px;
padding: 15px;
margin-top: 30px;
margin-bottom: 30px;
}

.gradio-output label {
font-size: 1.2rem;
font-weight: bold;
margin-bottom: 10px;
display: block;
}

.gradio-output p {
margin-bottom: 10px;
}

.gradio-error {
color: red;
font-weight: bold;
margin-top: 10px;
}

.gradio-progress {
margin-top: 20px;
}
"""


def _run_detection(*inputs):
    """Call detect_ai_content and map its dict onto the six output widgets.

    detect_ai_content returns a single dict, while the interface declares six
    output components and therefore needs six values. An ``{"error": ...}``
    result is surfaced as a Gradio error message.
    """
    result = detect_ai_content(*inputs)
    if "error" in result:
        raise gr.Error(result["error"])
    return (
        result["cleaned_text"],
        "\n\n".join(result["generated_texts"]),
        ", ".join(result["perplexities"]),
        ", ".join(result["ai_percentages"]),
        result["burstiness"],
        result["avg_perplexity"],
    )


iface = gr.Interface(
    _run_detection,
    inputs=[input_textbox, max_gen_length_slider, temperature_slider, model_dropdown, model_size_dropdown,
            num_return_sequences_slider, min_ai_percentage_slider],
    outputs=[
        gr.Label(label="Input Text"),
        gr.Label(label="Generated Text"),
        gr.Label(label="Perplexity Scores"),
        gr.Label(label="AI Percentages"),
        gr.Label(label="Burstiness Score"),
        gr.Label(label="Average Perplexity Score"),
    ],
    title="AI Content Detection and Generation",
    description="This app detects and generates AI-generated content in input text using GPT-2 and GPT-Neo models.",
    theme="default",
    # The stylesheet parameter of gr.Interface is `css`.
    css=css_style,
)


iface.launch()