import gradio as gr
import spacy
import re
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Validation thresholds
MAX_TEXT_LENGTH = 2048
MIN_GEN_LENGTH = 1
MAX_GEN_LENGTH = 2048
MIN_AI_PERCENTAGE = 50
# Sanity bounds on the perplexity of the generated continuations
MIN_PERPLEXITY = 1
MAX_PERPLEXITY = 2048

# Load the spaCy English pipeline, downloading it only if it is missing
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")


def detect_ai_content(text, max_gen_length, temperature, model_name, model_size,
                      num_return_sequences, min_ai_percentage):
    # Normalize whitespace first, but keep punctuation until after sentence
    # segmentation: spaCy relies on it to find sentence boundaries.
    cleaned_text = re.sub(r'\s+', ' ', text).strip()

    doc = nlp(cleaned_text)
    if not re.search(r'\S', cleaned_text) or len(list(doc.sents)) < 2:
        return {"error": "Input text must contain at least two sentences."}

    # Strip punctuation only once the sentence check has passed
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)

    if len(cleaned_text) > MAX_TEXT_LENGTH:
        return {"error": f"Input text must be no longer than {MAX_TEXT_LENGTH} characters."}

    if not (0 <= min_ai_percentage <= 100):
        return {"error": "Minimum threshold for AI percentage must be between 0 and 100."}

    # Resolve the checkpoint name, e.g. "gpt2" + "medium" -> "gpt2-medium"
    model_name = f"{model_name}-{model_size}"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
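    # Note: from_pretrained caches downloaded weights locally (by default under
    # ~/.cache/huggingface), so the download cost is paid once per checkpoint.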

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # GPT-2 checkpoints define no pad token; reuse EOS so generate() can pad
    eos_token_id = tokenizer.eos_token_id

    input_ids = tokenizer.encode(cleaned_text, add_special_tokens=True, return_tensors='pt').to(device)

    # Sample num_return_sequences continuations of the input text
    output_sequences = []
    for _ in range(num_return_sequences):
        output_sequence = model.generate(
            input_ids=input_ids,
            max_length=max_gen_length + len(input_ids[0]),
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            repetition_penalty=1.5,
            do_sample=True,
            num_return_sequences=1,
            pad_token_id=eos_token_id
        )[0]
        output_sequences.append(output_sequence)
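    # The loop above could equivalently be a single generate() call with
    # num_return_sequences=num_return_sequences, iterating over the returned
    # batch of sequences.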

    generated_texts = []
    perplexities = []
    ai_percentages = []

    for output_sequence in output_sequences:
        # Decode only the continuation, dropping the prompt tokens
        generated_text = tokenizer.decode(output_sequence.tolist()[len(input_ids[0]):], skip_special_tokens=True)
        if not generated_text.strip():
            # An empty continuation would crash the loss computation below
            continue

        # Length of the continuation relative to the input, as a percentage
        ai_percentage = round(len(generated_text) / len(cleaned_text) * 100, 2)
        ai_percentages.append(ai_percentage)

        # Perplexity of the continuation under the same model
        generated_input_ids = tokenizer.encode(generated_text, add_special_tokens=False, return_tensors='pt').to(device)
        with torch.no_grad():
            loss = model(generated_input_ids, labels=generated_input_ids).loss.item()
        perplexity = np.exp(loss)
        perplexities.append(perplexity)
        generated_texts.append(generated_text)
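    # Perplexity is exp(mean token cross-entropy); lower values mean the model
    # finds the text more predictable, a common heuristic signal in AI-text
    # detection.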

    # Keep only continuations that differ from the input and from each other.
    # Note: en_core_web_sm ships without static word vectors, so Doc.similarity
    # falls back to a coarse approximation and spaCy emits a warning; an
    # en_core_web_md or en_core_web_lg model would give more meaningful scores.
    clean_doc = nlp(cleaned_text)
    unique_generated_texts = []

    for generated_text in generated_texts:
        gen_doc = nlp(generated_text)
        similarity = clean_doc.similarity(gen_doc)

        if similarity < 0.8:
            is_unique = True
            for unique_text in unique_generated_texts:
                unique_doc = nlp(unique_text)
                if unique_doc.similarity(gen_doc) >= 0.8:
                    is_unique = False
                    break
            if is_unique:
                unique_generated_texts.append(generated_text)

    # Burstiness: squared type/token ratio of the input, with punctuation and
    # stop words excluded
    doc = nlp(cleaned_text)
    all_tokens = []
    for sent in doc.sents:
        all_tokens += [token.text.lower() for token in sent if not token.is_punct and not token.is_stop]

    num_unique_tokens = len(set(all_tokens))
    total_tokens = len(all_tokens)
    burstiness_score = (num_unique_tokens / total_tokens) ** 2 if total_tokens else 0.0
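    # Worked example: 8 distinct tokens out of 10 total gives (8/10)**2 = 0.64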

    avg_perplexity = np.mean(perplexities)
    avg_ai_percentage = round(np.mean(ai_percentages), 2)

    if avg_ai_percentage < min_ai_percentage:
        return {"error": f"The generated text has an AI percentage of {avg_ai_percentage}%, which is below the minimum threshold of {min_ai_percentage}%."}
    if avg_perplexity < MIN_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is below the minimum threshold of {MIN_PERPLEXITY}."}
    if avg_perplexity > MAX_PERPLEXITY:
        return {"error": f"The generated text has a perplexity score of {avg_perplexity}, which is above the maximum threshold of {MAX_PERPLEXITY}."}

    return {
        "generated_texts": unique_generated_texts,
        "burstiness_score": round(burstiness_score, 2),
        "avg_ai_percentage": avg_ai_percentage,
        "avg_perplexity": round(avg_perplexity, 2)
    }


def bai_chat(text, max_gen_length=256, temperature=0.7, model_name="gpt2",
             model_size="medium", num_return_sequences=5,
             min_ai_percentage=MIN_AI_PERCENTAGE):
    result = detect_ai_content(text, max_gen_length, temperature, model_name,
                               model_size, num_return_sequences, min_ai_percentage)

    if "error" in result:
        return result["error"]

    generated_texts = "\n\n".join(result["generated_texts"])
    return (f"Burstiness Score: {result['burstiness_score']}\n"
            f"Average AI Percentage: {result['avg_ai_percentage']}%\n"
            f"Average Perplexity Score: {result['avg_perplexity']}\n\n"
            f"{generated_texts}")

# The gr.inputs / gr.outputs namespaces are deprecated; use the top-level
# components instead, with the prompt string as a placeholder rather than a
# positional argument.
gr.Interface(
    fn=bai_chat,
    inputs=gr.Textbox(placeholder="Enter your text here...", lines=8),
    outputs=gr.Textbox(label="Generated Texts")
).launch()
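# Usage sketch (assuming this file is saved as app.py, a hypothetical name):
# run `python app.py` and open the local URL Gradio prints
# (http://127.0.0.1:7860 by default); pass share=True to launch() for a
# temporary public link.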