In [None]:
#Installing dependent libraries
%pip install pandas matplotlib
%pip install imblearn
%pip install nltk
%pip install textstat 

In [None]:
#Connecting With Wandb(optional)
# Installs the Weights & Biases client into the kernel environment, imports it
# and starts the login flow.
# NOTE(review): the cells that call wandb.init / wandb.log / wandb.finish are
# not guarded, so skipping this cell leaves `wandb` undefined for them.
%pip install wandb
import wandb
wandb.login()

In [None]:
#Importing all the libraries
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler
import numpy as np
import random
from collections import Counter
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textstat import flesch_reading_ease
import textstat
import joblib
from scipy.sparse import hstack
from sklearn.linear_model import SGDClassifier
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score, classification_report
from multiprocessing import cpu_count
import time
import gc


In [None]:
#Basic dataset handling and new file creation
# Load the raw corpus. on_bad_lines='skip' drops malformed rows instead of
# aborting the read (this option needs the python engine for full support).
df = pd.read_csv("Datasets/AI_Human.csv", engine='python', encoding='utf-8', on_bad_lines='skip')

# Clean without in-place mutation so each step yields a fresh frame:
#   1. drop rows with any missing value
#   2. normalise the text (lower-case, trim surrounding whitespace)
#   3. drop rows whose text is empty after trimming
#   4. drop duplicates
# De-duplication runs AFTER normalisation; previously it ran before, so rows
# that differed only by case or surrounding whitespace became identical after
# step 2 yet were both written to the cleaned file.
df = df.dropna()
df = df.assign(text=df["text"].str.lower().str.strip())
df = df[df["text"] != ""]
df = df.drop_duplicates()

df.to_csv("Datasets/cleaned_dataset.csv", index=False)

# Release the frame; the next cell reloads the cleaned file from disk.
del df

In [None]:
#Checking class distribution
# Reload the cleaned file, reading the label column as float.
df = pd.read_csv("Datasets/cleaned_dataset.csv", dtype={'generated': 'float'}, low_memory=False)
gc.collect()

# Count rows per label once and reuse the result for both the printout and the chart.
class_counts = df["generated"].value_counts()
print(class_counts)

# Plot distribution
ax = class_counts.plot(kind="bar", color=["blue", "red"])
ax.set_title("Distribution of AI vs. Human Texts")
ax.set_xlabel("Label (0=Human, 1=AI)")
ax.set_ylabel("Count")
plt.show()

In [None]:
#Balancing dataset for equal class distribution

# Randomly discard rows of the larger class until both labels have the same
# number of rows. The seed fixes which rows are kept.
rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(df[["text"]], df["generated"])

df_resampled = pd.DataFrame(X_resampled, columns=["text"])
df_resampled["generated"] = y_resampled
df_resampled = df_resampled.reset_index(drop=True)

print(df_resampled["generated"].value_counts())

# Every later cell (analysis, split, training) reads `df`. Previously the
# balanced frame was built and then never used, so the model was still trained
# on the unbalanced data. Rebinding `df` makes the balancing take effect.
df = df_resampled

In [None]:
#check for text length (in characters)
# Store the character count of every document in a new column.
df["text_length"] = df["text"].map(len)

# Plot text length distribution: one histogram panel per label value.
length_hist_options = dict(bins=50, figsize=(10, 5), color=["blue"])
df.hist(column="text_length", by="generated", **length_hist_options)
plt.suptitle("Text Length Distribution for AI vs. Human")
plt.show()

In [None]:
#Checking for Words Length Distribution
# Number of whitespace-separated tokens in each document.
df["words_length"] = df["text"].map(lambda document: len(document.split()))

# Plot histogram (both classes pooled together)
plt.hist(df["words_length"], bins=50, color="blue", alpha=0.7)
plt.title("Words Length Distribution")
plt.xlabel("Words Length")
plt.ylabel("Frequency")
plt.show()

In [None]:
#Trimming Long Text Length for balancing both classes

def smart_truncate(text, max_length=700, trunc_window=100, decay_rate=0.002):
    """Probabilistically shorten a text that has more than `max_length` words.

    Texts with at most `max_length` whitespace-separated words are returned
    unchanged. For longer texts, the chance of being left untouched is
    ``exp(-decay_rate * excess_words)``, so the further a text exceeds the
    limit, the more likely it is to be cut.

    When a cut happens, the text is reduced to a random number of leading words
    drawn uniformly from ``[max_length - trunc_window, max_length]`` (lower
    bound clamped to 1) and re-joined with single spaces.

    Args:
        text: The document to process.
        max_length: Word count above which truncation may occur.
        trunc_window: Width of the range below `max_length` from which the
            cut length is drawn. With the defaults the range is 600-700,
            which is what used to be hard-coded.
        decay_rate: Exponential rate controlling how quickly the
            keep-probability falls as the text gets longer.

    Returns:
        The original `text`, or its first N words joined by single spaces.

    Randomness comes from the module-level `random` generator; call
    `random.seed(...)` beforehand for reproducible results.
    """
    words = text.split()
    excess = len(words) - max_length

    if excess <= 0:
        return text  # Keep original if within limit

    keep_probability = np.exp(-decay_rate * excess)
    if random.random() > keep_probability:
        # Derive the window from max_length. The old fixed 600-700 range
        # produced results longer than the limit whenever max_length < 600.
        lower_bound = max(1, max_length - trunc_window)
        trunc_limit = random.randint(lower_bound, max_length)
        return " ".join(words[:trunc_limit])

    return text

# smart_truncate draws from the stdlib `random` generator. Seed it first so a
# Restart-and-Run-All trims the same documents at the same lengths every time.
random.seed(42)
df["text"] = df["text"].apply(smart_truncate)


In [None]:
#check text length after trimming
# Recount the words per document now that long texts may have been shortened.
df["words_length"] = df["text"].map(lambda document: len(document.split()))

trimmed_hist_style = {"bins": 50, "color": "blue", "alpha": 0.7}
plt.hist(df["words_length"], **trimmed_hist_style)
plt.title("Text Length Distribution")
plt.xlabel("Text Length (words)")
plt.ylabel("Frequency")
plt.show()

In [None]:
#check for data overlap
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))


def _content_word_counts(texts):
    """Count whitespace-separated tokens across `texts`, ignoring stop words.

    Documents are tokenised one at a time, so the corpus is never joined into a
    single giant string or token list. Counts and first-seen ordering (which
    decides ties in `most_common`) match counting the joined corpus and then
    filtering out stop words.
    """
    counts = Counter()
    for text in texts:
        counts.update(token for token in text.split() if token.lower() not in stop_words)
    return counts


# Get the most common (non stop-word) words in AI-generated vs. Human text
ai_words = _content_word_counts(df.loc[df["generated"] == 1, "text"])
human_words = _content_word_counts(df.loc[df["generated"] == 0, "text"])

# Compare the top 20 words
print("Top 20 AI-generated words:", ai_words.most_common(20))
print("Top 20 Human words:", human_words.most_common(20))


In [None]:
#check for overlap percentage
# Share of the AI class's 50 most frequent words that are also among the
# human class's 50 most frequent words.
ai_top_words = {word for word, _ in ai_words.most_common(50)}
human_top_words = {word for word, _ in human_words.most_common(50)}

overlap = ai_top_words & human_top_words
# Guard against an empty vocabulary (e.g. one class has no rows).
overlap_percentage = (len(overlap) / len(ai_top_words)) * 100 if ai_top_words else 0.0
print(f"Overlap Percentage: {overlap_percentage:.2f}%")

#checking graph distribution for overlap
# The x axis shows the AI class's top-20 words, so BOTH series must be looked
# up for those same words. Previously the human bars used the human class's
# own top-20 counts, which belong to different words than the labels.
labels = [word for word, _ in ai_words.most_common(20)]
ai_freqs = [ai_words.get(word, 0) for word in labels]
human_freqs = [human_words.get(word, 0) for word in labels]

plt.figure(figsize=(12, 6))
plt.bar(labels, ai_freqs, color='blue', alpha=0.6, label="AI-generated")
plt.bar(labels, human_freqs, color='red', alpha=0.6, label="Human-written")
plt.xticks(rotation=45)
plt.ylabel("Frequency")
plt.title("Word Frequency Comparison: AI vs. Human")
plt.legend()
plt.show()

#check for ai specific bias
for word in ["electoral", "students", "college", "may"]:
    ai_count = ai_words.get(word, 0)
    human_count = human_words.get(word, 0)
    # A word missing from the human texts would otherwise raise ZeroDivisionError.
    ratio = f"{ai_count / human_count:.2f}" if human_count else "n/a"
    print(f"{word}: AI={ai_count}, Human={human_count}, Ratio={ratio}")



In [None]:
#checking for lexical diversity
def lexical_diversity(texts):
    """Return the type/token ratio of a whole collection of texts.

    The ratio is (number of distinct whitespace-separated tokens across all
    texts) / (total number of tokens across all texts).

    Args:
        texts: Iterable of strings. It is consumed in a single pass, so
            generators work as well as lists.

    Returns:
        The ratio as a float, or 0.0 when the collection has no tokens (empty
        input or only blank strings) instead of raising ZeroDivisionError.

    Note: a later cell in this notebook defines another function with the same
    name that works on a single text; after that cell runs, this one is
    shadowed.
    """
    total_words = 0
    vocabulary = set()
    for text in texts:
        tokens = text.split()
        total_words += len(tokens)
        vocabulary.update(tokens)

    if total_words == 0:
        return 0.0
    return len(vocabulary) / total_words

# Split the corpus by label into plain Python lists of documents.
is_ai_row = df['generated'] == 1
is_human_row = df['generated'] == 0
ai_texts = df.loc[is_ai_row, 'text'].tolist()
human_texts = df.loc[is_human_row, 'text'].tolist()

# Corpus-level type/token ratio for each class.
ai_diversity = lexical_diversity(ai_texts)
human_diversity = lexical_diversity(human_texts)

print(f"Lexical Diversity - AI: {ai_diversity:.4f}, Human: {human_diversity:.4f}")


In [None]:
#checking for context coherence

# Work on the first 500 documents of each class (a prefix, not a random sample).
ai_sample = ai_texts[:500]
human_sample = human_texts[:500]
texts = ai_sample + human_sample

# Fit one shared TF-IDF vocabulary over both samples so the vectors are comparable.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(texts)

# Rows keep the order of `texts`: AI documents first, human documents after.
ai_row_count = len(ai_sample)
ai_vectors = tfidf_matrix[:ai_row_count]
human_vectors = tfidf_matrix[ai_row_count:]

# Column-wise mean gives one 1 x n_features centroid per class.
ai_avg_vector = np.asarray(ai_vectors.mean(axis=0))
human_avg_vector = np.asarray(human_vectors.mean(axis=0))

# Compute similarity between the two class centroids
similarity_matrix = cosine_similarity(ai_avg_vector, human_avg_vector)
similarity_score = similarity_matrix[0][0]
print(f"Context Similarity (AI vs. Human): {similarity_score:.4f}")


In [None]:
#Readability Score

def _mean_reading_ease(documents):
    """Average flesch_reading_ease over `documents` (plain sum divided by count)."""
    total = sum(flesch_reading_ease(document) for document in documents)
    return total / len(documents)


ai_readability = _mean_reading_ease(ai_sample)
human_readability = _mean_reading_ease(human_sample)

print(f"AI Readability Score: {ai_readability:.2f}")
print(f"Human Readability Score: {human_readability:.2f}")

In [None]:
# Download NLTK's 'punkt_tab' resource.
# NOTE(review): presumably required by the nltk.word_tokenize / nltk.sent_tokenize
# calls in the feature-extraction functions below - confirm.
nltk.download('punkt_tab')

In [None]:
# Shuffle every row with a fixed seed, then renumber the index 0..n-1 so the
# positional train/test split below operates on the shuffled order.
shuffled_rows = df.sample(frac=1, random_state=42)
df = shuffled_rows.reset_index(drop=True)

In [None]:
#Split into Train (90%) and Test (10%) to use more data for training
train_size = int(0.9 * len(df))
# The test split is "every row after the training rows". Deriving its size
# from that keeps train_size + test_size == len(df); computing int(0.1 * len)
# separately could be off by one from the real length of df_test.
test_size = len(df) - train_size

# .copy() makes both splits independent frames, so the feature columns added
# later are not written onto slices of `df` (no SettingWithCopyWarning).
df_train = df.iloc[:train_size].copy()
df_test = df.iloc[train_size:].copy()

In [None]:
#Initializing W&B (optional)
# Record the split sizes as the run's configuration.
run_config = {"train_size": train_size, "test_size": test_size}
wandb.init(project="ai-text-detector", name="full_training", config=run_config)

In [None]:
# Defining feature extraction functions (optimized)
def calculate_readability(text):
    """Return textstat's Flesch reading-ease score for `text`."""
    score = textstat.flesch_reading_ease(text)
    return score

def lexical_diversity(text):
    """Return distinct tokens / total tokens for one text, or 0 if it has no tokens.

    Tokens come from nltk.word_tokenize. This definition replaces the earlier
    corpus-level function of the same name once this cell has run.
    """
    tokens = nltk.word_tokenize(text)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def sentence_length(text):
    """Return the mean number of word tokens per sentence, or 0 if no sentences are found.

    Sentences come from nltk.sent_tokenize; each one is then tokenised with
    nltk.word_tokenize.
    """
    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return 0

    token_total = 0
    for sentence in sentences:
        token_total += len(nltk.word_tokenize(sentence))
    return token_total / len(sentences)

In [None]:
# Apply feature extraction
print("Extracting features... (This may take some time)")

# Column name -> per-document function producing that column's value.
feature_extractors = {
    'readability': calculate_readability,
    'lexical_diversity': lexical_diversity,
    'sentence_length': sentence_length,
}


def _with_text_features(frame):
    """Return a new frame equal to `frame` plus one column per feature extractor.

    Using `assign` builds a fresh DataFrame instead of writing columns onto the
    input, so it is safe (and warning-free) even when `frame` is a slice of a
    larger frame.
    """
    new_columns = {
        column: frame['text'].apply(extractor)
        for column, extractor in feature_extractors.items()
    }
    return frame.assign(**new_columns)


# Same extraction for both splits - one code path instead of two copies.
df_train = _with_text_features(df_train)
df_test = _with_text_features(df_test)


In [None]:
#Initialize TF-IDF Vectorizer
# TfidfVectorizer does not accept an `n_jobs` argument; passing one makes the
# constructor raise TypeError, so it has been removed.
vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary and IDF weights from the training texts only, then
# reuse them unchanged for the test texts.
X_train_tfidf = vectorizer.fit_transform(df_train['text'])
X_test_tfidf = vectorizer.transform(df_test['text'])

In [None]:
# Stack Sparse Matrices for Final Features
# The three hand-crafted columns are appended to the right of the TF-IDF
# columns, in this order, for both splits.
handcrafted_columns = ['readability', 'lexical_diversity', 'sentence_length']
train_handcrafted = df_train[handcrafted_columns].values
test_handcrafted = df_test[handcrafted_columns].values

X_train = hstack((X_train_tfidf, train_handcrafted))
X_test = hstack((X_test_tfidf, test_handcrafted))


In [None]:
#Defining Train / Test target labels
# The 'generated' column of each split is the classification target.
y_train, y_test = df_train['generated'], df_test['generated']

In [None]:
# Initialize Model with Multi-core Processing
# Logistic-regression objective trained with SGD. random_state pins the data
# shuffling done during fitting so repeated runs give the same model; 42
# matches the seed used elsewhere in this notebook.
model = SGDClassifier(loss='log_loss', max_iter=1000, n_jobs=-1, random_state=42)

In [None]:
# Training the Model
# Wall-clock timing around the fit; the elapsed seconds are reported and
# logged by later cells.
start_time = time.time()
# The emoji in this message had been mis-encoded into unreadable characters.
print("\n🚀 Training Model...")

model.fit(X_train, y_train)

training_time = time.time() - start_time

In [None]:
# Evaluate Model
# Predict on the held-out split and report the fraction of correct labels.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
# The emoji in this message had been mis-encoded into unreadable characters.
print(f"\n✅ Training Completed in {training_time:.2f} sec - Accuracy: {accuracy:.4f}")

In [None]:
# Log Metrics to W&B(Optional)
# Headline numbers first, then the number of rows per label in each split.
run_metrics = {
    "training_time": training_time,
    "accuracy": accuracy,
}
run_metrics["class_0_train"] = (y_train == 0).sum()
run_metrics["class_1_train"] = (y_train == 1).sum()
run_metrics["class_0_test"] = (y_test == 0).sum()
run_metrics["class_1_test"] = (y_test == 1).sum()

wandb.log(run_metrics)
wandb.finish()

In [None]:

# Save Model
# Persist the classifier and the TF-IDF vectorizer. The model was fitted on
# the TF-IDF columns followed by readability, lexical_diversity and
# sentence_length, so anything loading these files must rebuild that same
# feature layout before calling predict.
joblib.dump(model, 'ai_detector_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# The emoji in this message had been mis-encoded into unreadable characters.
print("\n🎉 Model training completed and saved!")