{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "0", "metadata": { "id": "a3074189-9ff0-41da-a99b-d42d2172a914" }, "outputs": [], "source": [ "#Installing dependent libraries\n", "%pip install pandas matplotlib\n", "%pip install imblearn\n", "%pip install nltk\n", "%pip install textstat " ] }, { "cell_type": "code", "execution_count": null, "id": "1", "metadata": {}, "outputs": [], "source": [ "#Connecting With Wandb(optional)\n", "%pip install wandb\n", "import wandb\n", "wandb.login()" ] }, { "cell_type": "code", "execution_count": null, "id": "2", "metadata": {}, "outputs": [], "source": [ "#Importing all the libraries\n", "import pandas as pd\n", "import matplotlib.pyplot as plt\n", "from imblearn.under_sampling import RandomUnderSampler\n", "import numpy as np\n", "import random\n", "from collections import Counter\n", "import nltk\n", "from nltk.corpus import stopwords\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics.pairwise import cosine_similarity\n", "from textstat import flesch_reading_ease\n", "import textstat\n", "import joblib\n", "from scipy.sparse import hstack\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.utils import shuffle\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from multiprocessing import cpu_count\n", "import time\n", "import gc\n" ] }, { "cell_type": "code", "execution_count": null, "id": "3", "metadata": { "id": "b2160971-e7b8-4bc0-812c-769dbaf2945e" }, "outputs": [], "source": [ "#Basic dataset handling and new file creation\n", "df = pd.read_csv(\"Datasets/AI_Human.csv\", engine='python', encoding='utf-8',on_bad_lines='skip')\n", "\n", "df.dropna(inplace=True)\n", "df = df[df[\"text\"].str.strip() != \"\"]\n", "df.drop_duplicates(inplace=True)\n", "df[\"text\"] = df[\"text\"].str.lower().str.strip()\n", "\n", "df.to_csv(\"Datasets/cleaned_dataset.csv\", index=False)\n", "\n", "del df" ] }, { "cell_type": "code", 
"execution_count": null, "id": "4", "metadata": { "id": "2b062d3a-e196-40c0-af09-26c5e3f6b2a3" }, "outputs": [], "source": [ "#Checking class distribution\n", "df = pd.read_csv(\"Datasets/cleaned_dataset.csv\",dtype={'generated': 'float'}, low_memory=False)\n", "gc.collect()\n", "print(df[\"generated\"].value_counts())\n", "\n", "# Plot distribution\n", "df[\"generated\"].value_counts().plot(kind=\"bar\", color=[\"blue\", \"red\"])\n", "plt.title(\"Distribution of AI vs. Human Texts\")\n", "plt.xlabel(\"Label (0=Human, 1=AI)\")\n", "plt.ylabel(\"Count\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "5", "metadata": { "id": "2205b524-66b4-4d64-8b87-ec892f260590" }, "outputs": [], "source": [ "#Balancing dataset for equal class distribution\n", "\n", "rus = RandomUnderSampler(random_state=42)\n", "X_resampled, y_resampled = rus.fit_resample(df[[\"text\"]], df[\"generated\"])\n", "\n", "df_resampled = pd.DataFrame(X_resampled, columns=[\"text\"])\n", "df_resampled[\"generated\"] = y_resampled\n", "\n", "print(df_resampled[\"generated\"].value_counts())" ] }, { "cell_type": "code", "execution_count": null, "id": "6", "metadata": { "id": "a3a94a8f-c082-4c34-aae1-8d6310b6ac35" }, "outputs": [], "source": [ "#check for sentence length size\n", "df[\"text_length\"] = df[\"text\"].apply(len)\n", "\n", "# Plot text length distribution\n", "df.hist(column=\"text_length\", by=\"generated\", bins=50, figsize=(10, 5), color=[\"blue\"])\n", "plt.suptitle(\"Text Length Distribution for AI vs. 
Human\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "7", "metadata": { "id": "1aa4110a-79cc-4e5c-80f5-b8f6ee8b9fdf" }, "outputs": [], "source": [ "#Checking for Words Lenght Distribution\n", "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split())) # Count words\n", "\n", "# Plot histogram\n", "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n", "plt.xlabel(\"Words Length\")\n", "plt.ylabel(\"Frequency\")\n", "plt.title(\"Words Length Distribution\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "8", "metadata": { "id": "1cb5091b-8c4d-45ff-8323-8c5a8ec45001" }, "outputs": [], "source": [ "#Trimming Long Text Length for balancing both classes\n", "\n", "def smart_truncate(text, max_length=700):\n", " words = text.split()\n", " length = len(words)\n", "\n", " if length > max_length:\n", " decay_factor = np.exp(-0.002 * (length - max_length)) \n", " if random.random() > decay_factor:\n", " trunc_limit = random.randint(600, 700) \n", " return \" \".join(words[:trunc_limit])\n", "\n", " return text # Keep original if within limit\n", "\n", "df[\"text\"] = df[\"text\"].apply(smart_truncate)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "9", "metadata": { "id": "662656fd-2202-47f0-a8d0-e45c83471797" }, "outputs": [], "source": [ "#check text length after trimming\n", "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split())) # Count words\n", "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n", "plt.xlabel(\"Text Length (words)\")\n", "plt.ylabel(\"Frequency\")\n", "plt.title(\"Text Length Distribution\")\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "10", "metadata": { "id": "859bfa5d-1628-4c20-ad76-5cdc9d0c503f" }, "outputs": [], "source": [ "#check for data overlap\n", "nltk.download(\"stopwords\")\n", "\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "# Get the most common words in 
AI-generated vs. Human text\n",
    "# Count word by word instead of joining every text into one giant string first.\n",
    "ai_words = Counter(\n",
    "    word for text in df.loc[df[\"generated\"] == 1, \"text\"] for word in text.split()\n",
    ")\n",
    "human_words = Counter(\n",
    "    word for text in df.loc[df[\"generated\"] == 0, \"text\"] for word in text.split()\n",
    ")\n",
    "\n",
    "# Remove stopwords (result stays a Counter so most_common() is available)\n",
    "ai_words = Counter({word: count for word, count in ai_words.items() if word.lower() not in stop_words})\n",
    "human_words = Counter({word: count for word, count in human_words.items() if word.lower() not in stop_words})\n",
    "\n",
    "# Compare the top 20 words\n",
    "print(\"Top 20 AI-generated words:\", ai_words.most_common(20))\n",
    "print(\"Top 20 Human words:\", human_words.most_common(20))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "id": "4a7803ee-49bc-493f-aa88-b9d981161397"
   },
   "outputs": [],
   "source": [
    "#check for overlap percentage\n",
    "ai_top_words = {word for word, _ in ai_words.most_common(50)}\n",
    "human_top_words = {word for word, _ in human_words.most_common(50)}\n",
    "\n",
    "overlap = ai_top_words & human_top_words\n",
    "# Guard the division: an empty AI vocabulary would otherwise raise ZeroDivisionError.\n",
    "overlap_percentage = (len(overlap) / len(ai_top_words)) * 100 if ai_top_words else 0.0\n",
    "print(f\"Overlap Percentage: {overlap_percentage:.2f}%\")\n",
    "\n",
    "#checking graph distribution for overlap\n",
    "# The x-axis shows the AI top-20 words, so BOTH series must be looked up for those same\n",
    "# words. Taking the human top-20 counts here would pair each red bar with the wrong word.\n",
    "labels = [word for word, _ in ai_words.most_common(20)]\n",
    "ai_freqs = [ai_words[word] for word in labels]\n",
    "human_freqs = [human_words.get(word, 0) for word in labels]\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.bar(labels, ai_freqs, color='blue', alpha=0.6, label=\"AI-generated\")\n",
    "plt.bar(labels, human_freqs, color='red', alpha=0.6, label=\"Human-written\")\n",
    "plt.xticks(rotation=45)\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.title(\"Word Frequency Comparison: AI vs. 
Human\")\n", "plt.legend()\n", "plt.show()\n", "\n", "#check for ai specific bias\n", "for word in [\"electoral\", \"students\", \"college\", \"may\"]:\n", " ai_count = ai_words.get(word, 0)\n", " human_count = human_words.get(word, 0)\n", " print(f\"{word}: AI={ai_count}, Human={human_count}, Ratio={ai_count/human_count:.2f}\")\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "12", "metadata": { "id": "3e501a40-6373-492d-862e-d4037645164d" }, "outputs": [], "source": [ "#checking for lexical diversity\n", "def lexical_diversity(texts):\n", " total_words = sum(len(text.split()) for text in texts)\n", " unique_words = len(set(\" \".join(texts).split()))\n", " return unique_words / total_words\n", "\n", "ai_texts = df[df['generated'] == 1]['text'].tolist()\n", "human_texts = df[df['generated'] == 0]['text'].tolist()\n", "\n", "ai_diversity = lexical_diversity(ai_texts) # List of AI-generated texts\n", "human_diversity = lexical_diversity(human_texts) # List of human-written texts\n", "\n", "print(f\"Lexical Diversity - AI: {ai_diversity:.4f}, Human: {human_diversity:.4f}\")\n" ] }, { "cell_type": "code", "execution_count": null, "id": "13", "metadata": { "id": "20230773-aeca-4dad-a273-6418fd6a14d1" }, "outputs": [], "source": [ "#checking for context coherence\n", "\n", "ai_sample = ai_texts[:500]\n", "human_sample = human_texts[:500]\n", "\n", "\n", "texts = ai_sample + human_sample\n", "\n", "\n", "vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')\n", "tfidf_matrix = vectorizer.fit_transform(texts)\n", "\n", "\n", "ai_vectors = tfidf_matrix[:len(ai_sample)]\n", "human_vectors = tfidf_matrix[len(ai_sample):]\n", "\n", "ai_avg_vector = np.asarray(ai_vectors.mean(axis=0))\n", "human_avg_vector = np.asarray(human_vectors.mean(axis=0))\n", "\n", "# Compute similarity\n", "similarity_score = cosine_similarity(ai_avg_vector, human_avg_vector)[0][0]\n", "print(f\"Context Similarity (AI vs. 
Human): {similarity_score:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {
    "id": "5d5dbc50-1689-4755-a66a-413999158f6e"
   },
   "outputs": [],
   "source": [
    "#Readability Score (mean Flesch reading ease over each sample)\n",
    "\n",
    "ai_readability = sum(flesch_reading_ease(text) for text in ai_sample) / len(ai_sample)\n",
    "human_readability = sum(flesch_reading_ease(text) for text in human_sample) / len(human_sample)\n",
    "\n",
    "print(f\"AI Readability Score: {ai_readability:.2f}\")\n",
    "print(f\"Human Readability Score: {human_readability:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "nltk.download('punkt_tab')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = df.sample(frac=1, random_state=42).reset_index(drop=True) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Split into Train (90%) and Test (10%) to use more data for training\n",
    "train_size = int(0.9 * len(df))\n",
    "# Derive the test size from the split itself: int(0.1 * len(df)) can differ from the\n",
    "# real number of test rows because both products are truncated independently.\n",
    "test_size = len(df) - train_size\n",
    "# Take independent copies so adding feature columns later does not write into a\n",
    "# slice of `df` (which triggers pandas' SettingWithCopyWarning).\n",
    "df_train = df.iloc[:train_size].copy()\n",
    "df_test = df.iloc[train_size:].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Initializing W&B (optional)\n",
    "wandb.init(\n",
    "    project=\"ai-text-detector\",\n",
    "    name=\"full_training\",\n",
    "    config={\"train_size\": train_size, \"test_size\": test_size}\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Defining feature extraction functions (optimized)\n",
    "def calculate_readability(text):\n",
    "    \"\"\"Return textstat's Flesch reading-ease score for ``text``.\"\"\"\n",
    "    return textstat.flesch_reading_ease(text)\n",
    "\n",
    "def lexical_diversity(text):\n",
    "    \"\"\"Return unique tokens / total tokens for one text (0 when it has no tokens).\"\"\"\n",
    "    words = nltk.word_tokenize(text)\n",
    "    return len(set(words)) / len(words) if len(words) > 0 else 0\n",
    "\n",
    "def sentence_length(text):\n",
    "    \"\"\"Return the mean number of tokens per sentence (0 when there are no sentences).\"\"\"\n",
    "    sentences = nltk.sent_tokenize(text)\n",
    "    return sum(len(nltk.word_tokenize(sent)) for sent in 
sentences) / len(sentences) if len(sentences) > 0 else 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply feature extraction\n",
    "print(\"Extracting features... (This may take some time)\")\n",
    "\n",
    "# Column name -> function computing that column from a single text\n",
    "feature_extractors = {\n",
    "    'readability': calculate_readability,\n",
    "    'lexical_diversity': lexical_diversity,\n",
    "    'sentence_length': sentence_length,\n",
    "}\n",
    "\n",
    "def add_text_features(frame):\n",
    "    \"\"\"Return a new frame with one extra column per entry in ``feature_extractors``.\n",
    "\n",
    "    Uses ``assign`` rather than column assignment, so the input frame is not mutated\n",
    "    and no SettingWithCopyWarning is raised when ``frame`` is a slice of another frame.\n",
    "    \"\"\"\n",
    "    return frame.assign(\n",
    "        **{name: frame['text'].apply(extract) for name, extract in feature_extractors.items()}\n",
    "    )\n",
    "\n",
    "df_train = add_text_features(df_train)\n",
    "df_test = add_text_features(df_test)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Initialize TF-IDF Vectorizer\n",
    "# TfidfVectorizer has no n_jobs parameter; passing one raises TypeError.\n",
    "vectorizer = TfidfVectorizer(max_features=5000)\n",
    "# Fit the vocabulary on the training texts only, then reuse it for the test texts\n",
    "X_train_tfidf = vectorizer.fit_transform(df_train['text'])\n",
    "X_test_tfidf = vectorizer.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack the TF-IDF matrix with the three hand-crafted feature columns.\n",
    "# hstack returns COO format; convert to CSR, which supports row slicing and is the\n",
    "# sparse layout scikit-learn estimators work with.\n",
    "extra_feature_columns = ['readability', 'lexical_diversity', 'sentence_length']\n",
    "X_train = hstack((X_train_tfidf, df_train[extra_feature_columns].values)).tocsr()\n",
    "X_test = hstack((X_test_tfidf, df_test[extra_feature_columns].values)).tocsr()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Defining Train Test Dataset\n",
    "y_train = df_train['generated']\n",
    "y_test = df_test['generated']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize Model (logistic regression trained with SGD).\n",
    "# random_state is fixed so repeated runs produce the same model.\n",
    "model = SGDClassifier(loss='log_loss', max_iter=1000, n_jobs=-1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
"# Training the Model\n", "start_time = time.time()\n", "print(\"\\nšŸš€ Training Model...\")\n", "\n", "model.fit(X_train, y_train)\n", "\n", "training_time = time.time() - start_time" ] }, { "cell_type": "code", "execution_count": null, "id": "26", "metadata": {}, "outputs": [], "source": [ "# Evaluate Model\n", "y_pred = model.predict(X_test)\n", "accuracy = accuracy_score(y_test, y_pred)\n", "print(f\"\\nāœ… Training Completed in {training_time:.2f} sec - Accuracy: {accuracy:.4f}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "27", "metadata": {}, "outputs": [], "source": [ "# Log Metrics to W&B(Optional)\n", "wandb.log({\n", " \"training_time\": training_time,\n", " \"accuracy\": accuracy,\n", " \"class_0_train\": (y_train == 0).sum(),\n", " \"class_1_train\": (y_train == 1).sum(),\n", " \"class_0_test\": (y_test == 0).sum(),\n", " \"class_1_test\": (y_test == 1).sum(),\n", "})\n", "wandb.finish()" ] }, { "cell_type": "code", "execution_count": null, "id": "28", "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "6217f203-31b6-45c6-b829-c04fa4696fe8", "outputId": "59dda091-1380-4ab6-d910-8d22f8152e57" }, "outputs": [], "source": [ "\n", "# Save Model\n", "joblib.dump(model, 'ai_detector_model.pkl')\n", "joblib.dump(vectorizer, 'vectorizer.pkl')\n", "\n", "print(\"\\nšŸŽ‰ Model training completed and saved!\")" ] } ], "metadata": { "accelerator": "TPU", "colab": { "gpuType": "V28", "provenance": [] }, "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 5 }