{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0",
   "metadata": {
    "id": "a3074189-9ff0-41da-a99b-d42d2172a914"
   },
   "outputs": [],
   "source": [
    "# Install the third-party packages this notebook depends on.\n",
    "# One %pip call lets pip resolve all of the requirements together.\n",
    "%pip install pandas matplotlib imblearn nltk textstat"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Optional: install Weights & Biases and authenticate this session\n",
    "%pip install wandb\n",
    "\n",
    "import wandb\n",
    "\n",
    "wandb.login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Importing all the libraries\n",
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt\n",
    "from imblearn.under_sampling import RandomUnderSampler\n",
    "import numpy as np\n",
    "import random\n",
    "from collections import Counter\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity\n",
    "from textstat import flesch_reading_ease\n",
    "import textstat\n",
    "import joblib\n",
    "from scipy.sparse import hstack\n",
    "from sklearn.linear_model import SGDClassifier\n",
    "from sklearn.utils import shuffle\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from multiprocessing import cpu_count\n",
    "import time\n",
    "import gc\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3",
   "metadata": {
    "id": "b2160971-e7b8-4bc0-812c-769dbaf2945e"
   },
   "outputs": [],
   "source": [
    "#Basic dataset handling and new file creation\n",
    "df = pd.read_csv(\"Datasets/AI_Human.csv\", engine='python', encoding='utf-8',on_bad_lines='skip')\n",
    "\n",
    "# Drop incomplete rows, then normalise the text BEFORE de-duplicating so that\n",
    "# rows differing only in letter case or surrounding whitespace count as duplicates.\n",
    "# Each step returns a new frame, so no filtered view is mutated in place.\n",
    "df = df.dropna()\n",
    "df = df.assign(text=df[\"text\"].str.lower().str.strip())\n",
    "df = df[df[\"text\"] != \"\"]\n",
    "df = df.drop_duplicates()\n",
    "\n",
    "df.to_csv(\"Datasets/cleaned_dataset.csv\", index=False)\n",
    "\n",
    "# Release the frame; the next cell reloads the cleaned file from disk\n",
    "del df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4",
   "metadata": {
    "id": "2b062d3a-e196-40c0-af09-26c5e3f6b2a3"
   },
   "outputs": [],
   "source": [
    "# Reload the cleaned data and inspect how many rows each class has\n",
    "df = pd.read_csv(\n",
    "    \"Datasets/cleaned_dataset.csv\",\n",
    "    dtype={'generated': 'float'},\n",
    "    low_memory=False,\n",
    ")\n",
    "gc.collect()\n",
    "\n",
    "class_counts = df[\"generated\"].value_counts()\n",
    "print(class_counts)\n",
    "\n",
    "# Bar chart of the per-class row counts\n",
    "class_counts.plot(kind=\"bar\", color=[\"blue\", \"red\"])\n",
    "plt.title(\"Distribution of AI vs. Human Texts\")\n",
    "plt.xlabel(\"Label (0=Human, 1=AI)\")\n",
    "plt.ylabel(\"Count\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5",
   "metadata": {
    "id": "2205b524-66b4-4d64-8b87-ec892f260590"
   },
   "outputs": [],
   "source": [
    "# Build a class-balanced copy of the data by undersampling\n",
    "# NOTE(review): the later cells in this notebook keep working on `df`, not on\n",
    "# `df_resampled` - confirm whether the balanced frame was meant to feed training.\n",
    "rus = RandomUnderSampler(random_state=42)\n",
    "X_resampled, y_resampled = rus.fit_resample(df[[\"text\"]], df[\"generated\"])\n",
    "\n",
    "df_resampled = pd.DataFrame(X_resampled, columns=[\"text\"]).assign(generated=y_resampled)\n",
    "\n",
    "print(df_resampled[\"generated\"].value_counts())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6",
   "metadata": {
    "id": "a3a94a8f-c082-4c34-aae1-8d6310b6ac35"
   },
   "outputs": [],
   "source": [
    "# Character length of every text\n",
    "df[\"text_length\"] = df[\"text\"].map(len)\n",
    "\n",
    "# One histogram panel per value of the `generated` label\n",
    "df.hist(\n",
    "    column=\"text_length\",\n",
    "    by=\"generated\",\n",
    "    bins=50,\n",
    "    figsize=(10, 5),\n",
    "    color=[\"blue\"],\n",
    ")\n",
    "plt.suptitle(\"Text Length Distribution for AI vs. Human\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7",
   "metadata": {
    "id": "1aa4110a-79cc-4e5c-80f5-b8f6ee8b9fdf"
   },
   "outputs": [],
   "source": [
    "# Checking the word-count distribution\n",
    "# Words are whitespace-separated tokens, one count per row\n",
    "df[\"words_length\"] = [len(text.split()) for text in df[\"text\"]]\n",
    "\n",
    "# Histogram of words per text\n",
    "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
    "plt.title(\"Words Length Distribution\")\n",
    "plt.xlabel(\"Words Length\")\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8",
   "metadata": {
    "id": "1cb5091b-8c4d-45ff-8323-8c5a8ec45001"
   },
   "outputs": [],
   "source": [
    "#Trimming Long Text Length for balancing both classes\n",
    "\n",
    "def smart_truncate(text, max_length=700):\n",
    "    \"\"\"Randomly shorten texts that are longer than ``max_length`` words.\n",
    "\n",
    "    A text within the limit is returned unchanged. A longer text is kept\n",
    "    as-is with probability exp(-0.002 * excess_words); otherwise it is cut\n",
    "    to a random number of leading words between ``max_length - 100`` and\n",
    "    ``max_length`` (600-700 for the default limit).\n",
    "\n",
    "    Args:\n",
    "        text: The text to inspect; words are whitespace-separated tokens.\n",
    "        max_length: Word count above which truncation may happen.\n",
    "\n",
    "    Returns:\n",
    "        The original string, or its leading words re-joined with single\n",
    "        spaces when the text was truncated.\n",
    "    \"\"\"\n",
    "    words = text.split()\n",
    "    excess = len(words) - max_length\n",
    "    if excess <= 0:\n",
    "        return text  # Keep original if within limit\n",
    "\n",
    "    # The further over the limit, the smaller the chance of keeping the text whole\n",
    "    keep_probability = np.exp(-0.002 * excess)\n",
    "    if random.random() > keep_probability:\n",
    "        # Derive the window from max_length so a custom limit is honoured\n",
    "        lower_bound = max(0, max_length - 100)\n",
    "        trunc_limit = random.randint(lower_bound, max_length)\n",
    "        return \" \".join(words[:trunc_limit])\n",
    "\n",
    "    return text\n",
    "\n",
    "# Seed the RNG so the same rows are truncated the same way on every run\n",
    "random.seed(42)\n",
    "df[\"text\"] = df[\"text\"].apply(smart_truncate)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9",
   "metadata": {
    "id": "662656fd-2202-47f0-a8d0-e45c83471797"
   },
   "outputs": [],
   "source": [
    "# Re-check the word-count distribution after trimming\n",
    "df[\"words_length\"] = [len(text.split()) for text in df[\"text\"]]\n",
    "\n",
    "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
    "plt.title(\"Text Length Distribution\")\n",
    "plt.xlabel(\"Text Length (words)\")\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "10",
   "metadata": {
    "id": "859bfa5d-1628-4c20-ad76-5cdc9d0c503f"
   },
   "outputs": [],
   "source": [
    "#check for data overlap\n",
    "nltk.download(\"stopwords\")\n",
    "\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "# Count every whitespace-separated token per class (generated: 1 = AI, 0 = human)\n",
    "ai_words = Counter(\n",
    "    token for doc in df.loc[df[\"generated\"] == 1, \"text\"] for token in doc.split()\n",
    ")\n",
    "human_words = Counter(\n",
    "    token for doc in df.loc[df[\"generated\"] == 0, \"text\"] for token in doc.split()\n",
    ")\n",
    "\n",
    "# Drop stopwords (matched case-insensitively) and keep the result as a Counter\n",
    "ai_words = Counter(\n",
    "    {word: count for word, count in ai_words.items() if word.lower() not in stop_words}\n",
    ")\n",
    "human_words = Counter(\n",
    "    {word: count for word, count in human_words.items() if word.lower() not in stop_words}\n",
    ")\n",
    "\n",
    "# Compare the top 20 words\n",
    "print(\"Top 20 AI-generated words:\", ai_words.most_common(20))\n",
    "print(\"Top 20 Human words:\", human_words.most_common(20))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11",
   "metadata": {
    "id": "4a7803ee-49bc-493f-aa88-b9d981161397"
   },
   "outputs": [],
   "source": [
    "#check for overlap percentage\n",
    "ai_top_words = {word for word, _ in ai_words.most_common(50)}\n",
    "human_top_words = {word for word, _ in human_words.most_common(50)}\n",
    "\n",
    "overlap = ai_top_words & human_top_words\n",
    "# Guard against an empty AI vocabulary so the percentage never divides by zero\n",
    "overlap_percentage = (len(overlap) / len(ai_top_words)) * 100 if ai_top_words else 0.0\n",
    "print(f\"Overlap Percentage: {overlap_percentage:.2f}%\")\n",
    "\n",
    "#checking graph distribution for overlap\n",
    "top_ai = ai_words.most_common(20)\n",
    "labels = [word for word, _ in top_ai]\n",
    "ai_freqs = [count for _, count in top_ai]\n",
    "# Look up the human count of the SAME words that label the x-axis; using the\n",
    "# human top-20 counts here would draw bars for different words under these labels.\n",
    "human_freqs = [human_words.get(word, 0) for word in labels]\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.bar(labels, ai_freqs, color='blue', alpha=0.6, label=\"AI-generated\")\n",
    "plt.bar(labels, human_freqs, color='red', alpha=0.6, label=\"Human-written\")\n",
    "plt.xticks(rotation=45)\n",
    "plt.ylabel(\"Frequency\")\n",
    "plt.title(\"Word Frequency Comparison: AI vs. Human\")\n",
    "plt.legend()\n",
    "plt.show()\n",
    "\n",
    "#check for ai specific bias\n",
    "for word in [\"electoral\", \"students\", \"college\", \"may\"]:\n",
    "    ai_count = ai_words.get(word, 0)\n",
    "    human_count = human_words.get(word, 0)\n",
    "    # A word absent from the human texts has no defined ratio\n",
    "    ratio = f\"{ai_count/human_count:.2f}\" if human_count else \"n/a\"\n",
    "    print(f\"{word}: AI={ai_count}, Human={human_count}, Ratio={ratio}\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "12",
   "metadata": {
    "id": "3e501a40-6373-492d-862e-d4037645164d"
   },
   "outputs": [],
   "source": [
    "#checking for lexical diversity\n",
    "def lexical_diversity(texts):\n",
    "    \"\"\"Return the corpus-level ratio of unique words to total words.\n",
    "\n",
    "    Args:\n",
    "        texts: Iterable of strings; words are whitespace-separated tokens.\n",
    "\n",
    "    Returns:\n",
    "        unique_words / total_words over all texts, or 0.0 when the texts\n",
    "        contain no words at all.\n",
    "\n",
    "    NOTE: the feature-extraction cell later in this notebook rebinds the\n",
    "    name ``lexical_diversity`` to a per-text version.\n",
    "    \"\"\"\n",
    "    total_words = 0\n",
    "    vocabulary = set()\n",
    "    # Accumulate per text instead of joining the whole corpus into one string\n",
    "    for text in texts:\n",
    "        words = text.split()\n",
    "        total_words += len(words)\n",
    "        vocabulary.update(words)\n",
    "    return len(vocabulary) / total_words if total_words else 0.0\n",
    "\n",
    "ai_texts = df[df['generated'] == 1]['text'].tolist()\n",
    "human_texts = df[df['generated'] == 0]['text'].tolist()\n",
    "\n",
    "ai_diversity = lexical_diversity(ai_texts)  # List of AI-generated texts\n",
    "human_diversity = lexical_diversity(human_texts)  # List of human-written texts\n",
    "\n",
    "print(f\"Lexical Diversity - AI: {ai_diversity:.4f}, Human: {human_diversity:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13",
   "metadata": {
    "id": "20230773-aeca-4dad-a273-6418fd6a14d1"
   },
   "outputs": [],
   "source": [
    "#checking for context coherence\n",
    "\n",
    "# Work on the first 500 texts of each class\n",
    "ai_sample, human_sample = ai_texts[:500], human_texts[:500]\n",
    "texts = ai_sample + human_sample\n",
    "\n",
    "vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)\n",
    "tfidf_matrix = vectorizer.fit_transform(texts)\n",
    "\n",
    "# Rows follow the order of `texts`: AI samples first, then human samples\n",
    "ai_vectors, human_vectors = tfidf_matrix[:len(ai_sample)], tfidf_matrix[len(ai_sample):]\n",
    "\n",
    "# Mean TF-IDF vector of each class\n",
    "ai_avg_vector = np.asarray(ai_vectors.mean(axis=0))\n",
    "human_avg_vector = np.asarray(human_vectors.mean(axis=0))\n",
    "\n",
    "# Compute similarity\n",
    "similarity_score = cosine_similarity(ai_avg_vector, human_avg_vector)[0, 0]\n",
    "print(f\"Context Similarity (AI vs. Human): {similarity_score:.4f}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14",
   "metadata": {
    "id": "5d5dbc50-1689-4755-a66a-413999158f6e"
   },
   "outputs": [],
   "source": [
    "# Readability score\n",
    "\n",
    "# Flesch reading-ease of every sampled text, averaged per class\n",
    "ai_scores = [flesch_reading_ease(text) for text in ai_sample]\n",
    "human_scores = [flesch_reading_ease(text) for text in human_sample]\n",
    "\n",
    "ai_readability = sum(ai_scores) / len(ai_sample)\n",
    "human_readability = sum(human_scores) / len(human_sample)\n",
    "\n",
    "print(f\"AI Readability Score: {ai_readability:.2f}\")\n",
    "print(f\"Human Readability Score: {human_readability:.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch NLTK's 'punkt_tab' resource (presumably what the nltk.word_tokenize /\n",
    "# nltk.sent_tokenize calls in the feature-extraction cell rely on - confirm)\n",
    "nltk.download(\"punkt_tab\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Shuffle all rows with a fixed seed and renumber the index from 0\n",
    "df = (\n",
    "    df.sample(frac=1, random_state=42)\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "17",
   "metadata": {},
   "outputs": [],
   "source": [
    "#Split into Train (90%) and Test (10%) to use more data for training\n",
    "train_size = int(0.9 * len(df))\n",
    "# Everything after the training rows is the test set, so derive its size from\n",
    "# the actual remainder instead of rounding 10% separately.\n",
    "test_size = len(df) - train_size\n",
    "\n",
    "# Take independent copies: feature columns are added to both frames later, and\n",
    "# assigning into a slice of `df` triggers pandas' SettingWithCopyWarning.\n",
    "df_train = df.iloc[:train_size].copy()\n",
    "df_test = df.iloc[train_size:].copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "18",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Start a W&B run (optional) and record the split sizes as its config\n",
    "run_config = {\"train_size\": train_size, \"test_size\": test_size}\n",
    "wandb.init(project=\"ai-text-detector\", name=\"full_training\", config=run_config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "19",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-text feature extraction helpers\n",
    "def calculate_readability(text):\n",
    "    \"\"\"Return textstat's Flesch reading-ease score for ``text``.\"\"\"\n",
    "    score = textstat.flesch_reading_ease(text)\n",
    "    return score\n",
    "\n",
    "def lexical_diversity(text):\n",
    "    \"\"\"Return unique tokens / total tokens for one text (0 if it has no tokens).\"\"\"\n",
    "    tokens = nltk.word_tokenize(text)\n",
    "    if not tokens:\n",
    "        return 0\n",
    "    return len(set(tokens)) / len(tokens)\n",
    "\n",
    "def sentence_length(text):\n",
    "    \"\"\"Return the mean number of tokens per sentence (0 if there are no sentences).\"\"\"\n",
    "    sentences = nltk.sent_tokenize(text)\n",
    "    if not sentences:\n",
    "        return 0\n",
    "    token_total = 0\n",
    "    for sentence in sentences:\n",
    "        token_total += len(nltk.word_tokenize(sentence))\n",
    "    return token_total / len(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "20",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply feature extraction\n",
    "print(\"Extracting features... (This may take some time)\")\n",
    "\n",
    "# Column name -> function that computes it from a single text\n",
    "feature_extractors = {\n",
    "    'readability': calculate_readability,\n",
    "    'lexical_diversity': lexical_diversity,\n",
    "    'sentence_length': sentence_length,\n",
    "}\n",
    "\n",
    "# Same three columns on both splits: training frame first, then test frame\n",
    "for frame in (df_train, df_test):\n",
    "    for column, extractor in feature_extractors.items():\n",
    "        frame[column] = frame['text'].apply(extractor)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize TF-IDF Vectorizer\n",
    "# TfidfVectorizer has no n_jobs parameter (passing one raises TypeError),\n",
    "# so only the vocabulary cap is configured here.\n",
    "vectorizer = TfidfVectorizer(max_features=5000)\n",
    "\n",
    "# Learn the vocabulary on the training texts only, then reuse it for the test texts\n",
    "X_train_tfidf = vectorizer.fit_transform(df_train['text'])\n",
    "X_test_tfidf = vectorizer.transform(df_test['text'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "22",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Append the three numeric feature columns to the right of the TF-IDF matrices\n",
    "handcrafted_columns = ['readability', 'lexical_diversity', 'sentence_length']\n",
    "\n",
    "X_train = hstack((X_train_tfidf, df_train[handcrafted_columns].values))\n",
    "X_test = hstack((X_test_tfidf, df_test[handcrafted_columns].values))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Target labels for each split, taken from the `generated` column\n",
    "y_train, y_test = df_train['generated'], df_test['generated']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize the linear classifier trained with SGD on the logistic loss\n",
    "# random_state pins the per-epoch data shuffling so repeated runs produce the\n",
    "# same model, matching the seed 42 used elsewhere in this notebook.\n",
    "model = SGDClassifier(loss='log_loss', max_iter=1000, n_jobs=-1, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "25",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fit the model and measure how long it takes (wall-clock seconds)\n",
    "start_time = time.time()\n",
    "print(\"\\n🚀 Training Model...\")\n",
    "\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "end_time = time.time()\n",
    "training_time = end_time - start_time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the fitted model on the held-out test split\n",
    "y_pred = model.predict(X_test)\n",
    "accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)\n",
    "\n",
    "print(\n",
    "    f\"\\n✅ Training Completed in {training_time:.2f} sec - Accuracy: {accuracy:.4f}\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Log timing, accuracy and per-class row counts to W&B (optional), then close the run\n",
    "wandb.log({\n",
    "    \"training_time\": training_time,\n",
    "    \"accuracy\": accuracy,\n",
    "    \"class_0_train\": y_train.eq(0).sum(),\n",
    "    \"class_1_train\": y_train.eq(1).sum(),\n",
    "    \"class_0_test\": y_test.eq(0).sum(),\n",
    "    \"class_1_test\": y_test.eq(1).sum(),\n",
    "})\n",
    "\n",
    "wandb.finish()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "6217f203-31b6-45c6-b829-c04fa4696fe8",
    "outputId": "59dda091-1380-4ab6-d910-8d22f8152e57"
   },
   "outputs": [],
   "source": [
    "# Persist the fitted classifier and the TF-IDF vectorizer to disk\n",
    "for artifact, filename in (\n",
    "    (model, 'ai_detector_model.pkl'),\n",
    "    (vectorizer, 'vectorizer.pkl'),\n",
    "):\n",
    "    joblib.dump(artifact, filename)\n",
    "\n",
    "print(\"\\n🎉 Model training completed and saved!\")"
   ]
  }
 ],
 "metadata": {
  "accelerator": "TPU",
  "colab": {
   "gpuType": "V28",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}