Spaces:

SerialGuy
/

ai-vs-human

Sleeping

App Files Files Community

SerialGuy commited on Apr 18, 2025

Commit

d4e4738

0 Parent(s):

Clean start: removed git history and binary files

Browse files

Files changed (8) hide show

.gitattributes +35 -0
.gitignore +7 -0
LICENSE +20 -0
README.md +31 -0
app.py +19 -0
predictor.py +48 -0
requirements.txt +0 -0
train_Model.ipynb +583 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+*.pkl
+.ipynb_checkpoints/
+__pycache__/
+*.pyc
+.DS_Store
+venv/
+.vscode

LICENSE ADDED Viewed

	@@ -0,0 +1,20 @@

+MIT License
+Copyright (c) 2025 SerialGuy
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,31 @@

+# 🧠 AI vs Human Text Detector
+This project is a machine learning-based system designed to distinguish between text written by a **human** and that generated by an **AI language model** (e.g., ChatGPT). It uses deep learning and text embeddings to analyze writing patterns and classify them accurately.
+---
+## 📌 Features
+- ✅ Binary classification: AI-generated vs Human-written text
+- 📈 Achieved **92% accuracy** and **0.89 F1-Score**
+- 🔍 Embedding + Deep Learning model
+- 📊 Evaluation on real-world prompts and datasets
+- 🧪 Trained and tested using clean, balanced samples
+---
+## 🚀 Live Demo (Optional)
+🔗 **[Try it on Hugging Face Spaces](https://huggingface.co/spaces/SerialGuy/ai-vs-human)**
+(*Coming Soon — stay tuned!*)
+> Enter a piece of text and the model will predict whether it's written by an AI or a human.
+---
+## 📂 Repository Structure
+```bash
+.
+├── train_model.ipynb      # Notebook to preprocess and train the model
+├── requirements.txt       # (Optional) Dependencies
+└── README.md              # Project overview and instructions

app.py ADDED Viewed

	@@ -0,0 +1,19 @@

+# app.py
+import streamlit as st
+from predictor import predict_text
+st.set_page_config(page_title="AI vs Human Text Detector", page_icon="🤖", layout="centered")
+st.title("🤖 AI vs Human Text Detector")
+st.markdown("Check if the given text was written by an **AI model** or a **human**.")
+text_input = st.text_area("Enter text here", height=200)
+if st.button("Predict"):
+    if text_input.strip() == "":
+        st.warning("Please enter some text to analyze.")
+    else:
+        prediction, confidence = predict_text(text_input)
+        label = "🧠 Human" if prediction == 1 else "🤖 AI"
+        st.success(f"**Prediction:** {label}")
+        st.info(f"**Confidence:** {confidence*100:.2f}%")

predictor.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import numpy as np
+import pandas as pd
+import textstat
+import joblib
+# Load model and vectorizer
+model = joblib.load("Models/ai_detector_model.pkl")
+vectorizer = joblib.load("Models/vectorizer.pkl")
+def calculate_readability(text):
+    """Calculate readability score for the text"""
+    return textstat.flesch_reading_ease(text)
+def lexical_diversity(text):
+    """Compute lexical diversity = unique words / total words"""
+    words = text.split()
+    return len(set(words)) / len(words) if words else 0
+def sentence_length(text):
+    """Compute average sentence length"""
+    sentences = text.split('.')
+    return sum(len(s.split()) for s in sentences) / len(sentences) if sentences else 0
+def preprocess_text(text, vectorizer):
+    """Convert text to feature vectors (TF-IDF + readability metrics)"""
+    # Convert input text into a DataFrame
+    df_sample = pd.DataFrame({'text': [text]})
+    # Extract additional features
+    df_sample['readability'] = df_sample['text'].apply(calculate_readability)
+    df_sample['lexical_diversity'] = df_sample['text'].apply(lexical_diversity)
+    df_sample['sentence_length'] = df_sample['text'].apply(sentence_length)
+    # Convert text to TF-IDF vector
+    X_tfidf = vectorizer.transform(df_sample['text'])
+    # Combine TF-IDF features with extracted features
+    X_sample = np.hstack((X_tfidf.toarray(),
+                          df_sample[['readability', 'lexical_diversity', 'sentence_length']].values))
+    return X_sample
+def predict_text(text):
+    X_sample = preprocess_text(text)
+    prediction = model.predict(X_sample)[0]
+    confidence = model.predict_proba(X_sample)[0][:, 1]
+    return prediction, confidence

requirements.txt ADDED Viewed

Binary file (2.49 kB). View file

train_Model.ipynb ADDED Viewed

	@@ -0,0 +1,583 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0",
+   "metadata": {
+    "id": "a3074189-9ff0-41da-a99b-d42d2172a914"
+   },
+   "outputs": [],
+   "source": [
+    "#Installing dependent libraries\n",
+    "%pip install pandas matplotlib\n",
+    "%pip install imblearn\n",
+    "%pip install nltk\n",
+    "%pip install textstat   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Connecting With Wandb(optional)\n",
+    "%pip install wandb\n",
+    "import wandb\n",
+    "wandb.login()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Importing all the libraries\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "from imblearn.under_sampling import RandomUnderSampler\n",
+    "import numpy as np\n",
+    "import random\n",
+    "from collections import Counter\n",
+    "import nltk\n",
+    "from nltk.corpus import stopwords\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.metrics.pairwise import cosine_similarity\n",
+    "from textstat import flesch_reading_ease\n",
+    "import textstat\n",
+    "import joblib\n",
+    "from scipy.sparse import hstack\n",
+    "from sklearn.linear_model import SGDClassifier\n",
+    "from sklearn.utils import shuffle\n",
+    "from sklearn.metrics import accuracy_score, classification_report\n",
+    "from multiprocessing import cpu_count\n",
+    "import time\n",
+    "import gc\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3",
+   "metadata": {
+    "id": "b2160971-e7b8-4bc0-812c-769dbaf2945e"
+   },
+   "outputs": [],
+   "source": [
+    "#Basic dataset handling and new file creation\n",
+    "df = pd.read_csv(\"Datasets/AI_Human.csv\", engine='python', encoding='utf-8',on_bad_lines='skip')\n",
+    "\n",
+    "df.dropna(inplace=True)\n",
+    "df = df[df[\"text\"].str.strip() != \"\"]\n",
+    "df.drop_duplicates(inplace=True)\n",
+    "df[\"text\"] = df[\"text\"].str.lower().str.strip()\n",
+    "\n",
+    "df.to_csv(\"Datasets/cleaned_dataset.csv\", index=False)\n",
+    "\n",
+    "del df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {
+    "id": "2b062d3a-e196-40c0-af09-26c5e3f6b2a3"
+   },
+   "outputs": [],
+   "source": [
+    "#Checking class distribution\n",
+    "df = pd.read_csv(\"Datasets/cleaned_dataset.csv\",dtype={'generated': 'float'}, low_memory=False)\n",
+    "gc.collect()\n",
+    "print(df[\"generated\"].value_counts())\n",
+    "\n",
+    "# Plot distribution\n",
+    "df[\"generated\"].value_counts().plot(kind=\"bar\", color=[\"blue\", \"red\"])\n",
+    "plt.title(\"Distribution of AI vs. Human Texts\")\n",
+    "plt.xlabel(\"Label (0=Human, 1=AI)\")\n",
+    "plt.ylabel(\"Count\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5",
+   "metadata": {
+    "id": "2205b524-66b4-4d64-8b87-ec892f260590"
+   },
+   "outputs": [],
+   "source": [
+    "#Balancing dataset for equal class distribution\n",
+    "\n",
+    "rus = RandomUnderSampler(random_state=42)\n",
+    "X_resampled, y_resampled = rus.fit_resample(df[[\"text\"]], df[\"generated\"])\n",
+    "\n",
+    "df_resampled = pd.DataFrame(X_resampled, columns=[\"text\"])\n",
+    "df_resampled[\"generated\"] = y_resampled\n",
+    "\n",
+    "print(df_resampled[\"generated\"].value_counts())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {
+    "id": "a3a94a8f-c082-4c34-aae1-8d6310b6ac35"
+   },
+   "outputs": [],
+   "source": [
+    "#check for sentence length size\n",
+    "df[\"text_length\"] = df[\"text\"].apply(len)\n",
+    "\n",
+    "# Plot text length distribution\n",
+    "df.hist(column=\"text_length\", by=\"generated\", bins=50, figsize=(10, 5), color=[\"blue\"])\n",
+    "plt.suptitle(\"Text Length Distribution for AI vs. Human\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7",
+   "metadata": {
+    "id": "1aa4110a-79cc-4e5c-80f5-b8f6ee8b9fdf"
+   },
+   "outputs": [],
+   "source": [
+    "#Checking for Words Lenght Distribution\n",
+    "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split()))  # Count words\n",
+    "\n",
+    "# Plot histogram\n",
+    "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
+    "plt.xlabel(\"Words Length\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.title(\"Words Length Distribution\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {
+    "id": "1cb5091b-8c4d-45ff-8323-8c5a8ec45001"
+   },
+   "outputs": [],
+   "source": [
+    "#Trimming Long Text Length for balancing both classes\n",
+    "\n",
+    "def smart_truncate(text, max_length=700):\n",
+    "    words = text.split()\n",
+    "    length = len(words)\n",
+    "\n",
+    "    if length > max_length:\n",
+    "        decay_factor = np.exp(-0.002 * (length - max_length)) \n",
+    "        if random.random() > decay_factor:\n",
+    "            trunc_limit = random.randint(600, 700) \n",
+    "            return \" \".join(words[:trunc_limit])\n",
+    "\n",
+    "    return text  # Keep original if within limit\n",
+    "\n",
+    "df[\"text\"] = df[\"text\"].apply(smart_truncate)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9",
+   "metadata": {
+    "id": "662656fd-2202-47f0-a8d0-e45c83471797"
+   },
+   "outputs": [],
+   "source": [
+    "#check text length after trimming\n",
+    "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split()))  # Count words\n",
+    "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
+    "plt.xlabel(\"Text Length (words)\")\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.title(\"Text Length Distribution\")\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {
+    "id": "859bfa5d-1628-4c20-ad76-5cdc9d0c503f"
+   },
+   "outputs": [],
+   "source": [
+    "#check for data overlap\n",
+    "nltk.download(\"stopwords\")\n",
+    "\n",
+    "stop_words = set(stopwords.words(\"english\"))\n",
+    "\n",
+    "# Get the most common words in AI-generated vs. Human text\n",
+    "ai_words = Counter(\" \".join(df[df[\"generated\"] == 1][\"text\"]).split())\n",
+    "human_words = Counter(\" \".join(df[df[\"generated\"] == 0][\"text\"]).split())\n",
+    "\n",
+    "# Remove stopwords\n",
+    "ai_words = {word: count for word, count in ai_words.items() if word.lower() not in stop_words}\n",
+    "human_words = {word: count for word, count in human_words.items() if word.lower() not in stop_words}\n",
+    "\n",
+    "ai_words = Counter(ai_words)  # Convert to Counter\n",
+    "human_words = Counter(human_words)  # Convert to Counter\n",
+    "\n",
+    "# Compare the top 20 words\n",
+    "print(\"Top 20 AI-generated words:\", ai_words.most_common(20))\n",
+    "print(\"Top 20 Human words:\", human_words.most_common(20))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11",
+   "metadata": {
+    "id": "4a7803ee-49bc-493f-aa88-b9d981161397"
+   },
+   "outputs": [],
+   "source": [
+    "#check for overlap percentage\n",
+    "ai_top_words = set(word for word, _ in ai_words.most_common(50))\n",
+    "human_top_words = set(word for word, _ in human_words.most_common(50))\n",
+    "\n",
+    "overlap = ai_top_words.intersection(human_top_words)\n",
+    "overlap_percentage = (len(overlap) / len(ai_top_words)) * 100\n",
+    "print(f\"Overlap Percentage: {overlap_percentage:.2f}%\")\n",
+    "\n",
+    "#checking graph distribution for overlap\n",
+    "ai_freqs = [count for _, count in ai_words.most_common(20)]\n",
+    "human_freqs = [count for _, count in human_words.most_common(20)]\n",
+    "labels = [word for word, _ in ai_words.most_common(20)]\n",
+    "\n",
+    "plt.figure(figsize=(12, 6))\n",
+    "plt.bar(labels, ai_freqs, color='blue', alpha=0.6, label=\"AI-generated\")\n",
+    "plt.bar(labels, human_freqs, color='red', alpha=0.6, label=\"Human-written\")\n",
+    "plt.xticks(rotation=45)\n",
+    "plt.ylabel(\"Frequency\")\n",
+    "plt.title(\"Word Frequency Comparison: AI vs. Human\")\n",
+    "plt.legend()\n",
+    "plt.show()\n",
+    "\n",
+    "#check for ai specific bias\n",
+    "for word in [\"electoral\", \"students\", \"college\", \"may\"]:\n",
+    "    ai_count = ai_words.get(word, 0)\n",
+    "    human_count = human_words.get(word, 0)\n",
+    "    print(f\"{word}: AI={ai_count}, Human={human_count}, Ratio={ai_count/human_count:.2f}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12",
+   "metadata": {
+    "id": "3e501a40-6373-492d-862e-d4037645164d"
+   },
+   "outputs": [],
+   "source": [
+    "#checking for lexical diversity\n",
+    "def lexical_diversity(texts):\n",
+    "    total_words = sum(len(text.split()) for text in texts)\n",
+    "    unique_words = len(set(\" \".join(texts).split()))\n",
+    "    return unique_words / total_words\n",
+    "\n",
+    "ai_texts = df[df['generated'] == 1]['text'].tolist()\n",
+    "human_texts = df[df['generated'] == 0]['text'].tolist()\n",
+    "\n",
+    "ai_diversity = lexical_diversity(ai_texts)  # List of AI-generated texts\n",
+    "human_diversity = lexical_diversity(human_texts)  # List of human-written texts\n",
+    "\n",
+    "print(f\"Lexical Diversity - AI: {ai_diversity:.4f}, Human: {human_diversity:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {
+    "id": "20230773-aeca-4dad-a273-6418fd6a14d1"
+   },
+   "outputs": [],
+   "source": [
+    "#checking for context coherence\n",
+    "\n",
+    "ai_sample = ai_texts[:500]\n",
+    "human_sample = human_texts[:500]\n",
+    "\n",
+    "\n",
+    "texts = ai_sample + human_sample\n",
+    "\n",
+    "\n",
+    "vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')\n",
+    "tfidf_matrix = vectorizer.fit_transform(texts)\n",
+    "\n",
+    "\n",
+    "ai_vectors = tfidf_matrix[:len(ai_sample)]\n",
+    "human_vectors = tfidf_matrix[len(ai_sample):]\n",
+    "\n",
+    "ai_avg_vector = np.asarray(ai_vectors.mean(axis=0))\n",
+    "human_avg_vector = np.asarray(human_vectors.mean(axis=0))\n",
+    "\n",
+    "# Compute similarity\n",
+    "similarity_score = cosine_similarity(ai_avg_vector, human_avg_vector)[0][0]\n",
+    "print(f\"Context Similarity (AI vs. Human): {similarity_score:.4f}\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14",
+   "metadata": {
+    "id": "5d5dbc50-1689-4755-a66a-413999158f6e"
+   },
+   "outputs": [],
+   "source": [
+    "#Readablity Score\n",
+    "\n",
+    "ai_readability = sum(flesch_reading_ease(text) for text in ai_sample) / len(ai_sample)\n",
+    "human_readability = sum(flesch_reading_ease(text) for text in human_sample) / len(human_sample)\n",
+    "\n",
+    "print(f\"AI Readability Score: {ai_readability:.2f}\")\n",
+    "print(f\"Human Readability Score: {human_readability:.2f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "nltk.download('punkt_tab')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.sample(frac=1, random_state=42).reset_index(drop=True) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Split into Train (90%) and Test (10%) to use more data for training\n",
+    "train_size = int(0.9 * len(df))\n",
+    "test_size = int(0.1 * len(df))\n",
+    "df_train = df[:train_size]\n",
+    "df_test = df[train_size:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Initializing W&B (optional)\n",
+    "wandb.init(\n",
+    "    project=\"ai-text-detector\",\n",
+    "    name=\"full_training\",\n",
+    "    config={\"train_size\": train_size, \"test_size\": test_size}\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "19",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Defining feature extraction functions (optimized)\n",
+    "def calculate_readability(text):\n",
+    "    return textstat.flesch_reading_ease(text)\n",
+    "\n",
+    "def lexical_diversity(text):\n",
+    "    words = nltk.word_tokenize(text)\n",
+    "    return len(set(words)) / len(words) if len(words) > 0 else 0\n",
+    "\n",
+    "def sentence_length(text):\n",
+    "    sentences = nltk.sent_tokenize(text)\n",
+    "    return sum(len(nltk.word_tokenize(sent)) for sent in sentences) / len(sentences) if len(sentences) > 0 else 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Apply feature extraction\n",
+    "print(\"Extracting features... (This may take some time)\")\n",
+    "df_train['readability'] = df_train['text'].apply(calculate_readability)\n",
+    "df_train['lexical_diversity'] = df_train['text'].apply(lexical_diversity)\n",
+    "df_train['sentence_length'] = df_train['text'].apply(sentence_length)\n",
+    "\n",
+    "df_test['readability'] = df_test['text'].apply(calculate_readability)\n",
+    "df_test['lexical_diversity'] = df_test['text'].apply(lexical_diversity)\n",
+    "df_test['sentence_length'] = df_test['text'].apply(sentence_length)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Initialize TF-IDF Vectorizer with Parallel Processing\n",
+    "vectorizer = TfidfVectorizer(max_features=5000, n_jobs=-1) \n",
+    "X_train_tfidf = vectorizer.fit_transform(df_train['text'])\n",
+    "X_test_tfidf = vectorizer.transform(df_test['text'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Stack Sparse Matrices for Final Features\n",
+    "X_train = hstack((X_train_tfidf, df_train[['readability', 'lexical_diversity', 'sentence_length']].values))\n",
+    "X_test = hstack((X_test_tfidf, df_test[['readability', 'lexical_diversity', 'sentence_length']].values))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "23",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Defining Train Test Dataset\n",
+    "y_train = df_train['generated']\n",
+    "y_test = df_test['generated']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Initialize Model with Multi-core Processing\n",
+    "model = SGDClassifier(loss='log_loss', max_iter=1000, n_jobs=-1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "25",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Training the Model\n",
+    "start_time = time.time()\n",
+    "print(\"\\n🚀 Training Model...\")\n",
+    "\n",
+    "model.fit(X_train, y_train)\n",
+    "\n",
+    "training_time = time.time() - start_time"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Evaluate Model\n",
+    "y_pred = model.predict(X_test)\n",
+    "accuracy = accuracy_score(y_test, y_pred)\n",
+    "print(f\"\\n✅ Training Completed in {training_time:.2f} sec - Accuracy: {accuracy:.4f}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Log Metrics to W&B(Optional)\n",
+    "wandb.log({\n",
+    "    \"training_time\": training_time,\n",
+    "    \"accuracy\": accuracy,\n",
+    "    \"class_0_train\": (y_train == 0).sum(),\n",
+    "    \"class_1_train\": (y_train == 1).sum(),\n",
+    "    \"class_0_test\": (y_test == 0).sum(),\n",
+    "    \"class_1_test\": (y_test == 1).sum(),\n",
+    "})\n",
+    "wandb.finish()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "28",
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "6217f203-31b6-45c6-b829-c04fa4696fe8",
+    "outputId": "59dda091-1380-4ab6-d910-8d22f8152e57"
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Save Model\n",
+    "joblib.dump(model, 'ai_detector_model.pkl')\n",
+    "joblib.dump(vectorizer, 'vectorizer.pkl')\n",
+    "\n",
+    "print(\"\\n🎉 Model training completed and saved!\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "TPU",
+  "colab": {
+   "gpuType": "V28",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}