SerialGuy committed on
Commit
d4e4738
·
0 Parent(s):

Clean start: removed git history and binary files

Browse files
Files changed (8) hide show
  1. .gitattributes +35 -0
  2. .gitignore +7 -0
  3. LICENSE +20 -0
  4. README.md +31 -0
  5. app.py +19 -0
  6. predictor.py +48 -0
  7. requirements.txt +0 -0
  8. train_Model.ipynb +583 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ *.pkl
2
+ .ipynb_checkpoints/
3
+ __pycache__/
4
+ *.pyc
5
+ .DS_Store
6
+ venv/
7
+ .vscode
LICENSE ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 SerialGuy
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights to
8
+ use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
9
+ of the Software, and to permit persons to whom the Software is furnished to do so,
10
+ subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in
13
+ all copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
16
+ INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
17
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
18
+ HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
19
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20
+ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🧠 AI vs Human Text Detector
2
+
3
+ This project is a machine learning-based system designed to distinguish between text written by a **human** and that generated by an **AI language model** (e.g., ChatGPT). It uses deep learning and text embeddings to analyze writing patterns and classify them accurately.
4
+
5
+ ---
6
+
7
+ ## 📌 Features
8
+
9
+ - ✅ Binary classification: AI-generated vs Human-written text
10
+ - 📈 Achieved **92% accuracy** and **0.89 F1-Score**
11
+ - 🔍 Embedding + Deep Learning model
12
+ - 📊 Evaluation on real-world prompts and datasets
13
+ - 🧪 Trained and tested using clean, balanced samples
14
+
15
+ ---
16
+
17
+ ## 🚀 Live Demo (Optional)
18
+
19
+ 🔗 **[Try it on Hugging Face Spaces](https://huggingface.co/spaces/SerialGuy/ai-vs-human)**
20
+ (*Coming Soon — stay tuned!*)
21
+ > Enter a piece of text and the model will predict whether it's written by an AI or a human.
22
+
23
+ ---
24
+
25
+ ## 📂 Repository Structure
26
+
27
+ ```bash
28
+ .
29
+ ├── train_Model.ipynb # Notebook to preprocess and train the model
30
+ ├── requirements.txt # (Optional) Dependencies
31
+ └── README.md # Project overview and instructions
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import streamlit as st
3
+ from predictor import predict_text
4
+
5
# Streamlit front end: collects text from the user and shows the detector's verdict.
st.set_page_config(page_title="AI vs Human Text Detector", page_icon="🤖", layout="centered")

st.title("🤖 AI vs Human Text Detector")
st.markdown("Check if the given text was written by an **AI model** or a **human**.")

text_input = st.text_area("Enter text here", height=200)

if st.button("Predict"):
    if not text_input.strip():
        # Nothing but whitespace was entered, so there is nothing to classify.
        st.warning("Please enter some text to analyze.")
    else:
        prediction, confidence = predict_text(text_input)
        # The training notebook labels AI text as generated == 1 and human
        # text as 0, so class 1 must be reported as AI (the previous mapping
        # was inverted).
        label = "🤖 AI" if prediction == 1 else "🧠 Human"
        st.success(f"**Prediction:** {label}")
        st.info(f"**Confidence:** {confidence*100:.2f}%")
predictor.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import textstat
4
+ import joblib
5
+
6
# Load model and vectorizer
# NOTE: joblib.load unpickles arbitrary objects, so only load artifacts that
# come from a trusted source.
# The training notebook dumps 'ai_detector_model.pkl' and 'vectorizer.pkl'
# into its working directory; they must be placed under Models/ before this
# module is imported, otherwise the loads below fail at import time.
model = joblib.load("Models/ai_detector_model.pkl")
vectorizer = joblib.load("Models/vectorizer.pkl")
9
+
10
def calculate_readability(text):
    """Return the Flesch Reading Ease score that textstat computes for *text*."""
    score = textstat.flesch_reading_ease(text)
    return score
13
+
14
def lexical_diversity(text):
    """Return the ratio of distinct whitespace-separated tokens to all tokens.

    Text that contains no tokens yields 0.
    """
    tokens = text.split()
    if not tokens:
        return 0
    distinct = set(tokens)
    return len(distinct) / len(tokens)
18
+
19
def sentence_length(text):
    """Compute the average number of words per sentence.

    Sentences are delimited by runs of '.', '!' or '?'. Fragments that hold
    no words (for example the empty string left after a trailing period) are
    ignored, so they no longer inflate the sentence count.

    Args:
        text: Input string.

    Returns:
        Mean words per sentence as a float, or 0 when the text contains no
        words at all.
    """
    import re

    # One entry per non-empty sentence fragment: its word count.
    word_counts = [
        len(fragment.split())
        for fragment in re.split(r"[.!?]+", text)
        if fragment.strip()
    ]
    return sum(word_counts) / len(word_counts) if word_counts else 0
23
+
24
def preprocess_text(text, vectorizer):
    """Convert one text into a feature row (TF-IDF + readability metrics).

    The training notebook lower-cases and strips every document before
    extracting features, so the same normalisation is applied here to keep
    inference features consistent with what the model was fitted on.

    Args:
        text: Raw input string.
        vectorizer: Fitted vectorizer exposing ``transform``; the result must
            support ``toarray()``.

    Returns:
        2-D NumPy array with a single row: the TF-IDF columns followed by
        readability, lexical diversity and average sentence length, in that
        order (the column order used when the training matrix is stacked).
    """
    normalized = text.lower().strip()

    # Handcrafted features, shaped (1, 3) so they can be stacked column-wise.
    extra_features = np.array([[
        calculate_readability(normalized),
        lexical_diversity(normalized),
        sentence_length(normalized),
    ]])

    # Convert text to TF-IDF vector
    X_tfidf = vectorizer.transform([normalized])

    # Combine TF-IDF features with extracted features
    return np.hstack((X_tfidf.toarray(), extra_features))
43
+
44
def predict_text(text):
    """Classify *text* and report how confident the model is in that label.

    Args:
        text: Raw input string to classify.

    Returns:
        tuple: ``(prediction, confidence)`` where ``prediction`` is the label
        returned by ``model.predict`` and ``confidence`` is the probability
        (a float between 0 and 1) that ``model.predict_proba`` assigns to
        that same label.
    """
    # preprocess_text requires the fitted vectorizer; the previous call
    # omitted it and raised TypeError.
    X_sample = preprocess_text(text, vectorizer)
    prediction = model.predict(X_sample)[0]

    # predict_proba(...)[0] is a 1-D row of per-class probabilities whose
    # columns follow model.classes_; 2-D indexing on it raised IndexError.
    probabilities = model.predict_proba(X_sample)[0]
    class_index = list(model.classes_).index(prediction)
    confidence = float(probabilities[class_index])
    return prediction, confidence
requirements.txt ADDED
Binary file (2.49 kB). View file
 
train_Model.ipynb ADDED
@@ -0,0 +1,583 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "0",
7
+ "metadata": {
8
+ "id": "a3074189-9ff0-41da-a99b-d42d2172a914"
9
+ },
10
+ "outputs": [],
11
+ "source": [
12
+ "#Installing dependent libraries\n",
13
+ "%pip install pandas matplotlib\n",
14
+ "%pip install imblearn\n",
15
+ "%pip install nltk\n",
16
+ "%pip install textstat "
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": null,
22
+ "id": "1",
23
+ "metadata": {},
24
+ "outputs": [],
25
+ "source": [
26
+ "#Connecting With Wandb(optional)\n",
27
+ "%pip install wandb\n",
28
+ "import wandb\n",
29
+ "wandb.login()"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "id": "2",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "#Importing all the libraries\n",
40
+ "import pandas as pd\n",
41
+ "import matplotlib.pyplot as plt\n",
42
+ "from imblearn.under_sampling import RandomUnderSampler\n",
43
+ "import numpy as np\n",
44
+ "import random\n",
45
+ "from collections import Counter\n",
46
+ "import nltk\n",
47
+ "from nltk.corpus import stopwords\n",
48
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
49
+ "from sklearn.metrics.pairwise import cosine_similarity\n",
50
+ "from textstat import flesch_reading_ease\n",
51
+ "import textstat\n",
52
+ "import joblib\n",
53
+ "from scipy.sparse import hstack\n",
54
+ "from sklearn.linear_model import SGDClassifier\n",
55
+ "from sklearn.utils import shuffle\n",
56
+ "from sklearn.metrics import accuracy_score, classification_report\n",
57
+ "from multiprocessing import cpu_count\n",
58
+ "import time\n",
59
+ "import gc\n"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "id": "3",
66
+ "metadata": {
67
+ "id": "b2160971-e7b8-4bc0-812c-769dbaf2945e"
68
+ },
69
+ "outputs": [],
70
+ "source": [
71
+ "#Basic dataset handling and new file creation\n",
72
+ "df = pd.read_csv(\"Datasets/AI_Human.csv\", engine='python', encoding='utf-8',on_bad_lines='skip')\n",
73
+ "\n",
74
+ "df.dropna(inplace=True)\n",
75
+ "df = df[df[\"text\"].str.strip() != \"\"]\n",
76
+ "df.drop_duplicates(inplace=True)\n",
77
+ "df[\"text\"] = df[\"text\"].str.lower().str.strip()\n",
78
+ "\n",
79
+ "df.to_csv(\"Datasets/cleaned_dataset.csv\", index=False)\n",
80
+ "\n",
81
+ "del df"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "code",
86
+ "execution_count": null,
87
+ "id": "4",
88
+ "metadata": {
89
+ "id": "2b062d3a-e196-40c0-af09-26c5e3f6b2a3"
90
+ },
91
+ "outputs": [],
92
+ "source": [
93
+ "#Checking class distribution\n",
94
+ "df = pd.read_csv(\"Datasets/cleaned_dataset.csv\",dtype={'generated': 'float'}, low_memory=False)\n",
95
+ "gc.collect()\n",
96
+ "print(df[\"generated\"].value_counts())\n",
97
+ "\n",
98
+ "# Plot distribution\n",
99
+ "df[\"generated\"].value_counts().plot(kind=\"bar\", color=[\"blue\", \"red\"])\n",
100
+ "plt.title(\"Distribution of AI vs. Human Texts\")\n",
101
+ "plt.xlabel(\"Label (0=Human, 1=AI)\")\n",
102
+ "plt.ylabel(\"Count\")\n",
103
+ "plt.show()"
104
+ ]
105
+ },
106
+ {
107
+ "cell_type": "code",
108
+ "execution_count": null,
109
+ "id": "5",
110
+ "metadata": {
111
+ "id": "2205b524-66b4-4d64-8b87-ec892f260590"
112
+ },
113
+ "outputs": [],
114
+ "source": [
115
+ "#Balancing dataset for equal class distribution\n",
116
+ "\n",
117
+ "rus = RandomUnderSampler(random_state=42)\n",
118
+ "X_resampled, y_resampled = rus.fit_resample(df[[\"text\"]], df[\"generated\"])\n",
119
+ "\n",
120
+ "df_resampled = pd.DataFrame(X_resampled, columns=[\"text\"])\n",
121
+ "df_resampled[\"generated\"] = y_resampled\n",
122
+ "\n",
123
+ "print(df_resampled[\"generated\"].value_counts())"
124
+ ]
125
+ },
126
+ {
127
+ "cell_type": "code",
128
+ "execution_count": null,
129
+ "id": "6",
130
+ "metadata": {
131
+ "id": "a3a94a8f-c082-4c34-aae1-8d6310b6ac35"
132
+ },
133
+ "outputs": [],
134
+ "source": [
135
+ "#check for sentence length size\n",
136
+ "df[\"text_length\"] = df[\"text\"].apply(len)\n",
137
+ "\n",
138
+ "# Plot text length distribution\n",
139
+ "df.hist(column=\"text_length\", by=\"generated\", bins=50, figsize=(10, 5), color=[\"blue\"])\n",
140
+ "plt.suptitle(\"Text Length Distribution for AI vs. Human\")\n",
141
+ "plt.show()"
142
+ ]
143
+ },
144
+ {
145
+ "cell_type": "code",
146
+ "execution_count": null,
147
+ "id": "7",
148
+ "metadata": {
149
+ "id": "1aa4110a-79cc-4e5c-80f5-b8f6ee8b9fdf"
150
+ },
151
+ "outputs": [],
152
+ "source": [
153
+ "#Checking for Words Length Distribution\n",
154
+ "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split())) # Count words\n",
155
+ "\n",
156
+ "# Plot histogram\n",
157
+ "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
158
+ "plt.xlabel(\"Words Length\")\n",
159
+ "plt.ylabel(\"Frequency\")\n",
160
+ "plt.title(\"Words Length Distribution\")\n",
161
+ "plt.show()"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "id": "8",
168
+ "metadata": {
169
+ "id": "1cb5091b-8c4d-45ff-8323-8c5a8ec45001"
170
+ },
171
+ "outputs": [],
172
+ "source": [
173
+ "#Trimming Long Text Length for balancing both classes\n",
174
+ "\n",
175
+ "def smart_truncate(text, max_length=700):\n",
176
+ " words = text.split()\n",
177
+ " length = len(words)\n",
178
+ "\n",
179
+ " if length > max_length:\n",
180
+ " decay_factor = np.exp(-0.002 * (length - max_length)) \n",
181
+ " if random.random() > decay_factor:\n",
182
+ " trunc_limit = random.randint(600, 700) \n",
183
+ " return \" \".join(words[:trunc_limit])\n",
184
+ "\n",
185
+ " return text # Keep original if within limit\n",
186
+ "\n",
187
+ "df[\"text\"] = df[\"text\"].apply(smart_truncate)\n"
188
+ ]
189
+ },
190
+ {
191
+ "cell_type": "code",
192
+ "execution_count": null,
193
+ "id": "9",
194
+ "metadata": {
195
+ "id": "662656fd-2202-47f0-a8d0-e45c83471797"
196
+ },
197
+ "outputs": [],
198
+ "source": [
199
+ "#check text length after trimming\n",
200
+ "df[\"words_length\"] = df[\"text\"].apply(lambda x: len(x.split())) # Count words\n",
201
+ "plt.hist(df[\"words_length\"], bins=50, color=\"blue\", alpha=0.7)\n",
202
+ "plt.xlabel(\"Text Length (words)\")\n",
203
+ "plt.ylabel(\"Frequency\")\n",
204
+ "plt.title(\"Text Length Distribution\")\n",
205
+ "plt.show()"
206
+ ]
207
+ },
208
+ {
209
+ "cell_type": "code",
210
+ "execution_count": null,
211
+ "id": "10",
212
+ "metadata": {
213
+ "id": "859bfa5d-1628-4c20-ad76-5cdc9d0c503f"
214
+ },
215
+ "outputs": [],
216
+ "source": [
217
+ "#check for data overlap\n",
218
+ "nltk.download(\"stopwords\")\n",
219
+ "\n",
220
+ "stop_words = set(stopwords.words(\"english\"))\n",
221
+ "\n",
222
+ "# Get the most common words in AI-generated vs. Human text\n",
223
+ "ai_words = Counter(\" \".join(df[df[\"generated\"] == 1][\"text\"]).split())\n",
224
+ "human_words = Counter(\" \".join(df[df[\"generated\"] == 0][\"text\"]).split())\n",
225
+ "\n",
226
+ "# Remove stopwords\n",
227
+ "ai_words = {word: count for word, count in ai_words.items() if word.lower() not in stop_words}\n",
228
+ "human_words = {word: count for word, count in human_words.items() if word.lower() not in stop_words}\n",
229
+ "\n",
230
+ "ai_words = Counter(ai_words) # Convert to Counter\n",
231
+ "human_words = Counter(human_words) # Convert to Counter\n",
232
+ "\n",
233
+ "# Compare the top 20 words\n",
234
+ "print(\"Top 20 AI-generated words:\", ai_words.most_common(20))\n",
235
+ "print(\"Top 20 Human words:\", human_words.most_common(20))\n"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "id": "11",
242
+ "metadata": {
243
+ "id": "4a7803ee-49bc-493f-aa88-b9d981161397"
244
+ },
245
+ "outputs": [],
246
+ "source": [
247
+ "#check for overlap percentage\n",
248
+ "ai_top_words = set(word for word, _ in ai_words.most_common(50))\n",
249
+ "human_top_words = set(word for word, _ in human_words.most_common(50))\n",
250
+ "\n",
251
+ "overlap = ai_top_words.intersection(human_top_words)\n",
252
+ "overlap_percentage = (len(overlap) / len(ai_top_words)) * 100\n",
253
+ "print(f\"Overlap Percentage: {overlap_percentage:.2f}%\")\n",
254
+ "\n",
255
+ "#checking graph distribution for overlap\n",
256
+ "ai_freqs = [count for _, count in ai_words.most_common(20)]\n",
257
+ "human_freqs = [count for _, count in human_words.most_common(20)]\n",
258
+ "labels = [word for word, _ in ai_words.most_common(20)]\n",
259
+ "\n",
260
+ "plt.figure(figsize=(12, 6))\n",
261
+ "plt.bar(labels, ai_freqs, color='blue', alpha=0.6, label=\"AI-generated\")\n",
262
+ "plt.bar(labels, human_freqs, color='red', alpha=0.6, label=\"Human-written\")\n",
263
+ "plt.xticks(rotation=45)\n",
264
+ "plt.ylabel(\"Frequency\")\n",
265
+ "plt.title(\"Word Frequency Comparison: AI vs. Human\")\n",
266
+ "plt.legend()\n",
267
+ "plt.show()\n",
268
+ "\n",
269
+ "#check for ai specific bias\n",
270
+ "for word in [\"electoral\", \"students\", \"college\", \"may\"]:\n",
271
+ " ai_count = ai_words.get(word, 0)\n",
272
+ " human_count = human_words.get(word, 0)\n",
273
+ " print(f\"{word}: AI={ai_count}, Human={human_count}, Ratio={ai_count/human_count:.2f}\")\n",
274
+ "\n"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": null,
280
+ "id": "12",
281
+ "metadata": {
282
+ "id": "3e501a40-6373-492d-862e-d4037645164d"
283
+ },
284
+ "outputs": [],
285
+ "source": [
286
+ "#checking for lexical diversity\n",
287
+ "def lexical_diversity(texts):\n",
288
+ " total_words = sum(len(text.split()) for text in texts)\n",
289
+ " unique_words = len(set(\" \".join(texts).split()))\n",
290
+ " return unique_words / total_words\n",
291
+ "\n",
292
+ "ai_texts = df[df['generated'] == 1]['text'].tolist()\n",
293
+ "human_texts = df[df['generated'] == 0]['text'].tolist()\n",
294
+ "\n",
295
+ "ai_diversity = lexical_diversity(ai_texts) # List of AI-generated texts\n",
296
+ "human_diversity = lexical_diversity(human_texts) # List of human-written texts\n",
297
+ "\n",
298
+ "print(f\"Lexical Diversity - AI: {ai_diversity:.4f}, Human: {human_diversity:.4f}\")\n"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": null,
304
+ "id": "13",
305
+ "metadata": {
306
+ "id": "20230773-aeca-4dad-a273-6418fd6a14d1"
307
+ },
308
+ "outputs": [],
309
+ "source": [
310
+ "#checking for context coherence\n",
311
+ "\n",
312
+ "ai_sample = ai_texts[:500]\n",
313
+ "human_sample = human_texts[:500]\n",
314
+ "\n",
315
+ "\n",
316
+ "texts = ai_sample + human_sample\n",
317
+ "\n",
318
+ "\n",
319
+ "vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')\n",
320
+ "tfidf_matrix = vectorizer.fit_transform(texts)\n",
321
+ "\n",
322
+ "\n",
323
+ "ai_vectors = tfidf_matrix[:len(ai_sample)]\n",
324
+ "human_vectors = tfidf_matrix[len(ai_sample):]\n",
325
+ "\n",
326
+ "ai_avg_vector = np.asarray(ai_vectors.mean(axis=0))\n",
327
+ "human_avg_vector = np.asarray(human_vectors.mean(axis=0))\n",
328
+ "\n",
329
+ "# Compute similarity\n",
330
+ "similarity_score = cosine_similarity(ai_avg_vector, human_avg_vector)[0][0]\n",
331
+ "print(f\"Context Similarity (AI vs. Human): {similarity_score:.4f}\")\n"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": null,
337
+ "id": "14",
338
+ "metadata": {
339
+ "id": "5d5dbc50-1689-4755-a66a-413999158f6e"
340
+ },
341
+ "outputs": [],
342
+ "source": [
343
+ "#Readability Score\n",
344
+ "\n",
345
+ "ai_readability = sum(flesch_reading_ease(text) for text in ai_sample) / len(ai_sample)\n",
346
+ "human_readability = sum(flesch_reading_ease(text) for text in human_sample) / len(human_sample)\n",
347
+ "\n",
348
+ "print(f\"AI Readability Score: {ai_readability:.2f}\")\n",
349
+ "print(f\"Human Readability Score: {human_readability:.2f}\")"
350
+ ]
351
+ },
352
+ {
353
+ "cell_type": "code",
354
+ "execution_count": null,
355
+ "id": "15",
356
+ "metadata": {},
357
+ "outputs": [],
358
+ "source": [
359
+ "nltk.download('punkt_tab')"
360
+ ]
361
+ },
362
+ {
363
+ "cell_type": "code",
364
+ "execution_count": null,
365
+ "id": "16",
366
+ "metadata": {},
367
+ "outputs": [],
368
+ "source": [
369
+ "df = df.sample(frac=1, random_state=42).reset_index(drop=True) "
370
+ ]
371
+ },
372
+ {
373
+ "cell_type": "code",
374
+ "execution_count": null,
375
+ "id": "17",
376
+ "metadata": {},
377
+ "outputs": [],
378
+ "source": [
379
+ "#Split into Train (90%) and Test (10%) to use more data for training\n",
380
+ "train_size = int(0.9 * len(df))\n",
381
+ "test_size = int(0.1 * len(df))\n",
382
+ "df_train = df[:train_size]\n",
383
+ "df_test = df[train_size:]"
384
+ ]
385
+ },
386
+ {
387
+ "cell_type": "code",
388
+ "execution_count": null,
389
+ "id": "18",
390
+ "metadata": {},
391
+ "outputs": [],
392
+ "source": [
393
+ "#Initializing W&B (optional)\n",
394
+ "wandb.init(\n",
395
+ " project=\"ai-text-detector\",\n",
396
+ " name=\"full_training\",\n",
397
+ " config={\"train_size\": train_size, \"test_size\": test_size}\n",
398
+ ")"
399
+ ]
400
+ },
401
+ {
402
+ "cell_type": "code",
403
+ "execution_count": null,
404
+ "id": "19",
405
+ "metadata": {},
406
+ "outputs": [],
407
+ "source": [
408
+ "# Defining feature extraction functions (optimized)\n",
409
+ "def calculate_readability(text):\n",
410
+ " return textstat.flesch_reading_ease(text)\n",
411
+ "\n",
412
+ "def lexical_diversity(text):\n",
413
+ " words = nltk.word_tokenize(text)\n",
414
+ " return len(set(words)) / len(words) if len(words) > 0 else 0\n",
415
+ "\n",
416
+ "def sentence_length(text):\n",
417
+ " sentences = nltk.sent_tokenize(text)\n",
418
+ " return sum(len(nltk.word_tokenize(sent)) for sent in sentences) / len(sentences) if len(sentences) > 0 else 0"
419
+ ]
420
+ },
421
+ {
422
+ "cell_type": "code",
423
+ "execution_count": null,
424
+ "id": "20",
425
+ "metadata": {},
426
+ "outputs": [],
427
+ "source": [
428
+ "# Apply feature extraction\n",
429
+ "print(\"Extracting features... (This may take some time)\")\n",
430
+ "df_train['readability'] = df_train['text'].apply(calculate_readability)\n",
431
+ "df_train['lexical_diversity'] = df_train['text'].apply(lexical_diversity)\n",
432
+ "df_train['sentence_length'] = df_train['text'].apply(sentence_length)\n",
433
+ "\n",
434
+ "df_test['readability'] = df_test['text'].apply(calculate_readability)\n",
435
+ "df_test['lexical_diversity'] = df_test['text'].apply(lexical_diversity)\n",
436
+ "df_test['sentence_length'] = df_test['text'].apply(sentence_length)\n"
437
+ ]
438
+ },
439
+ {
440
+ "cell_type": "code",
441
+ "execution_count": null,
442
+ "id": "21",
443
+ "metadata": {},
444
+ "outputs": [],
445
+ "source": [
446
+ "#Initialize TF-IDF Vectorizer (TfidfVectorizer accepts no n_jobs argument)\n",
447
+ "vectorizer = TfidfVectorizer(max_features=5000) \n",
448
+ "X_train_tfidf = vectorizer.fit_transform(df_train['text'])\n",
449
+ "X_test_tfidf = vectorizer.transform(df_test['text'])"
450
+ ]
451
+ },
452
+ {
453
+ "cell_type": "code",
454
+ "execution_count": null,
455
+ "id": "22",
456
+ "metadata": {},
457
+ "outputs": [],
458
+ "source": [
459
+ "# Stack Sparse Matrices for Final Features\n",
460
+ "X_train = hstack((X_train_tfidf, df_train[['readability', 'lexical_diversity', 'sentence_length']].values))\n",
461
+ "X_test = hstack((X_test_tfidf, df_test[['readability', 'lexical_diversity', 'sentence_length']].values))\n"
462
+ ]
463
+ },
464
+ {
465
+ "cell_type": "code",
466
+ "execution_count": null,
467
+ "id": "23",
468
+ "metadata": {},
469
+ "outputs": [],
470
+ "source": [
471
+ "#Defining Train Test Dataset\n",
472
+ "y_train = df_train['generated']\n",
473
+ "y_test = df_test['generated']"
474
+ ]
475
+ },
476
+ {
477
+ "cell_type": "code",
478
+ "execution_count": null,
479
+ "id": "24",
480
+ "metadata": {},
481
+ "outputs": [],
482
+ "source": [
483
+ "# Initialize Model with Multi-core Processing\n",
484
+ "model = SGDClassifier(loss='log_loss', max_iter=1000, n_jobs=-1)"
485
+ ]
486
+ },
487
+ {
488
+ "cell_type": "code",
489
+ "execution_count": null,
490
+ "id": "25",
491
+ "metadata": {},
492
+ "outputs": [],
493
+ "source": [
494
+ "# Training the Model\n",
495
+ "start_time = time.time()\n",
496
+ "print(\"\\n🚀 Training Model...\")\n",
497
+ "\n",
498
+ "model.fit(X_train, y_train)\n",
499
+ "\n",
500
+ "training_time = time.time() - start_time"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "code",
505
+ "execution_count": null,
506
+ "id": "26",
507
+ "metadata": {},
508
+ "outputs": [],
509
+ "source": [
510
+ "# Evaluate Model\n",
511
+ "y_pred = model.predict(X_test)\n",
512
+ "accuracy = accuracy_score(y_test, y_pred)\n",
513
+ "print(f\"\\n✅ Training Completed in {training_time:.2f} sec - Accuracy: {accuracy:.4f}\")"
514
+ ]
515
+ },
516
+ {
517
+ "cell_type": "code",
518
+ "execution_count": null,
519
+ "id": "27",
520
+ "metadata": {},
521
+ "outputs": [],
522
+ "source": [
523
+ "# Log Metrics to W&B(Optional)\n",
524
+ "wandb.log({\n",
525
+ " \"training_time\": training_time,\n",
526
+ " \"accuracy\": accuracy,\n",
527
+ " \"class_0_train\": (y_train == 0).sum(),\n",
528
+ " \"class_1_train\": (y_train == 1).sum(),\n",
529
+ " \"class_0_test\": (y_test == 0).sum(),\n",
530
+ " \"class_1_test\": (y_test == 1).sum(),\n",
531
+ "})\n",
532
+ "wandb.finish()"
533
+ ]
534
+ },
535
+ {
536
+ "cell_type": "code",
537
+ "execution_count": null,
538
+ "id": "28",
539
+ "metadata": {
540
+ "colab": {
541
+ "base_uri": "https://localhost:8080/"
542
+ },
543
+ "id": "6217f203-31b6-45c6-b829-c04fa4696fe8",
544
+ "outputId": "59dda091-1380-4ab6-d910-8d22f8152e57"
545
+ },
546
+ "outputs": [],
547
+ "source": [
548
+ "\n",
549
+ "# Save Model\n",
550
+ "joblib.dump(model, 'ai_detector_model.pkl')\n",
551
+ "joblib.dump(vectorizer, 'vectorizer.pkl')\n",
552
+ "\n",
553
+ "print(\"\\n🎉 Model training completed and saved!\")"
554
+ ]
555
+ }
556
+ ],
557
+ "metadata": {
558
+ "accelerator": "TPU",
559
+ "colab": {
560
+ "gpuType": "V28",
561
+ "provenance": []
562
+ },
563
+ "kernelspec": {
564
+ "display_name": "venv",
565
+ "language": "python",
566
+ "name": "python3"
567
+ },
568
+ "language_info": {
569
+ "codemirror_mode": {
570
+ "name": "ipython",
571
+ "version": 3
572
+ },
573
+ "file_extension": ".py",
574
+ "mimetype": "text/x-python",
575
+ "name": "python",
576
+ "nbconvert_exporter": "python",
577
+ "pygments_lexer": "ipython3",
578
+ "version": "3.11.4"
579
+ }
580
+ },
581
+ "nbformat": 4,
582
+ "nbformat_minor": 5
583
+ }