Spaces:
Sleeping
Sleeping
Regino
committed on
Commit
·
045e883
1
Parent(s):
3f37371
jlsdndfnds
Browse files- Train Model.ipynb +17 -20
Train Model.ipynb
CHANGED
|
@@ -9,7 +9,7 @@
|
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"cell_type": "code",
|
| 12 |
-
"execution_count":
|
| 13 |
"metadata": {},
|
| 14 |
"outputs": [
|
| 15 |
{
|
|
@@ -33,24 +33,22 @@
|
|
| 33 |
}
|
| 34 |
],
|
| 35 |
"source": [
|
| 36 |
-
"import pandas as pd
|
| 37 |
"\n",
|
| 38 |
-
"
|
| 39 |
-
"
|
| 40 |
-
"\n",
|
| 41 |
-
"# Load training dataset\n",
|
| 42 |
"train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
|
| 43 |
"\n",
|
| 44 |
-
"#
|
| 45 |
"test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
|
| 46 |
"\n",
|
| 47 |
-
"
|
| 48 |
"print(train_df.head())\n"
|
| 49 |
]
|
| 50 |
},
|
| 51 |
{
|
| 52 |
"cell_type": "code",
|
| 53 |
-
"execution_count":
|
| 54 |
"metadata": {},
|
| 55 |
"outputs": [
|
| 56 |
{
|
|
@@ -149,11 +147,11 @@
|
|
| 149 |
"import nltk\n",
|
| 150 |
"from nltk.corpus import stopwords\n",
|
| 151 |
"\n",
|
| 152 |
-
"#
|
| 153 |
"nltk.download(\"stopwords\")\n",
|
| 154 |
"stop_words = set(stopwords.words(\"english\"))\n",
|
| 155 |
"\n",
|
| 156 |
-
"#
|
| 157 |
"def preprocess_text(text):\n",
|
| 158 |
" if isinstance(text, float): # Handle missing values\n",
|
| 159 |
" return \"\"\n",
|
|
@@ -168,14 +166,13 @@
|
|
| 168 |
"train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
|
| 169 |
"test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
|
| 170 |
"\n",
|
| 171 |
-
"# Display a sample of the cleaned text\n",
|
| 172 |
"print(\"Sample cleaned text:\")\n",
|
| 173 |
"display(train_df[[\"text\", \"clean_text\"]].head())\n"
|
| 174 |
]
|
| 175 |
},
|
| 176 |
{
|
| 177 |
"cell_type": "code",
|
| 178 |
-
"execution_count":
|
| 179 |
"metadata": {},
|
| 180 |
"outputs": [
|
| 181 |
{
|
|
@@ -191,14 +188,14 @@
|
|
| 191 |
"source": [
|
| 192 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 193 |
"\n",
|
| 194 |
-
"#
|
| 195 |
-
"vectorizer = TfidfVectorizer(max_features=5000)
|
| 196 |
"\n",
|
| 197 |
"# Fit and transform training data, then transform test data\n",
|
| 198 |
"X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
|
| 199 |
"X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
|
| 200 |
"\n",
|
| 201 |
-
"# Extract labels
|
| 202 |
"y_train = train_df[\"label\"]\n",
|
| 203 |
"y_test = test_df[\"label\"]\n",
|
| 204 |
"\n",
|
|
@@ -209,7 +206,7 @@
|
|
| 209 |
},
|
| 210 |
{
|
| 211 |
"cell_type": "code",
|
| 212 |
-
"execution_count":
|
| 213 |
"metadata": {},
|
| 214 |
"outputs": [
|
| 215 |
{
|
|
@@ -237,11 +234,11 @@
|
|
| 237 |
"from sklearn.linear_model import LogisticRegression\n",
|
| 238 |
"from sklearn.metrics import accuracy_score, classification_report\n",
|
| 239 |
"\n",
|
| 240 |
-
"#
|
| 241 |
-
"model = LogisticRegression(max_iter=1000)
|
| 242 |
"model.fit(X_train, y_train)\n",
|
| 243 |
"\n",
|
| 244 |
-
"# Make predictions
|
| 245 |
"y_pred = model.predict(X_test)\n",
|
| 246 |
"\n",
|
| 247 |
"# Evaluate the model\n",
|
|
|
|
| 9 |
},
|
| 10 |
{
|
| 11 |
"cell_type": "code",
|
| 12 |
+
"execution_count": null,
|
| 13 |
"metadata": {},
|
| 14 |
"outputs": [
|
| 15 |
{
|
|
|
|
| 33 |
}
|
| 34 |
],
|
| 35 |
"source": [
|
| 36 |
+
"import pandas as pd \n",
|
| 37 |
"\n",
|
| 38 |
+
"column_names = ['id',\"place\",\"label\", \"text\"]\n",
|
| 39 |
+
"#Train Dataset\n",
|
|
|
|
|
|
|
| 40 |
"train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
|
| 41 |
"\n",
|
| 42 |
+
"#Test Dataset\n",
|
| 43 |
"test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
|
| 44 |
"\n",
|
| 45 |
+
"\n",
|
| 46 |
"print(train_df.head())\n"
|
| 47 |
]
|
| 48 |
},
|
| 49 |
{
|
| 50 |
"cell_type": "code",
|
| 51 |
+
"execution_count": null,
|
| 52 |
"metadata": {},
|
| 53 |
"outputs": [
|
| 54 |
{
|
|
|
|
| 147 |
"import nltk\n",
|
| 148 |
"from nltk.corpus import stopwords\n",
|
| 149 |
"\n",
|
| 150 |
+
"# Stopwords\n",
|
| 151 |
"nltk.download(\"stopwords\")\n",
|
| 152 |
"stop_words = set(stopwords.words(\"english\"))\n",
|
| 153 |
"\n",
|
| 154 |
+
"# Clean Text\n",
|
| 155 |
"def preprocess_text(text):\n",
|
| 156 |
" if isinstance(text, float): # Handle missing values\n",
|
| 157 |
" return \"\"\n",
|
|
|
|
| 166 |
"train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
|
| 167 |
"test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
|
| 168 |
"\n",
|
|
|
|
| 169 |
"print(\"Sample cleaned text:\")\n",
|
| 170 |
"display(train_df[[\"text\", \"clean_text\"]].head())\n"
|
| 171 |
]
|
| 172 |
},
|
| 173 |
{
|
| 174 |
"cell_type": "code",
|
| 175 |
+
"execution_count": null,
|
| 176 |
"metadata": {},
|
| 177 |
"outputs": [
|
| 178 |
{
|
|
|
|
| 188 |
"source": [
|
| 189 |
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
| 190 |
"\n",
|
| 191 |
+
"# TF-IDF Vectorizer\n",
|
| 192 |
+
"vectorizer = TfidfVectorizer(max_features=5000)\n",
|
| 193 |
"\n",
|
| 194 |
"# Fit and transform training data, then transform test data\n",
|
| 195 |
"X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
|
| 196 |
"X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
|
| 197 |
"\n",
|
| 198 |
+
"# Extract labels\n",
|
| 199 |
"y_train = train_df[\"label\"]\n",
|
| 200 |
"y_test = test_df[\"label\"]\n",
|
| 201 |
"\n",
|
|
|
|
| 206 |
},
|
| 207 |
{
|
| 208 |
"cell_type": "code",
|
| 209 |
+
"execution_count": null,
|
| 210 |
"metadata": {},
|
| 211 |
"outputs": [
|
| 212 |
{
|
|
|
|
| 234 |
"from sklearn.linear_model import LogisticRegression\n",
|
| 235 |
"from sklearn.metrics import accuracy_score, classification_report\n",
|
| 236 |
"\n",
|
| 237 |
+
"# Train the model\n",
|
| 238 |
+
"model = LogisticRegression(max_iter=1000)\n",
|
| 239 |
"model.fit(X_train, y_train)\n",
|
| 240 |
"\n",
|
| 241 |
+
"# Make predictions\n",
|
| 242 |
"y_pred = model.predict(X_test)\n",
|
| 243 |
"\n",
|
| 244 |
"# Evaluate the model\n",
|