File size: 2,919 Bytes
cf0cf5a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8101bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
    "from tensorflow.keras.models import Sequential\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dense\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "\n",
    "# Load the preprocessed data\n",
    "train_data = pd.read_csv(\"/Users/saish/Downloads/preprocessed_train_data.csv\")\n",
    "test_data = pd.read_csv(\"/Users/saish/Downloads/preprocessed_test_data.csv\")\n",
    "\n",
    "# Tokenize the text data\n",
    "tokenizer = Tokenizer()\n",
    "tokenizer.fit_on_texts(train_data['text'])\n",
    "\n",
    "train_sequences = tokenizer.texts_to_sequences(train_data['text'])\n",
    "test_sequences = tokenizer.texts_to_sequences(test_data['text'])\n",
    "\n",
    "# Pad sequences to ensure uniform length\n",
    "max_length = max(len(seq) for seq in train_sequences)\n",
    "train_sequences = pad_sequences(train_sequences, maxlen=max_length)\n",
    "test_sequences = pad_sequences(test_sequences, maxlen=max_length)\n",
    "\n",
    "# Encode sentiment labels\n",
    "label_encoder = LabelEncoder()\n",
    "train_labels = label_encoder.fit_transform(train_data['sentiment'])\n",
    "test_labels = label_encoder.transform(test_data['sentiment'])\n",
    "\n",
    "# Define and compile the model\n",
    "model = Sequential()\n",
    "model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))\n",
    "model.add(LSTM(units=128))\n",
    "model.add(Dense(units=len(label_encoder.classes_), activation='softmax'))\n",
    "\n",
    "model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])\n",
    "\n",
    "# Train the model\n",
    "model.fit(train_sequences, train_labels, epochs=3, batch_size=16, validation_split=0.2)\n",
    "\n",
    "# Evaluate the model\n",
    "test_loss, test_accuracy = model.evaluate(test_sequences, test_labels)\n",
    "print(f'Test Accuracy: {test_accuracy}')\n",
    "\n",
    "# Save the trained model\n",
    "model.save(\"/Users/saish/Downloads/sentitensor1.keras\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}