Regino committed · 0e876c8
1 Parent(s): 7ef995c
first commit

Files changed:
- Train Model.ipynb +303 -0
- app.py +154 -0
- confusion_matrix.png +0 -0
- requirements.txt +8 -0
- sentiment_distribution.png +0 -0
- sentiment_model.pkl +3 -0
- tfidf_vectorizer.pkl +3 -0
- twitter_training.csv +0 -0
- twitter_validation.csv +0 -0
Train Model.ipynb
ADDED
@@ -0,0 +1,303 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dataset from Hugging Face"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "     id        place     label  \\\n",
      "0  2401  Borderlands  Positive   \n",
      "1  2401  Borderlands  Positive   \n",
      "2  2401  Borderlands  Positive   \n",
      "3  2401  Borderlands  Positive   \n",
      "4  2401  Borderlands  Positive   \n",
      "\n",
      "                                                text  \n",
      "0  im getting on borderlands and i will murder yo...  \n",
      "1  I am coming to the borders and I will kill you...  \n",
      "2  im getting on borderlands and i will kill you ...  \n",
      "3  im coming on borderlands and i will murder you...  \n",
      "4  im getting on borderlands 2 and i will murder ...  \n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Define column names manually\n",
    "column_names = [\"id\", \"place\", \"label\", \"text\"]  # Change this based on your dataset\n",
    "\n",
    "# Load training dataset\n",
    "train_df = pd.read_csv(\"twitter_training.csv\", names=column_names, header=None)\n",
    "\n",
    "# Load test dataset\n",
    "test_df = pd.read_csv(\"twitter_validation.csv\", names=column_names, header=None)\n",
    "\n",
    "# Display first few rows\n",
    "print(train_df.head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package stopwords to C:\\Users\\Regino Balogo\n",
      "[nltk_data]     Jr\\AppData\\Roaming\\nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Sample cleaned text:\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>clean_text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>im getting on borderlands and i will murder yo...</td>\n",
       "      <td>im getting borderlands murder</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>I am coming to the borders and I will kill you...</td>\n",
       "      <td>coming borders kill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>im getting on borderlands and i will kill you ...</td>\n",
       "      <td>im getting borderlands kill</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>im coming on borderlands and i will murder you...</td>\n",
       "      <td>im coming borderlands murder</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>im getting on borderlands 2 and i will murder ...</td>\n",
       "      <td>im getting borderlands 2 murder</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text  \\\n",
       "0  im getting on borderlands and i will murder yo...   \n",
       "1  I am coming to the borders and I will kill you...   \n",
       "2  im getting on borderlands and i will kill you ...   \n",
       "3  im coming on borderlands and i will murder you...   \n",
       "4  im getting on borderlands 2 and i will murder ...   \n",
       "\n",
       "                        clean_text  \n",
       "0    im getting borderlands murder  \n",
       "1              coming borders kill  \n",
       "2      im getting borderlands kill  \n",
       "3     im coming borderlands murder  \n",
       "4  im getting borderlands 2 murder  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import re\n",
    "import nltk\n",
    "from nltk.corpus import stopwords\n",
    "\n",
    "# Download stopwords if not already downloaded\n",
    "nltk.download(\"stopwords\")\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "\n",
    "# Function to clean text\n",
    "def preprocess_text(text):\n",
    "    if isinstance(text, float):  # Handle missing values\n",
    "        return \"\"\n",
    "\n",
    "    text = text.lower()  # Convert to lowercase\n",
    "    text = re.sub(r\"\\W\", \" \", text)  # Remove special characters\n",
    "    text = re.sub(r\"\\s+\", \" \", text).strip()  # Remove extra spaces\n",
    "    text = \" \".join([word for word in text.split() if word not in stop_words])  # Remove stopwords\n",
    "    return text\n",
    "\n",
    "# Apply preprocessing to the text column\n",
    "train_df[\"clean_text\"] = train_df[\"text\"].apply(preprocess_text)\n",
    "test_df[\"clean_text\"] = test_df[\"text\"].apply(preprocess_text)\n",
    "\n",
    "# Display a sample of the cleaned text\n",
    "print(\"Sample cleaned text:\")\n",
    "display(train_df[[\"text\", \"clean_text\"]].head())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "TF-IDF vectorization complete! ✅\n",
      "Training data shape: (74682, 5000)\n",
      "Testing data shape: (1000, 5000)\n"
     ]
    }
   ],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "# Initialize TF-IDF Vectorizer\n",
    "vectorizer = TfidfVectorizer(max_features=5000)  # Limit to the 5000 most important words\n",
    "\n",
    "# Fit and transform training data, then transform test data\n",
    "X_train = vectorizer.fit_transform(train_df[\"clean_text\"])\n",
    "X_test = vectorizer.transform(test_df[\"clean_text\"])\n",
    "\n",
    "# Extract labels (the sentiment column is named \"label\")\n",
    "y_train = train_df[\"label\"]\n",
    "y_test = test_df[\"label\"]\n",
    "\n",
    "print(\"TF-IDF vectorization complete! ✅\")\n",
    "print(f\"Training data shape: {X_train.shape}\")\n",
    "print(f\"Testing data shape: {X_test.shape}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model Accuracy: 0.8120\n",
      "\n",
      "Classification Report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "  Irrelevant       0.82      0.73      0.77       172\n",
      "    Negative       0.78      0.89      0.83       266\n",
      "     Neutral       0.85      0.76      0.80       285\n",
      "    Positive       0.81      0.84      0.82       277\n",
      "\n",
      "    accuracy                           0.81      1000\n",
      "   macro avg       0.81      0.81      0.81      1000\n",
      "weighted avg       0.81      0.81      0.81      1000\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "\n",
    "# Initialize and train the model\n",
    "model = LogisticRegression(max_iter=1000)  # Increase iterations to ensure convergence\n",
    "model.fit(X_train, y_train)\n",
    "\n",
    "# Make predictions on the test set\n",
    "y_pred = model.predict(X_test)\n",
    "\n",
    "# Evaluate the model\n",
    "accuracy = accuracy_score(y_test, y_pred)\n",
    "print(f\"Model Accuracy: {accuracy:.4f}\")\n",
    "\n",
    "# Display classification report\n",
    "print(\"\\nClassification Report:\")\n",
    "print(classification_report(y_test, y_pred))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model and vectorizer saved successfully! ✅\n"
     ]
    }
   ],
   "source": [
    "import joblib\n",
    "\n",
    "# Save the trained model\n",
    "joblib.dump(model, \"sentiment_model.pkl\")\n",
    "\n",
    "# Save the TF-IDF vectorizer\n",
    "joblib.dump(vectorizer, \"tfidf_vectorizer.pkl\")\n",
    "\n",
    "print(\"Model and vectorizer saved successfully! ✅\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
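A quick way to sanity-check the artifacts this notebook writes is to reload them in a fresh Python session. The sketch below is illustrative, not part of the commit: it assumes sentiment_model.pkl and tfidf_vectorizer.pkl sit in the working directory, and it repeats the notebook's cleaning steps so the input matches what the TF-IDF vectorizer saw during training. The sample sentence is row 0 of the training data; the predicted label is whatever the model returns.

import re

import joblib
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords", quiet=True)
stop_words = set(stopwords.words("english"))

# Artifacts written by the last notebook cell (assumed present locally)
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

def preprocess_text(text):
    # Same cleaning as the notebook: lowercase, drop non-word characters,
    # collapse whitespace, remove English stopwords.
    text = str(text).lower()
    text = re.sub(r"\W", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return " ".join(w for w in text.split() if w not in stop_words)

sample = "im getting on borderlands and i will murder you all"
features = vectorizer.transform([preprocess_text(sample)])
print(model.predict(features)[0])  # one of: Positive, Neutral, Negative, Irrelevant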
app.py
ADDED
@@ -0,0 +1,154 @@
import joblib
import streamlit as st
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download stopwords if not already available
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Load the trained model and TF-IDF vectorizer
model = joblib.load("sentiment_model.pkl")
vectorizer = joblib.load("tfidf_vectorizer.pkl")

# Load dataset with manually defined headers
column_names = ["id", "place", "label", "text"]
df = pd.read_csv("twitter_training.csv", names=column_names, header=None)

# Function to preprocess text
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r"\W", " ", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    text = " ".join([word for word in text.split() if word not in stop_words])  # Remove stopwords
    return text

# Load test dataset and compute model metrics
try:
    test_df = pd.read_csv("twitter_validation.csv", names=column_names, header=None)
    # Clean the test text the same way the training text was cleaned,
    # so the metrics match the model's training-time evaluation
    X_test = vectorizer.transform(test_df["text"].astype(str).apply(preprocess_text))
    y_test = test_df["label"]
    y_pred = model.predict(X_test)

    # Model metrics
    accuracy = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)
    class_report_df = pd.DataFrame(report_dict).T.round(2)

    # Compute confusion matrix (the Irrelevant class is left out of this view)
    cm = confusion_matrix(y_test, y_pred, labels=["Positive", "Neutral", "Negative"])

except Exception:
    # Metrics are optional; the UI falls back to warnings below
    accuracy = None
    class_report_df = None
    cm = None

# Function to predict sentiment
def predict_sentiment(user_input):
    cleaned_text = preprocess_text(user_input)
    text_vector = vectorizer.transform([cleaned_text])
    prediction = model.predict(text_vector)[0]
    return prediction

# Sidebar Navigation
st.sidebar.title("📊 Sentiment Analysis App")
st.sidebar.markdown(
    "This app performs **Sentiment Analysis** on text using **Machine Learning**. "
    "It classifies text as **Positive, Neutral, Negative, or Irrelevant** based on its sentiment."
)

st.sidebar.header("📌 Navigation")
page = st.sidebar.radio(
    "Go to:",
    ["📄 Dataset", "📊 Visualizations", "📈 Model Metrics", "🤖 Sentiment Predictor"]
)

# App Title and Explanation
st.title("🐦 Twitter Sentiment Analysis")
st.markdown(
    "This application uses **Natural Language Processing (NLP)** and "
    "**Logistic Regression** to analyze the sentiment of tweets. The model is trained on a dataset "
    "of tweets labeled as **Positive, Neutral, Negative, or Irrelevant**."
)

# 📄 Dataset Page
if page == "📄 Dataset":
    st.header("📄 Dataset Preview")
    st.markdown("### Displaying Rows **50-55** from the Training Data:")
    st.dataframe(df.iloc[49:55])

# 📊 Visualization Page
elif page == "📊 Visualizations":
    st.header("📊 Data Visualizations")

    # Pie Chart of Sentiments
    st.subheader("🥧 Sentiment Distribution")
    fig, ax = plt.subplots(figsize=(5, 5))
    df["label"].value_counts().plot(kind="pie", autopct="%1.1f%%", colors=["green", "gray", "red", "blue"], ax=ax)
    plt.title("Sentiment Distribution")
    plt.ylabel("")
    st.pyplot(fig)

    # Bar Chart of Sentiment Counts
    st.subheader("📊 Sentiment Count (Bar Chart)")
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(x=df["label"], palette={"Positive": "green", "Neutral": "gray", "Negative": "red", "Irrelevant": "blue"}, ax=ax)
    plt.xlabel("Sentiment Type")
    plt.ylabel("Count")
    plt.title("Distribution of Sentiments")
    st.pyplot(fig)

    # Word Cloud for Most Frequent Words
    st.subheader("☁️ Word Cloud of Most Common Words")
    text_data = " ".join(df["text"].astype(str))
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(text_data)
    fig, ax = plt.subplots(figsize=(8, 4))
    ax.imshow(wordcloud, interpolation="bilinear")
    ax.axis("off")
    st.pyplot(fig)

# 📈 Model Metrics Page
elif page == "📈 Model Metrics":
    st.header("📈 Model Performance")

    if accuracy is not None:
        st.write(f"✅ **Accuracy:** {accuracy * 100:.2f}%")
    else:
        st.warning("⚠️ Could not calculate accuracy. Please check the test dataset.")

    if class_report_df is not None and not class_report_df.empty:
        st.subheader("📋 Classification Report")
        st.dataframe(class_report_df)
    else:
        st.warning("⚠️ Classification report is empty.")

    if cm is not None and cm.any():
        st.subheader("🔥 Confusion Matrix")
        fig, ax = plt.subplots(figsize=(6, 5))
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Positive", "Neutral", "Negative"], yticklabels=["Positive", "Neutral", "Negative"], ax=ax)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        st.pyplot(fig)
    else:
        st.warning("⚠️ Confusion matrix could not be generated.")

# 🤖 Sentiment Predictor Page
elif page == "🤖 Sentiment Predictor":
    st.header("🤖 Sentiment Analysis")
    st.markdown("Enter a sentence below, and the model will predict whether it is **Positive, Neutral, Negative, or Irrelevant**.")

    user_input = st.text_area("Type your sentence here:", "")

    if st.button("Analyze Sentiment"):
        if user_input.strip():
            sentiment_result = predict_sentiment(user_input)
            st.markdown(f"### 🔍 Prediction: **{sentiment_result}**")
        else:
            st.warning("Please enter some text to analyze.")
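On a Hugging Face Space this file is launched automatically; to try it locally, one would presumably install the dependencies and start Streamlit from the repo root with `pip install -r requirements.txt` followed by `streamlit run app.py`. The script expects sentiment_model.pkl, tfidf_vectorizer.pkl, twitter_training.csv, and twitter_validation.csv to sit beside it; if loading the validation set fails, the Model Metrics page falls back to the warning branches above rather than crashing the app.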
confusion_matrix.png
ADDED
requirements.txt
ADDED
@@ -0,0 +1,8 @@
streamlit
joblib
pandas
nltk
matplotlib
seaborn
wordcloud
scikit-learn
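Every dependency here is unpinned, so each Space rebuild installs the latest release of all eight packages. Because the two .pkl artifacts were pickled under a specific scikit-learn version, an unpinned scikit-learn is the most likely future breakage point for joblib.load; pinning versions (e.g. `scikit-learn==<the training version>` — the exact version is not recorded in this commit) would keep rebuilds reproducible.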
sentiment_distribution.png
ADDED
sentiment_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5061ba50ae5dfc7b3f1415eade952be7b8764ade9d1945e2ec27f5ad85e63092
size 161127
tfidf_vectorizer.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:24722296250083368688b553d01fb5b3723364fea155b7d64820200e681c149f
size 181291
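Both .pkl entries are stored through Git LFS, so what the repository actually tracks is the three-line pointer shown above (spec version, sha256 oid, and byte size) rather than the binary itself. A clone made without LFS support gets only these stubs, and joblib.load will fail on them; running `git lfs install` once and then `git lfs pull` inside the clone fetches the real ~160 KB model and ~180 KB vectorizer.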
twitter_training.csv
ADDED
The diff for this file is too large to render; see the raw diff.
twitter_validation.csv
ADDED
The diff for this file is too large to render; see the raw diff.