# Hugging Face Spaces page residue (author "reysarms", commit d23d393 "updated environment")
# — metadata from the hosting UI, not part of the application code.
import streamlit as st
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
def load_data():
    """Load the Kaggle Twitter-sentiment CSV and return tweet/sentiment pairs.

    Reads ``twitter_training.csv`` (no header row), labels the four columns,
    keeps only the ``tweet`` and ``sentiment`` columns, and drops rows with
    missing values.
    """
    frame = pd.read_csv(
        "twitter_training.csv",
        header=None,
        names=["id", "entity", "sentiment", "tweet"],  # file ships without a header
        encoding="utf-8",
    )
    # Only the text and its label matter downstream; discard incomplete rows.
    return frame[["tweet", "sentiment"]].dropna()
# Materialize the dataset at module scope so the steps below can use it.
# NOTE(review): Streamlit re-executes the whole script per interaction, so this
# presumably re-reads the CSV on every rerun — consider st.cache_data; confirm.
data = load_data()
# Preprocess text
def clean_text(text):
    """Normalize a tweet for TF-IDF: drop URLs, keep letters, lowercase.

    Non-letter characters are replaced with a space (not deleted) so that
    punctuation-separated words do not fuse into one token, e.g.
    "good,bad" becomes "good bad" rather than "goodbad".  Runs of
    whitespace (including gaps left by URL removal) are collapsed.
    """
    text = re.sub(r"http\S+|www\S+", "", text)  # strip URLs first
    # Substitute a space so adjacent words stay separate tokens.
    text = re.sub(r"[^a-zA-Z ]", " ", text)
    # split()/join collapses repeated spaces and trims the ends.
    return " ".join(text.lower().split())
# Normalize every tweet with clean_text so the vectorizer sees uniform text.
data['clean_text'] = data['tweet'].apply(clean_text)
# Encode the string sentiment labels as integers for the classifier.
# label_encoder is kept at module scope so predictions can be decoded later
# with inverse_transform.
label_encoder = LabelEncoder()
data['sentiment_encoded'] = label_encoder.fit_transform(data['sentiment'])
# Train Random Forest model
def train_model(df=None):
    """Fit a TF-IDF + RandomForest pipeline on cleaned tweets.

    Parameters
    ----------
    df : DataFrame, optional
        Must contain 'clean_text' and 'sentiment_encoded' columns.  Defaults
        to the module-level ``data`` (backward compatible with the original
        zero-argument call), but accepting it as a parameter lets callers
        train on any prepared dataset.

    Returns
    -------
    sklearn Pipeline of TfidfVectorizer -> RandomForestClassifier.
    """
    if df is None:
        df = data  # fall back to the globally loaded dataset
    # NOTE(review): the held-out split is never evaluated here — X_test/y_test
    # are unused; the split only shrinks the training set. Kept for identical
    # training behavior (same random_state => same split).
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_text'], df['sentiment_encoded'],
        test_size=0.2, random_state=42)
    pipeline = make_pipeline(
        TfidfVectorizer(),
        RandomForestClassifier(n_estimators=100, random_state=42),
    )
    pipeline.fit(X_train, y_train)
    return pipeline
# NOTE(review): Streamlit reruns this whole script on every widget interaction,
# so the forest is presumably retrained per click — consider wrapping in
# st.cache_resource; confirm against the deployed Streamlit version.
model = train_model()

# Streamlit UI
st.title("📢 Twitter Sentiment Analysis with Random Forest")
st.write("Enter a tweet to analyze its sentiment!")

# User input
tweet_input = st.text_area("Enter Tweet:")
if st.button("Analyze Sentiment"):
    # Guard: predicting on an empty/whitespace string yields a meaningless
    # result, so ask for input instead.
    if not tweet_input.strip():
        st.warning("Please enter a tweet before analyzing.")
    else:
        cleaned_tweet = clean_text(tweet_input)
        prediction = model.predict([cleaned_tweet])[0]
        # Decode the integer class back to its original sentiment string.
        sentiment_result = label_encoder.inverse_transform([prediction])[0]
        st.success(f"Predicted Sentiment: {sentiment_result}")

st.write("Dataset: [Twitter Entity Sentiment Analysis](https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis/data)")