"""Streamlit app: tweet sentiment classification with TF-IDF + Random Forest.

Trains on the Kaggle "Twitter Entity Sentiment Analysis" dataset
(twitter_training.csv, expected in the working directory) and lets the
user classify a free-text tweet.
"""

import re

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Precompiled once at import time instead of on every clean_text() call
# (clean_text runs per DataFrame row via .apply, so this matters).
_URL_RE = re.compile(r"http\S+|www\S+")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z ]")


@st.cache_data
def load_data():
    """Load and trim the Kaggle training CSV.

    Returns:
        DataFrame with columns ``tweet`` and ``sentiment``; rows with
        missing values are dropped.

    Cached with ``st.cache_data`` so the file is parsed once per session,
    not on every Streamlit rerun.
    """
    data = pd.read_csv("twitter_training.csv", header=None, encoding="utf-8")
    data.columns = ["id", "entity", "sentiment", "tweet"]  # Rename columns
    data = data[["tweet", "sentiment"]]  # Keep only relevant columns
    data.dropna(inplace=True)
    return data


def clean_text(text):
    """Normalize a tweet: strip URLs, keep only ASCII letters/spaces, lowercase."""
    text = _URL_RE.sub("", text)  # Remove URLs
    text = _NON_LETTER_RE.sub("", text)  # Keep only letters and spaces
    return text.lower().strip()


data = load_data()
data["clean_text"] = data["tweet"].apply(clean_text)

# Encode the sentiment labels as integer class ids for the classifier.
label_encoder = LabelEncoder()
data["sentiment_encoded"] = label_encoder.fit_transform(data["sentiment"])


@st.cache_resource
def train_model():
    """Fit a TF-IDF + RandomForest pipeline on the cleaned tweets.

    Returns:
        Fitted sklearn Pipeline mapping raw cleaned text to class ids.

    Cached with ``st.cache_resource`` so the expensive fit happens once
    per session rather than on every widget interaction.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["clean_text"],
        data["sentiment_encoded"],
        test_size=0.2,
        random_state=42,
    )
    pipeline = make_pipeline(
        TfidfVectorizer(),
        RandomForestClassifier(n_estimators=100, random_state=42),
    )
    pipeline.fit(X_train, y_train)
    return pipeline


model = train_model()

# ---------------- Streamlit UI ----------------
st.title("📢 Twitter Sentiment Analysis with Random Forest")
st.write("Enter a tweet to analyze its sentiment!")

tweet_input = st.text_area("Enter Tweet:")

if st.button("Analyze Sentiment"):
    cleaned_tweet = clean_text(tweet_input)
    if not cleaned_tweet:
        # Guard: an empty or URL-only tweet cleans to "", which would
        # produce a meaningless prediction.
        st.warning("Please enter some text to analyze.")
    else:
        prediction = model.predict([cleaned_tweet])[0]
        sentiment_result = label_encoder.inverse_transform([prediction])[0]
        st.success(f"Predicted Sentiment: {sentiment_result}")

st.write(
    "Dataset: [Twitter Entity Sentiment Analysis]"
    "(https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis/data)"
)