"""Streamlit app: tweet sentiment classification with TF-IDF + Random Forest.

Trains on the Kaggle "Twitter Entity Sentiment Analysis" dataset
(twitter_training.csv, expected in the working directory) and lets the
user classify a free-text tweet.
"""

import re

import numpy as np
import pandas as pd
import streamlit as st
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder

# Precompiled once at import time instead of on every clean_text() call
# (clean_text runs per DataFrame row via .apply, so this matters).
_URL_RE = re.compile(r"http\S+|www\S+")
_NON_LETTER_RE = re.compile(r"[^a-zA-Z ]")


@st.cache_data
def load_data():
    """Load and trim the Kaggle training CSV.

    Returns:
        DataFrame with columns ``tweet`` and ``sentiment``; rows with
        missing values are dropped.

    Cached with ``st.cache_data`` so the file is parsed once per session,
    not on every Streamlit rerun.
    """
    data = pd.read_csv("twitter_training.csv", header=None, encoding="utf-8")
    data.columns = ["id", "entity", "sentiment", "tweet"]  # Rename columns
    data = data[["tweet", "sentiment"]]  # Keep only relevant columns
    data.dropna(inplace=True)
    return data


def clean_text(text):
    """Normalize a tweet: strip URLs, keep only ASCII letters/spaces, lowercase."""
    text = _URL_RE.sub("", text)  # Remove URLs
    text = _NON_LETTER_RE.sub("", text)  # Keep only letters and spaces
    return text.lower().strip()


data = load_data()
data["clean_text"] = data["tweet"].apply(clean_text)

# Encode the sentiment labels as integer class ids for the classifier.
label_encoder = LabelEncoder()
data["sentiment_encoded"] = label_encoder.fit_transform(data["sentiment"])


@st.cache_resource
def train_model():
    """Fit a TF-IDF + RandomForest pipeline on the cleaned tweets.

    Returns:
        Fitted sklearn Pipeline mapping raw cleaned text to class ids.

    Cached with ``st.cache_resource`` so the expensive fit happens once
    per session rather than on every widget interaction.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data["clean_text"],
        data["sentiment_encoded"],
        test_size=0.2,
        random_state=42,
    )
    pipeline = make_pipeline(
        TfidfVectorizer(),
        RandomForestClassifier(n_estimators=100, random_state=42),
    )
    pipeline.fit(X_train, y_train)
    return pipeline


model = train_model()

# ---------------- Streamlit UI ----------------
st.title("📢 Twitter Sentiment Analysis with Random Forest")
st.write("Enter a tweet to analyze its sentiment!")

tweet_input = st.text_area("Enter Tweet:")

if st.button("Analyze Sentiment"):
    cleaned_tweet = clean_text(tweet_input)
    if not cleaned_tweet:
        # Guard: an empty or URL-only tweet cleans to "", which would
        # produce a meaningless prediction.
        st.warning("Please enter some text to analyze.")
    else:
        prediction = model.predict([cleaned_tweet])[0]
        sentiment_result = label_encoder.inverse_transform([prediction])[0]
        st.success(f"Predicted Sentiment: {sentiment_result}")

st.write(
    "Dataset: [Twitter Entity Sentiment Analysis]"
    "(https://www.kaggle.com/datasets/jp797498e/twitter-entity-sentiment-analysis/data)"
)