Update app.py
app.py CHANGED
@@ -19,10 +19,16 @@ from nltk.tokenize import word_tokenize
 from nltk.stem import WordNetLemmatizer
 import re
 import json
-import os
-import pickle
 from textblob import TextBlob
 
+# Page configuration
+st.set_page_config(
+    page_title="SentiMind Pro - Advanced Sentiment Analysis",
+    page_icon="📊",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+
 # Download necessary NLTK data
 try:
     nltk.data.find('tokenizers/punkt')
@@ -34,14 +40,6 @@ except LookupError:
     nltk.download('stopwords')
     nltk.download('wordnet')
 
-# Page configuration
-st.set_page_config(
-    page_title="SentiMind Pro - Advanced Sentiment Analysis",
-    page_icon="📊",
-    layout="wide",
-    initial_sidebar_state="expanded"
-)
-
 # Custom CSS
 st.markdown("""
 <style>
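Review note: this hunk and the first one together hoist `st.set_page_config(...)` from below the NLTK download block to directly after the imports. Streamlit requires `set_page_config` to be the first Streamlit command a script executes, so the move hardens against ordering bugs such as (hypothetical):

    # Under the old ordering, a Streamlit call sneaking into the download block,
    # e.g. st.warning("Fetching NLTK data..."), would run before
    # st.set_page_config() and make it raise StreamlitAPIException.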
@@ -99,8 +97,6 @@ if 'initialized' not in st.session_state:
     st.session_state.initialized = False
     st.session_state.user_input = ""
     st.session_state.analysis_done = False
-    st.session_state.historical_data = None
-    st.session_state.sentiment_models = {}
     st.session_state.historical_inputs = []
     st.session_state.historical_results = []
 
@@ -108,35 +104,28 @@ if 'initialized' not in st.session_state:
 
 def preprocess_text(text):
     """Preprocess text for sentiment analysis"""
-    # Convert to lowercase
     text = text.lower()
-    # Remove URLs
-    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
-    # Remove mentions and hashtags
-    text = re.sub(r'@\w+|#\w+', '', text)
-    # Remove punctuation
-    text = re.sub(r'[^\w\s]', '', text)
-    # Remove extra whitespace
-    text = re.sub(r'\s+', ' ', text).strip()
-
-    # Tokenize
-    tokens = word_tokenize(text)
+    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
+    text = re.sub(r'@\w+|#\w+', '', text)  # Remove mentions and hashtags
+    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
+    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
 
-    # Remove stopwords
+    tokens = word_tokenize(text)  # Tokenize
     stop_words = set(stopwords.words('english'))
-    tokens = [word for word in tokens if word not in stop_words]
+    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
 
-    # Lemmatize
     lemmatizer = WordNetLemmatizer()
-    tokens = [lemmatizer.lemmatize(word) for word in tokens]
+    tokens = [lemmatizer.lemmatize(word) for word in tokens]  # Lemmatize
 
     return ' '.join(tokens)
 
 def initialize_models():
     """Initialize sentiment analysis models with loading spinner"""
     with st.spinner('Initializing sentiment analysis models...'):
-
-
+        st.session_state.sentiment_models = {
+            'vader': SentimentIntensityAnalyzer(),
+            'textblob': TextBlob
+        }
 
         # BERT Sentiment Analysis
         try:
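Review note: the hunk above registers `SentimentIntensityAnalyzer()` under the `'vader'` key, but no matching import or lexicon download shows up in this diff. If they are not already present elsewhere in app.py, something along these lines would be needed (a sketch using NLTK's standard VADER API; placement assumed):

    from nltk.sentiment.vader import SentimentIntensityAnalyzer

    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        nltk.download('vader_lexicon')  # VADER's lexicon is a separate NLTK resource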
@@ -147,9 +136,6 @@ def initialize_models():
     except Exception as e:
         st.error(f"Error loading BERT model: {e}")
         st.session_state.sentiment_models['bert'] = pipeline("sentiment-analysis")
-
-        # TextBlob for additional analysis
-        st.session_state.sentiment_models['textblob'] = TextBlob
 
 def generate_sample_data():
     """Generate realistic sample data for demonstration"""
@@ -157,7 +143,6 @@ def generate_sample_data():
     start_date = end_date - timedelta(days=30)
     dates = pd.date_range(start=start_date, end=end_date, freq='D')
 
-    # Generate more realistic sentiment patterns
     weekday_effect = np.array([0.1 if d.weekday() >= 5 else 0 for d in dates])
     trend = np.linspace(-0.2, 0.3, len(dates))
     seasonal = np.array([-0.15 if d.weekday() == 0 else 0.05 if d.weekday() == 4 else 0 for d in dates])
@@ -178,54 +163,9 @@ def generate_sample_data():
 
     return df
 
-def train_prediction_models(df):
-    """Train multiple prediction models and return the best one"""
-    X = df.copy()
-    X['day_of_week'] = X['Date'].dt.dayofweek
-    X['day_of_month'] = X['Date'].dt.day
-    X['month'] = X['Date'].dt.month
-    X['trend'] = np.arange(len(X))
-
-    features = ['day_of_week', 'day_of_month', 'month', 'trend']
-    X_train = X[features].values
-    y_train = X['Sentiment Score'].values
-
-    models = {
-        'Linear Regression': LinearRegression(),
-        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
-    }
-
-    for name, model in models.items():
-        model.fit(X_train, y_train)
-
-    future_dates = pd.date_range(
-        start=df['Date'].max() + timedelta(days=1),
-        periods=14,
-        freq='D'
-    )
-
-    X_future = pd.DataFrame({
-        'Date': future_dates,
-        'day_of_week': future_dates.dayofweek,
-        'day_of_month': future_dates.day,
-        'month': future_dates.month,
-        'trend': np.arange(len(X_train), len(X_train) + len(future_dates))
-    })
-
-    predictions = {}
-    for name, model in models.items():
-        y_pred = model.predict(X_future[features].values)
-        predictions[name] = pd.DataFrame({
-            'Date': future_dates,
-            'Predicted Sentiment': np.clip(y_pred, -1, 1)
-        })
-
-    return models['Random Forest'], predictions
-
 def generate_wordcloud(text, sentiment_score):
     """Generate a wordcloud colored by sentiment"""
     text = preprocess_text(text)
-
     stopwords = set(STOPWORDS)
 
     def color_func(word, font_size, position, orientation, random_state=None, **kwargs):
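Review note: a quick sanity check of the consolidated `preprocess_text` (illustrative input and output; assumes the punkt/stopwords/wordnet resources above are installed):

    sample = "Check out https://example.com @user #awesome This product is amazing!!!"
    print(preprocess_text(sample))
    # -> "check product amazing"
    # The URL, mention, hashtag, and punctuation are stripped; "out", "this",
    # and "is" fall out as stopwords; the surviving tokens lemmatize to themselves.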
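Review note: with `train_prediction_models` deleted, the scikit-learn estimators it used are presumably orphaned. If nothing else in app.py references them, the matching imports (assumed to sit near the top of the file) can go in a follow-up commit:

    from sklearn.linear_model import LinearRegression      # likely unused now
    from sklearn.ensemble import RandomForestRegressor     # likely unused now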