hchevva committed on
Commit
ee0bba8
·
verified ·
1 Parent(s): 7fd82d1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +363 -0
app.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import pandas as pd
8
+ import gradio as gr
9
+
10
+ import nltk
11
+ from nltk.corpus import stopwords
12
+ from nltk.stem import WordNetLemmatizer
13
+ from nltk.sentiment import SentimentIntensityAnalyzer
14
+
15
+ import matplotlib.pyplot as plt
16
+ import seaborn as sns
17
+ from wordcloud import WordCloud
18
+
19
+
20
+ # -----------------------------
21
+ # NLTK setup (downloads once)
22
+ # -----------------------------
23
_NLTK_READY = False


def ensure_nltk():
    """Fetch the NLTK resources this app uses, at most once per process.

    The module-level ``_NLTK_READY`` flag is set after the first pass so
    later calls return without contacting NLTK again.
    """
    global _NLTK_READY
    if not _NLTK_READY:
        # "punkt_tab" is requested alongside "punkt" because some
        # environments need it for tokenization.
        for resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "vader_lexicon"):
            nltk.download(resource, quiet=True)
        _NLTK_READY = True
38
+
39
+
40
+ # -----------------------------
41
+ # Text preprocessing (close to notebook intent)
42
+ # -----------------------------
43
def extract_comment_body(text: str) -> str:
    """Return the comment body, preferring the first ``[...]`` group.

    Some datasets wrap the comment body in square brackets. When the first
    bracketed group on a single line holds non-blank content, that content
    (stripped) is returned; otherwise the whole input is returned stripped.
    ``None`` yields an empty string, and non-string input is converted with
    ``str()`` first.
    """
    if text is None:
        return ""

    raw = str(text)
    found = re.search(r"\[(.*?)\]", raw)
    inner = found.group(1).strip() if found else ""
    # An empty or whitespace-only bracket pair is treated as "no brackets".
    return inner or raw.strip()
58
+
59
+
60
def normalize_text(text: str, stop_words: set, lemmatizer: WordNetLemmatizer) -> str:
    """Reduce a comment to a space-joined string of lemmatized tokens.

    Steps, in order: drop non-ASCII characters, lowercase, blank out URLs,
    blank out everything except ``a-z``, ``0-9`` and whitespace, collapse
    whitespace, tokenize with ``nltk.word_tokenize``, discard tokens that are
    in ``stop_words`` or a single character long, and lemmatize the rest.

    Returns an empty string for ``None`` or when nothing survives cleanup.
    """
    if text is None:
        return ""

    cleaned = text.encode("ascii", errors="ignore").decode("ascii").lower()

    # Each pass replaces its matches with a single space; the last pass
    # collapses the whitespace runs the earlier ones leave behind.
    for pattern, replacement in (
        (r"http\S+|www\.\S+", " "),
        (r"[^a-z0-9\s]", " "),
        (r"\s+", " "),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    cleaned = cleaned.strip()

    if not cleaned:
        return ""

    return " ".join(
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        if token not in stop_words and len(token) > 1
    )
89
+
90
+
91
def vader_label(sia: SentimentIntensityAnalyzer, text: str) -> str:
    """Map the VADER compound score of ``text`` to a sentiment label.

    Thresholds:
        compound >= 0.05   -> "Positive"
        compound <= -0.05  -> "Negative"
        anything else      -> "Neutral"

    A falsy ``text`` is scored as the empty string, and a missing
    ``"compound"`` entry is treated as 0.0.
    """
    compound = sia.polarity_scores(text or "").get("compound", 0.0)
    if compound >= 0.05:
        return "Positive"
    return "Negative" if compound <= -0.05 else "Neutral"
105
+
106
+
107
+ # -----------------------------
108
+ # Core analysis pipeline
109
+ # -----------------------------
110
def auto_detect_columns(df: pd.DataFrame):
    """Best-effort detection of the player and text columns of ``df``.

    Column labels are compared case-insensitively (after ``str()`` and
    whitespace stripping) against lists of common names; the first candidate
    that matches wins. Any role still unresolved falls back to the text-like
    (object or string dtype) columns, preferring a column that is not already
    used for the other role so both roles do not collapse onto one column.

    Args:
        df: The frame whose columns are inspected.

    Returns:
        ``(player_col, text_col)`` holding the original column labels, with
        ``None`` for a role that could not be resolved. Both entries can be
        the same label only when a single text-like column is available.
    """
    # str() keeps non-string labels (e.g. integer headers) from raising.
    normalized = [str(col).strip().lower() for col in df.columns]

    def first_named(candidates):
        for cand in candidates:
            if cand in normalized:
                return df.columns[normalized.index(cand)]
        return None

    player_col = first_named(("player", "player_name", "name", "prospect", "athlete"))
    text_col = first_named(("text", "body", "comment", "comment_body", "content", "message"))

    if player_col is None or text_col is None:
        # Iterate df.dtypes rather than df[col] so duplicate labels are safe,
        # and accept both object and dedicated string dtypes.
        text_like = [
            col
            for col, dtype in zip(df.columns, df.dtypes)
            if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        ]

        if player_col is None and text_like:
            others = [col for col in text_like if text_col is None or col != text_col]
            player_col = others[0] if others else text_like[0]

        if text_col is None and text_like:
            others = [col for col in text_like if col != player_col]
            # With only one text-like column, reuse it rather than give up.
            text_col = others[0] if others else text_like[0]

    return player_col, text_col
147
+
148
+
149
def _failed_analysis(message):
    """Build the 8-slot result for a run that produced no outputs."""
    return (None,) * 7 + (message,)


def run_analysis(file_obj, player_col_in, text_col_in, max_rows, make_wordcloud):
    """Run the batch sentiment pipeline on an uploaded CSV.

    Args:
        file_obj: The upload; either an object exposing the file path as
            ``.name`` or the path itself. ``None`` means nothing was uploaded.
        player_col_in: Optional player column name; used only when it names
            an existing column, otherwise auto-detection decides.
        text_col_in: Optional text column name, handled the same way.
        max_rows: When truthy and positive, only the first ``max_rows`` rows
            are analyzed.
        make_wordcloud: Whether to build the word cloud of positive comments.

    Returns:
        An 8-tuple
        ``(preview_df, processed_csv_path, player_csv_path, top25_csv_path,
        fig_distribution, fig_top25, fig_wordcloud, status_text)``.
        When the run cannot proceed, the first seven entries are ``None`` and
        ``status_text`` explains why. ``fig_wordcloud`` is ``None`` when the
        word cloud is disabled or there is no positive text.
    """
    # Validate the upload before any NLTK work so an empty click (or a bad
    # file) never triggers resource downloads.
    if file_obj is None:
        return _failed_analysis("Please upload a CSV file.")

    csv_path = getattr(file_obj, "name", file_obj)
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # A file with no parseable columns raises instead of returning an
        # empty frame; report it like any other empty upload.
        return _failed_analysis("Uploaded CSV is empty.")
    except (pd.errors.ParserError, UnicodeDecodeError, OSError) as exc:
        return _failed_analysis(f"Could not read the uploaded CSV: {exc}")

    if df.empty:
        return _failed_analysis("Uploaded CSV is empty.")

    # Choose columns (manual overrides win when they name a real column).
    auto_player, auto_text = auto_detect_columns(df)
    wanted_player = str(player_col_in or "").strip()
    wanted_text = str(text_col_in or "").strip()
    player_col = wanted_player if wanted_player and wanted_player in df.columns else auto_player
    text_col = wanted_text if wanted_text and wanted_text in df.columns else auto_text

    if player_col is None or text_col is None:
        return _failed_analysis(
            "Could not detect player/text columns. "
            "Please specify them in the column name boxes."
        )

    # Optionally limit rows for speed.
    if max_rows and max_rows > 0:
        df = df.head(int(max_rows))

    players = df[player_col].astype(str)
    # Missing comments become empty text rather than the literal string "nan".
    texts = df[text_col].map(lambda value: "" if pd.isna(value) else str(value)).astype(str)

    # Drop rows whose text is really an embedded header line.
    header_like = texts.str.contains(r"body,score,controversiality", case=False, na=False)
    processed = pd.DataFrame({"player": players[~header_like], "raw_text": texts[~header_like]})

    if processed.empty:
        return _failed_analysis("No comment rows left to analyze after cleanup.")

    # Preprocess
    ensure_nltk()
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    sia = SentimentIntensityAnalyzer()

    # Extract bracket body (if present), then normalize and label.
    processed["comment_body"] = processed["raw_text"].apply(extract_comment_body)
    processed["clean_text"] = processed["comment_body"].apply(
        lambda t: normalize_text(t, stop_words, lemmatizer)
    )
    processed["sentiment"] = processed["clean_text"].apply(lambda t: vader_label(sia, t))

    # Player-level aggregation: one row per player, one column per label.
    counts = (
        processed.groupby("player")["sentiment"]
        .value_counts()
        .unstack(fill_value=0)
        .rename_axis(None, axis=1)
    )

    # Labels that never occurred still get a (zero) column.
    for label in ["Positive", "Neutral", "Negative"]:
        if label not in counts.columns:
            counts[label] = 0

    counts["total"] = counts[["Positive", "Neutral", "Negative"]].sum(axis=1)
    counts["percent_positive"] = np.where(
        counts["total"] > 0, (counts["Positive"] / counts["total"]) * 100, 0.0
    )

    # Overall sentiment score: (pos - neg) / total, range [-1, 1].
    counts["overall_sentiment_score"] = np.where(
        counts["total"] > 0,
        (counts["Positive"] - counts["Negative"]) / counts["total"],
        0.0,
    )

    # Top 25 by score, ties broken by comment volume.
    top25 = (
        counts.sort_values(["overall_sentiment_score", "total"], ascending=[False, False])
        .head(25)
        .copy()
    )

    # Save outputs to temp files for download. The directory is left in
    # place because the returned paths must stay readable after this call.
    tmpdir = Path(tempfile.mkdtemp(prefix="nfl_sentiment_"))
    processed_path = tmpdir / "NFL_reddit_sentiment_analysis.csv"
    players_path = tmpdir / "player_sentiment_results.csv"
    top25_path = tmpdir / "top_25_players.csv"

    processed.to_csv(processed_path, index=False)
    counts.reset_index().to_csv(players_path, index=False)
    top25.reset_index().to_csv(top25_path, index=False)

    # ---- Plots ----
    # 1) Sentiment distribution
    fig1 = plt.figure()
    ax1 = fig1.add_subplot(111)
    sns.countplot(data=processed, x="sentiment", ax=ax1)
    ax1.set_title("Overall Sentiment Distribution")
    ax1.set_xlabel("Sentiment")
    ax1.set_ylabel("Count")
    fig1.tight_layout()

    # 2) Top 25 bar plot
    fig2 = plt.figure(figsize=(10, 6))
    ax2 = fig2.add_subplot(111)
    sns.barplot(data=top25.reset_index(), x="overall_sentiment_score", y="player", ax=ax2)
    ax2.set_title("Top 25 Players by Overall Sentiment Score")
    ax2.set_xlabel("Overall Sentiment Score")
    ax2.set_ylabel("Player")
    fig2.tight_layout()

    # 3) Word cloud (positive only)
    fig3 = None
    if make_wordcloud:
        positive_text = " ".join(
            processed.loc[processed["sentiment"] == "Positive", "clean_text"]
            .dropna()
            .astype(str)
            .tolist()
        )
        if positive_text.strip():
            wc = WordCloud(width=1200, height=600, background_color="white").generate(positive_text)
            fig3 = plt.figure(figsize=(10, 5))
            ax3 = fig3.add_subplot(111)
            ax3.imshow(wc, interpolation="bilinear")
            ax3.axis("off")
            ax3.set_title("Word Cloud (Positive Comments)")
            fig3.tight_layout()

    # Unregister the figures from pyplot so repeated runs do not accumulate
    # open figures; the Figure objects themselves are still returned.
    for fig in (fig1, fig2, fig3):
        if fig is not None:
            plt.close(fig)

    # Preview table
    preview = processed.head(25)

    status = (
        f"Loaded {len(processed):,} rows. "
        f"Using player column: '{player_col}', text column: '{text_col}'. "
        f"Outputs saved for download."
    )

    return preview, str(processed_path), str(players_path), str(top25_path), fig1, fig2, fig3, status
287
+
288
+
289
def sentiment_single_text(player_name, comment_text):
    """Score one comment with the same preprocessing and VADER labeling.

    Returns a dict with the player name (empty string when falsy), the
    extracted comment body, the normalized text, the sentiment label, and
    the full VADER score dict computed on the normalized text.
    """
    ensure_nltk()
    analyzer = SentimentIntensityAnalyzer()
    english_stops = set(stopwords.words("english"))
    wordnet = WordNetLemmatizer()

    body = extract_comment_body(comment_text or "")
    cleaned = normalize_text(body, english_stops, wordnet)

    return {
        "player": player_name or "",
        "comment_body": body,
        "clean_text": cleaned,
        "sentiment": vader_label(analyzer, cleaned),
        "vader_scores": analyzer.polarity_scores(cleaned),
    }
308
+
309
+
310
+ # -----------------------------
311
+ # Gradio UI
312
+ # -----------------------------
313
with gr.Blocks(title="NFL Reddit Sentiment (NLP Lab App)") as demo:
    gr.Markdown("# NFL Reddit Sentiment Analysis (NLP Lab)")

    # --- Tab 1: upload a CSV and run the full pipeline ---
    with gr.Tab("Batch Analysis (Upload CSV)"):
        with gr.Row():
            file_in = gr.File(label="Upload NFL Reddit CSV", file_types=[".csv"])

        # Optional column overrides; blank values fall back to auto-detection.
        with gr.Row():
            player_col_in = gr.Textbox(
                label="Player column name (optional)",
                placeholder="e.g., player",
            )
            text_col_in = gr.Textbox(
                label="Text/comment column name (optional)",
                placeholder="e.g., text",
            )

        with gr.Row():
            max_rows = gr.Number(label="Max rows (0 = all)", value=0, precision=0)
            make_wordcloud = gr.Checkbox(
                label="Generate word cloud (positive comments)",
                value=True,
            )

        run_btn = gr.Button("Run Sentiment Analysis")

        status = gr.Textbox(label="Status", interactive=False)

        preview_df = gr.Dataframe(label="Preview (first 25 processed rows)", interactive=False)

        with gr.Row():
            processed_out = gr.File(label="Download: Comment-level sentiment CSV")
            players_out = gr.File(label="Download: Player-level sentiment results CSV")
            top25_out = gr.File(label="Download: Top 25 players CSV")

        dist_plot = gr.Plot(label="Plot: Sentiment Distribution")
        top25_plot = gr.Plot(label="Plot: Top 25 Players")
        wc_plot = gr.Plot(label="Plot: Word Cloud (Positive)")

        # Output order must match the tuple returned by run_analysis.
        batch_inputs = [file_in, player_col_in, text_col_in, max_rows, make_wordcloud]
        batch_outputs = [
            preview_df,
            processed_out,
            players_out,
            top25_out,
            dist_plot,
            top25_plot,
            wc_plot,
            status,
        ]
        run_btn.click(fn=run_analysis, inputs=batch_inputs, outputs=batch_outputs)

    # --- Tab 2: score a single pasted comment ---
    with gr.Tab("Single Comment Sentiment"):
        gr.Markdown("Test sentiment on one comment using the same preprocessing + VADER logic.")
        player_name = gr.Textbox(label="Player name (optional)")
        comment_text = gr.Textbox(
            label="Comment text",
            lines=6,
            placeholder="Paste a Reddit comment here...",
        )
        single_btn = gr.Button("Analyze Sentiment")
        single_out = gr.JSON(label="Result")

        single_btn.click(
            fn=sentiment_single_text,
            inputs=[player_name, comment_text],
            outputs=[single_out],
        )
359
+
360
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from the PORT environment
    # variable when a hosting platform provides one, else 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
    )