Spaces:

hchevva
/

NLP_Project

Running

App Files Community

hchevva committed 14 days ago

Commit

ee0bba8

verified ·

1 Parent(s): 7fd82d1

Create app.py

Browse files

Files changed (1) hide show

app.py +363 -0

app.py ADDED Viewed

	@@ -0,0 +1,363 @@

+import os
+import re
+import tempfile
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import gradio as gr
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import WordNetLemmatizer
+from nltk.sentiment import SentimentIntensityAnalyzer
+import matplotlib.pyplot as plt
+import seaborn as sns
+from wordcloud import WordCloud
+# -----------------------------
+# NLTK setup (downloads once)
+# -----------------------------
+# Process-wide latch: set once ensure_nltk() has issued its download calls.
+_NLTK_READY = False
+def ensure_nltk():
+    """
+    Request the NLTK resources used by this module, at most once per process.
+    The first call asks NLTK to download each resource quietly and then sets
+    the module-level _NLTK_READY flag; later calls return immediately.
+    """
+    global _NLTK_READY
+    if not _NLTK_READY:
+        # "punkt_tab" is requested next to "punkt" because some environments need it.
+        for resource in ("stopwords", "punkt", "punkt_tab", "wordnet", "vader_lexicon"):
+            nltk.download(resource, quiet=True)
+        _NLTK_READY = True
+# -----------------------------
+# Text preprocessing (close to notebook intent)
+# -----------------------------
+def extract_comment_body(text: str) -> str:
+    """
+    Pull the comment body out of a raw cell value.
+    Some notebook-style datasets wrap the body in square brackets, e.g.
+    "[...comment...]". When the first "[...]" pair on a line holds non-blank
+    content, that content is returned stripped; otherwise the whole value is
+    returned stripped. None yields "", and non-string values are converted
+    with str() first.
+    """
+    if text is None:
+        return ""
+    raw = str(text)
+    # Non-greedy match: only the first bracket pair is considered.
+    bracketed = re.search(r"\[(.*?)\]", raw)
+    inner = bracketed.group(1).strip() if bracketed else ""
+    return inner if inner else raw.strip()
+def normalize_text(text: str, stop_words: set, lemmatizer: WordNetLemmatizer) -> str:
+    """
+    Reduce a comment to a space-joined string of lemmatized content tokens.
+    Steps, in order: drop non-ASCII characters, lowercase, blank out URLs,
+    blank out everything except a-z / 0-9 / whitespace, collapse whitespace,
+    tokenize with nltk.word_tokenize, discard stop words and one-character
+    tokens, lemmatize what remains.
+    Returns "" when text is None or nothing survives the cleanup.
+    """
+    if text is None:
+        return ""
+    cleaned = text.encode("ascii", errors="ignore").decode("ascii").lower()
+    # Substitutions run in this order: URLs, non-alphanumerics, whitespace runs.
+    for pattern in (r"http\S+|www\.\S+", r"[^a-z0-9\s]", r"\s+"):
+        cleaned = re.sub(pattern, " ", cleaned)
+    cleaned = cleaned.strip()
+    if not cleaned:
+        return ""
+    kept = [
+        lemmatizer.lemmatize(tok)
+        for tok in nltk.word_tokenize(cleaned)
+        if tok not in stop_words and len(tok) > 1
+    ]
+    return " ".join(kept)
+def vader_label(sia: SentimentIntensityAnalyzer, text: str) -> str:
+    """
+    Map the analyzer's compound score for text to a three-way label.
+    Thresholds (the standard VADER cut-offs):
+      compound >= 0.05  -> "Positive"
+      compound <= -0.05 -> "Negative"
+      anything else     -> "Neutral"
+    A falsy text is scored as the empty string; a missing "compound" key is
+    treated as 0.0.
+    """
+    compound = sia.polarity_scores(text or "").get("compound", 0.0)
+    if compound >= 0.05:
+        label = "Positive"
+    elif compound <= -0.05:
+        label = "Negative"
+    else:
+        label = "Neutral"
+    return label
+# -----------------------------
+# Core analysis pipeline
+# -----------------------------
+def auto_detect_columns(df: pd.DataFrame):
+    """
+    Best-effort detection of the player column and the comment-text column.
+    Detection runs in two passes:
+      1. Case-insensitive match of the column labels against lists of common
+         names; the first candidate in each list that matches wins.
+      2. For a role still unresolved, fall back to the first text-like
+         (object or string dtype) column that is not already assigned to the
+         other role. Only when no other text-like column exists is the same
+         column reused for both roles.
+    Returns:
+      (player_col, text_col) using the original column labels; either entry
+      is None when the frame has no text-like column to fall back on.
+    """
+    # str() guards against non-string labels (e.g. integer headers).
+    lowered = [str(c).lower() for c in df.columns]
+    def _first_named(candidates):
+        # Return the original label of the first candidate name present, else None.
+        for cand in candidates:
+            if cand in lowered:
+                return df.columns[lowered.index(cand)]
+        return None
+    player_col = _first_named(("player", "player_name", "name", "prospect", "athlete"))
+    text_col = _first_named(("text", "body", "comment", "comment_body", "content", "message"))
+    # Inspect dtypes positionally so duplicate labels cannot turn df[c] into a frame.
+    # Both checks are needed: newer pandas may store text in a dedicated string dtype.
+    textual = [
+        c for c, dt in zip(df.columns, df.dtypes)
+        if pd.api.types.is_object_dtype(dt) or pd.api.types.is_string_dtype(dt)
+    ]
+    if player_col is None:
+        # Prefer a column distinct from the text column; reuse only as a last resort.
+        pool = [c for c in textual if c != text_col] or textual
+        player_col = pool[0] if pool else None
+    if text_col is None:
+        pool = [c for c in textual if c != player_col] or textual
+        text_col = pool[0] if pool else None
+    return player_col, text_col
+def _analysis_failure(message):
+    """Build the 8-slot result for an aborted run: seven empty outputs plus the status message."""
+    return None, None, None, None, None, None, None, message
+def run_analysis(file_obj, player_col_in, text_col_in, max_rows, make_wordcloud):
+    """
+    Run the batch sentiment pipeline on an uploaded CSV.
+    Args:
+      file_obj: the upload from the file input; either a filesystem path or
+        an object exposing the path as .name. None means nothing was uploaded.
+      player_col_in / text_col_in: optional column names typed by the user;
+        used only when they match a column, otherwise auto-detection decides.
+      max_rows: when positive, only the first max_rows rows are analysed.
+      make_wordcloud: when truthy, also render a word cloud of positive comments.
+    Returns:
+      preview_df, processed_csv_file, player_csv_file, top25_csv_file,
+      fig_distribution, fig_top25, fig_wordcloud, status_text
+      On any failure the first seven entries are None and status_text
+      explains the problem.
+    """
+    # Figures are built directly instead of through pyplot so they are not
+    # retained in pyplot's global figure registry after each request.
+    from matplotlib.figure import Figure
+    ensure_nltk()
+    if file_obj is None:
+        return _analysis_failure("Please upload a CSV file.")
+    # The upload may arrive as a plain path or as a wrapper exposing the path as .name.
+    csv_path = file_obj if isinstance(file_obj, (str, os.PathLike)) else file_obj.name
+    try:
+        df = pd.read_csv(csv_path)
+    except pd.errors.EmptyDataError:
+        return _analysis_failure("Uploaded CSV is empty.")
+    except (pd.errors.ParserError, UnicodeDecodeError, OSError) as exc:
+        return _analysis_failure(f"Could not read the uploaded CSV: {exc}")
+    if df.empty:
+        return _analysis_failure("Uploaded CSV is empty.")
+    # Choose columns (manual overrides win when they name a real column).
+    auto_player, auto_text = auto_detect_columns(df)
+    wanted_player = (player_col_in or "").strip()
+    wanted_text = (text_col_in or "").strip()
+    player_col = wanted_player if wanted_player and wanted_player in df.columns else auto_player
+    text_col = wanted_text if wanted_text and wanted_text in df.columns else auto_text
+    if player_col is None or text_col is None:
+        return _analysis_failure(
+            "Could not detect player/text columns. "
+            "Please specify them in the column-name text boxes."
+        )
+    # Optionally limit rows for speed.
+    if max_rows and max_rows > 0:
+        df = df.head(int(max_rows))
+    # Rows without text are dropped first; casting them to str would turn
+    # them into the literal "nan" and count them as Neutral comments.
+    df = df[df[text_col].notna()]
+    raw_text = df[text_col].astype(str)
+    # Filter out header-like rows embedded in the data.
+    keep = ~raw_text.str.contains(r"body,score,controversiality", case=False, na=False)
+    # Assemble a fresh frame so the output column names can never overwrite
+    # an input column that is still needed (e.g. a text column named "player").
+    processed = pd.DataFrame({
+        "player": df.loc[keep, player_col].astype(str),
+        "raw_text": raw_text[keep],
+    })
+    if processed.empty:
+        return _analysis_failure("No usable comment rows were found in the selected text column.")
+    # Preprocess: extract bracketed body (if present), normalize, then score.
+    stop_words = set(stopwords.words("english"))
+    lemmatizer = WordNetLemmatizer()
+    sia = SentimentIntensityAnalyzer()
+    processed["comment_body"] = processed["raw_text"].apply(extract_comment_body)
+    processed["clean_text"] = processed["comment_body"].apply(lambda t: normalize_text(t, stop_words, lemmatizer))
+    processed["sentiment"] = processed["clean_text"].apply(lambda t: vader_label(sia, t))
+    # Player-level aggregation: one row per player, one column per label.
+    counts = (
+        processed.groupby("player")["sentiment"]
+        .value_counts()
+        .unstack(fill_value=0)
+        .rename_axis(None, axis=1)
+    )
+    # Ensure all three label columns exist even if a label never occurred.
+    for label in ("Positive", "Neutral", "Negative"):
+        if label not in counts.columns:
+            counts[label] = 0
+    counts["total"] = counts[["Positive", "Neutral", "Negative"]].sum(axis=1)
+    counts["percent_positive"] = np.where(counts["total"] > 0, (counts["Positive"] / counts["total"]) * 100, 0.0)
+    # Overall sentiment score: (pos - neg) / total  (range [-1, 1])
+    counts["overall_sentiment_score"] = np.where(
+        counts["total"] > 0,
+        (counts["Positive"] - counts["Negative"]) / counts["total"],
+        0.0,
+    )
+    # Top 25 by score, ties broken by comment volume.
+    top25 = counts.sort_values(["overall_sentiment_score", "total"], ascending=[False, False]).head(25).copy()
+    # Save outputs to temp files for download. The directory is left in
+    # place on purpose: the returned paths must outlive this call.
+    tmpdir = Path(tempfile.mkdtemp(prefix="nfl_sentiment_"))
+    processed_path = tmpdir / "NFL_reddit_sentiment_analysis.csv"
+    players_path = tmpdir / "player_sentiment_results.csv"
+    top25_path = tmpdir / "top_25_players.csv"
+    processed.to_csv(processed_path, index=False)
+    counts.reset_index().to_csv(players_path, index=False)
+    top25.reset_index().to_csv(top25_path, index=False)
+    # ---- Plots ----
+    # 1) Sentiment distribution
+    fig1 = Figure()
+    ax1 = fig1.add_subplot(111)
+    sns.countplot(data=processed, x="sentiment", ax=ax1)
+    ax1.set_title("Overall Sentiment Distribution")
+    ax1.set_xlabel("Sentiment")
+    ax1.set_ylabel("Count")
+    fig1.tight_layout()
+    # 2) Top 25 bar plot
+    fig2 = Figure(figsize=(10, 6))
+    ax2 = fig2.add_subplot(111)
+    top25_rows = top25.reset_index()
+    sns.barplot(data=top25_rows, x="overall_sentiment_score", y="player", ax=ax2)
+    ax2.set_title("Top 25 Players by Overall Sentiment Score")
+    ax2.set_xlabel("Overall Sentiment Score")
+    ax2.set_ylabel("Player")
+    fig2.tight_layout()
+    # 3) Word cloud (positive only); stays None when disabled or when there is no positive text.
+    fig3 = None
+    if make_wordcloud:
+        positive_text = " ".join(processed.loc[processed["sentiment"] == "Positive", "clean_text"].dropna().astype(str).tolist())
+        if positive_text.strip():
+            wc = WordCloud(width=1200, height=600, background_color="white").generate(positive_text)
+            fig3 = Figure(figsize=(10, 5))
+            ax3 = fig3.add_subplot(111)
+            ax3.imshow(wc, interpolation="bilinear")
+            ax3.axis("off")
+            ax3.set_title("Word Cloud (Positive Comments)")
+            fig3.tight_layout()
+    # Preview table
+    preview = processed.head(25)
+    status = (
+        f"Loaded {len(processed):,} rows. "
+        f"Using player column: '{player_col}', text column: '{text_col}'. "
+        f"Outputs saved for download."
+    )
+    return preview, str(processed_path), str(players_path), str(top25_path), fig1, fig2, fig3, status
+def sentiment_single_text(player_name, comment_text):
+    """
+    Score one comment with the same preprocessing + VADER steps as the batch path.
+    Returns a dict with the player name ("" when not given), the extracted
+    comment body, the normalized text, the sentiment label, and the raw
+    polarity scores computed on the normalized text.
+    """
+    ensure_nltk()
+    analyzer = SentimentIntensityAnalyzer()
+    english_stops = set(stopwords.words("english"))
+    wordnet = WordNetLemmatizer()
+    body = extract_comment_body(comment_text or "")
+    clean = normalize_text(body, english_stops, wordnet)
+    return {
+        "player": player_name or "",
+        "comment_body": body,
+        "clean_text": clean,
+        "sentiment": vader_label(analyzer, clean),
+        "vader_scores": analyzer.polarity_scores(clean),
+    }
+# -----------------------------
+# Gradio UI
+# -----------------------------
+# Two-tab interface: a batch tab wired to run_analysis and a single-comment
+# tab wired to sentiment_single_text. Components are created at import time.
+with gr.Blocks(title="NFL Reddit Sentiment (NLP Lab App)") as demo:
+    gr.Markdown("# NFL Reddit Sentiment Analysis (NLP Lab)")
+    with gr.Tab("Batch Analysis (Upload CSV)"):
+        # --- Inputs ---
+        with gr.Row():
+            file_in = gr.File(label="Upload NFL Reddit CSV", file_types=[".csv"])
+        with gr.Row():
+            # Free-text column overrides; run_analysis falls back to auto-detection
+            # when a value is blank or does not match a column.
+            player_col_in = gr.Textbox(label="Player column name (optional)", placeholder="e.g., player")
+            text_col_in = gr.Textbox(label="Text/comment column name (optional)", placeholder="e.g., text")
+        with gr.Row():
+            max_rows = gr.Number(label="Max rows (0 = all)", value=0, precision=0)
+            make_wordcloud = gr.Checkbox(label="Generate word cloud (positive comments)", value=True)
+        run_btn = gr.Button("Run Sentiment Analysis")
+        # --- Outputs ---
+        status = gr.Textbox(label="Status", interactive=False)
+        preview_df = gr.Dataframe(label="Preview (first 25 processed rows)", interactive=False)
+        with gr.Row():
+            processed_out = gr.File(label="Download: Comment-level sentiment CSV")
+            players_out = gr.File(label="Download: Player-level sentiment results CSV")
+            top25_out = gr.File(label="Download: Top 25 players CSV")
+        dist_plot = gr.Plot(label="Plot: Sentiment Distribution")
+        top25_plot = gr.Plot(label="Plot: Top 25 Players")
+        wc_plot = gr.Plot(label="Plot: Word Cloud (Positive)")
+        # The outputs list must stay in the same order as run_analysis's return tuple.
+        run_btn.click(
+            fn=run_analysis,
+            inputs=[file_in, player_col_in, text_col_in, max_rows, make_wordcloud],
+            outputs=[preview_df, processed_out, players_out, top25_out, dist_plot, top25_plot, wc_plot, status]
+        )
+    with gr.Tab("Single Comment Sentiment"):
+        gr.Markdown("Test sentiment on one comment using the same preprocessing + VADER logic.")
+        player_name = gr.Textbox(label="Player name (optional)")
+        comment_text = gr.Textbox(label="Comment text", lines=6, placeholder="Paste a Reddit comment here...")
+        single_btn = gr.Button("Analyze Sentiment")
+        # Shows the dict returned by sentiment_single_text.
+        single_out = gr.JSON(label="Result")
+        single_btn.click(
+            fn=sentiment_single_text,
+            inputs=[player_name, comment_text],
+            outputs=[single_out]
+        )
+if __name__ == "__main__":
+    # Direct-run entry point: honour a PORT supplied by the host, else use 7860,
+    # and listen on all interfaces.
+    listen_port = int(os.getenv("PORT", "7860"))
+    demo.launch(server_name="0.0.0.0", server_port=listen_port)