Update app.py

app.py (CHANGED)
--- app.py (before)

@@ -1,21 +1,29 @@
 import os
 import re
 import tempfile
 from pathlib import Path

 import numpy as np
 import pandas as pd
-import gradio as gr

 import nltk
-from nltk.corpus import stopwords
-from nltk.stem import WordNetLemmatizer
 from nltk.sentiment import SentimentIntensityAnalyzer

 import matplotlib.pyplot as plt
 import seaborn as sns
 from wordcloud import WordCloud


 # -----------------------------
 # NLTK setup (downloads once)
@@ -26,338 +34,478 @@ def ensure_nltk():
     global _NLTK_READY
     if _NLTK_READY:
         return
-    # Download required resources (safe to call multiple times)
-    nltk.download("stopwords", quiet=True)
     nltk.download("punkt", quiet=True)
-    nltk.download("punkt_tab", quiet=True)  # some
-    nltk.download("wordnet", quiet=True)
     nltk.download("vader_lexicon", quiet=True)
     _NLTK_READY = True


 # -----------------------------
-#
 # -----------------------------
-def
     """
     """
-    return

     """
-    tokenize, remove stopwords, lemmatize, re-join.
     """
-    text = re.sub(r"\s+", " ", text).strip()
     if not text:
         return ""
-    return " ".join(tokens)

-def
     """
-    compound >= 0.05 => Positive
-    compound <= -0.05 => Negative
-    else Neutral
     """

-# -----------------------------
-def auto_detect_columns(df: pd.DataFrame):
     """
-    Uses common names from lab-style datasets.
     """
-            player_col = df.columns[cols.index(cand)]
-            break
             break

-    if player_col is None:
-        obj_cols = [c for c in df.columns if df[c].dtype == "object"]
-        if obj_cols:
-            player_col = obj_cols[0]
-    if text_col is None:
-        obj_cols = [c for c in df.columns if df[c].dtype == "object"]
-        if len(obj_cols) >= 2:
-            text_col = obj_cols[1]
-        elif obj_cols:
-            text_col = obj_cols[0]
-    return player_col, text_col


-def run_analysis(file_obj, player_col_in, text_col_in, max_rows, make_wordcloud):
     """
-    preview_df, processed_csv_file, player_csv_file, top25_csv_file,
-    fig_distribution, fig_top25, fig_wordcloud, status_text
     """

-    if df.empty:
-        return None, None, None, None, None, None, None, "Uploaded CSV is empty."

-    if
-        return None, None,
     )

-    processed_cols = ["player", "raw_text", "comment_body", "clean_text", "sentiment"]
-    processed = df[processed_cols].copy()

-    # Player-level aggregation
-    counts = (
-        processed.groupby("player")["sentiment"]
-        .value_counts()
-        .unstack(fill_value=0)
-        .rename_axis(None, axis=1)
-    )

-        counts[c] = 0

-    top25 = counts.sort_values(["overall_sentiment_score", "total"], ascending=[False, False]).head(25).copy()

-    # Save outputs to temp files for download
-    tmpdir = Path(tempfile.mkdtemp(prefix="nfl_sentiment_"))
-    processed_path = tmpdir / "NFL_reddit_sentiment_analysis.csv"
-    players_path = tmpdir / "player_sentiment_results.csv"
-    top25_path = tmpdir / "top_25_players.csv"
-    processed.to_csv(processed_path, index=False)
-    counts.reset_index().to_csv(players_path, index=False)
-    top25.reset_index().to_csv(top25_path, index=False)

-    # ---- Plots ----
-    # 1) Sentiment distribution
-    fig1 = plt.figure()
-    ax1 = fig1.add_subplot(111)
-    sns.countplot(data=processed, x="sentiment", ax=ax1)
-    ax1.set_title("Overall Sentiment Distribution")
-    ax1.set_xlabel("Sentiment")
-    ax1.set_ylabel("Count")
-    fig1.tight_layout()

-    # 2) Top 25 bar plot
-    fig2 = plt.figure(figsize=(10, 6))
-    ax2 = fig2.add_subplot(111)
-    top25_plot = top25.reset_index()
-    sns.barplot(data=top25_plot, x="overall_sentiment_score", y="player", ax=ax2)
-    ax2.set_title("Top 25 Players by Overall Sentiment Score")
-    ax2.set_xlabel("Overall Sentiment Score")
-    ax2.set_ylabel("Player")
-    fig2.tight_layout()

-    # 3) Word cloud (positive only)
-    fig3 = None
     if make_wordcloud:
-        if positive_text.strip():
-            wc = WordCloud(width=1200, height=600, background_color="white").generate(positive_text)
-            fig3 = plt.figure(figsize=(10, 5))
-            ax3 = fig3.add_subplot(111)
-            ax3.imshow(wc, interpolation="bilinear")
-            ax3.axis("off")
-            ax3.set_title("Word Cloud (Positive Comments)")
-            fig3.tight_layout()

-    # Preview table
-    preview = processed.head(25)

-    status = (
-        f"Loaded {len(df):,} rows. "
-        f"Using player column: '{player_col}', text column: '{text_col}'. "
-        f"Outputs saved for download."
-    )

-    return


-def sentiment_single_text(player_name, comment_text):
-    ensure_nltk()
-    sia = SentimentIntensityAnalyzer()
-    stop_words = set(stopwords.words("english"))
-    lemmatizer = WordNetLemmatizer()

-    body = extract_comment_body(comment_text or "")
-    clean = normalize_text(body, stop_words, lemmatizer)
-    label = vader_label(sia, clean)
-    scores = sia.polarity_scores(clean)

-    out = {
-        "player": player_name or "",
-        "comment_body": body,
-        "clean_text": clean,
-        "sentiment": label,
-        "vader_scores": scores
-    }
-    return out


 # -----------------------------
 # Gradio UI
 # -----------------------------
-with gr.Blocks(title="
-    gr.Markdown("#

-    with gr.Tab("Batch Analysis (Upload CSV)"):
-        with gr.Row():
-            file_in = gr.File(label="Upload NFL Reddit CSV", file_types=[".csv"])
         with gr.Row():
         with gr.Row():

-        run_btn = gr.Button("

         status = gr.Textbox(label="Status", interactive=False)

         with gr.Row():
-            players_out = gr.File(label="Download: Player-level sentiment results CSV")
-            top25_out = gr.File(label="Download: Top 25 players CSV")

-        wc_plot = gr.Plot(label="

         run_btn.click(
-            fn=
-            inputs=[
-            outputs=[
         )

-        single_out = gr.JSON(label="Result")
-        single_btn.click(
-            fn=sentiment_single_text,
-            inputs=[player_name, comment_text],
-            outputs=[single_out]
         )

 if __name__ == "__main__":
-    # For local runs; on hosting platforms, PORT may be provided
     port = int(os.environ.get("PORT", "7860"))
     demo.launch(server_name="0.0.0.0", server_port=port)
+++ app.py (after)

@@ -1,21 +1,29 @@
 import os
 import re
+import math
 import tempfile
 from pathlib import Path
+from typing import Dict, List, Tuple

+import gradio as gr
 import numpy as np
 import pandas as pd

 import nltk
 from nltk.sentiment import SentimentIntensityAnalyzer

+from pypdf import PdfReader
+
+from sklearn.feature_extraction.text import TfidfVectorizer
+
 import matplotlib.pyplot as plt
 import seaborn as sns
 from wordcloud import WordCloud

+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.summarizers.text_rank import TextRankSummarizer
+

 # -----------------------------
 # NLTK setup (downloads once)

@@ -26,338 +34,478 @@ def ensure_nltk():
     global _NLTK_READY
     if _NLTK_READY:
         return
     nltk.download("punkt", quiet=True)
+    nltk.download("punkt_tab", quiet=True)  # some envs need this
     nltk.download("vader_lexicon", quiet=True)
     _NLTK_READY = True


 # -----------------------------
+# PDF extraction
 # -----------------------------
+def extract_text_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[str, int]:
     """
+    Returns (text, page_count). max_pages=0 means all pages.
+    Note: scanned-image PDFs may yield little/no text.
     """
+    reader = PdfReader(pdf_path)
+    page_count = len(reader.pages)
+    pages_to_read = page_count if (max_pages is None or max_pages <= 0) else min(page_count, max_pages)

+    parts = []
+    for i in range(pages_to_read):
+        try:
+            t = reader.pages[i].extract_text() or ""
+        except Exception:
+            t = ""
+        if t.strip():
+            parts.append(t)

+    return "\n".join(parts).strip(), page_count


+# -----------------------------
+# Utilities
+# -----------------------------
+def clean_whitespace(text: str) -> str:
+    text = text or ""
+    text = text.replace("\x00", " ")
+    text = re.sub(r"\s+", " ", text).strip()
+    return text
+
+def split_into_chunks(text: str, chunk_chars: int = 3000) -> List[str]:
     """
+    Chunk by sentences into ~chunk_chars blocks.
     """
+    text = text or ""
+    if not text.strip():
+        return []
+
+    sentences = nltk.sent_tokenize(text)
+    chunks = []
+    cur = []
+
+    cur_len = 0
+    for s in sentences:
+        s = s.strip()
+        if not s:
+            continue
+        if cur_len + len(s) + 1 > chunk_chars and cur:
+            chunks.append(" ".join(cur))
+            cur = [s]
+            cur_len = len(s)
+        else:
+            cur.append(s)
+            cur_len += len(s) + 1
+
+    if cur:
+        chunks.append(" ".join(cur))
+
+    return chunks
+
+def vader_doc_sentiment(text: str, chunk_chars: int = 3000) -> Tuple[float, str, List[float]]:
+    """
+    Returns: (avg_compound_score, label, chunk_scores)
+    """
+    ensure_nltk()
+    sia = SentimentIntensityAnalyzer()

+    chunks = split_into_chunks(text, chunk_chars=chunk_chars)
+    if not chunks:
+        return 0.0, "Neutral", []

+    scores = [sia.polarity_scores(c).get("compound", 0.0) for c in chunks]
+    avg = float(np.mean(scores))

+    if avg >= 0.05:
+        label = "Positive"
+    elif avg <= -0.05:
+        label = "Negative"
+    else:
+        label = "Neutral"

+    return avg, label, scores

+def extract_keywords_tfidf(text: str, top_k: int = 20) -> List[Tuple[str, float]]:
+    """
+    TF-IDF keywords for a single document.
+    Uses unigrams + bigrams; returns list of (term, score).
+    """
+    text = text or ""
+    if not text.strip():
+        return []
+
+    vectorizer = TfidfVectorizer(
+        stop_words="english",
+        ngram_range=(1, 2),
+        max_features=5000
+    )
+    X = vectorizer.fit_transform([text])
+    feats = np.array(vectorizer.get_feature_names_out())
+    scores = X.toarray().ravel()
+
+    if scores.size == 0:
+        return []
+
+    idx = np.argsort(scores)[::-1]
+    idx = idx[: max(1, int(top_k))]
+    return [(feats[i], float(scores[i])) for i in idx if scores[i] > 0]
+
+def make_wordcloud_figure(text: str):
+    text = text or ""
+    if not text.strip():
+        return None
+    wc = WordCloud(width=1200, height=600, background_color="white").generate(text)
+    fig = plt.figure(figsize=(10, 5))
+    ax = fig.add_subplot(111)
+    ax.imshow(wc, interpolation="bilinear")
+    ax.axis("off")
+    fig.tight_layout()
+    return fig
+
+def textrank_summary(text: str, num_sentences: int = 6) -> str:
+    text = (text or "").strip()
     if not text:
         return ""
+    num_sentences = max(1, int(num_sentences))

+    parser = PlaintextParser.from_string(text, Tokenizer("english"))
+    summarizer = TextRankSummarizer()
+    sents = summarizer(parser.document, num_sentences)
+    return " ".join(str(s) for s in sents)

+def detect_title(text: str) -> str:
     """
+    Heuristic: pick the first 'strong' line from the first ~30 lines.
     """
+    raw = text or ""
+    lines = [l.strip() for l in raw.splitlines() if l.strip()]
+    lines = lines[:30]
+    for l in lines:
+        if 8 <= len(l) <= 200 and not l.lower().startswith(("abstract", "introduction")):
+            # avoid obvious author lines
+            if not re.search(r"\b(university|department|email|corresponding)\b", l.lower()):
+                return l
+    return lines[0] if lines else ""
+
+def extract_abstract(text: str) -> str:
     """
+    Try: ABSTRACT ... INTRODUCTION
     """
+    t = text or ""
+    m = re.search(r"\babstract\b(.*?)(\bintroduction\b|\b1\.\s*introduction\b)", t, flags=re.IGNORECASE | re.DOTALL)
+    if not m:
+        return ""
+    abs_text = clean_whitespace(m.group(1))
+    # keep reasonable length
+    return abs_text[:2000]

+def extract_section_headings(text: str, max_headings: int = 20) -> List[str]:
+    """
+    Simple heading heuristic:
+    - Lines that look like: "1. Introduction", "2 Methods", "RESULTS", etc.
+    """
+    lines = [l.strip() for l in (text or "").splitlines()]
+    headings = []
+    for l in lines:
+        if not l or len(l) > 120:
+            continue
+        if re.match(r"^\d+(\.\d+)*\s+[A-Z].{2,}$", l):
+            headings.append(l)
+        elif l.isupper() and 4 <= len(l) <= 60:
+            headings.append(l)
+        if len(headings) >= max_headings:
             break
+    # dedupe while preserving order
+    seen = set()
+    out = []
+    for h in headings:
+        key = h.lower()
+        if key not in seen:
+            seen.add(key)
+            out.append(h)
+    return out

+def detect_cas_numbers(text: str) -> List[str]:
     """
+    CAS format: 2-7 digits - 2 digits - 1 digit
     """
+    cas = re.findall(r"\b\d{2,7}-\d{2}-\d\b", text or "")
+    # unique preserve order
+    seen = set()
+    out = []
+    for c in cas:
+        if c not in seen:
+            seen.add(c)
+            out.append(c)
+    return out

+TOX_TERMS = [
+    "hazard", "risk", "exposure", "dose", "response", "toxicity",
+    "adverse", "noael", "loael", "benchmark dose", "bmd", "bmdl",
+    "carcinogenic", "mutagen", "genotoxic", "teratogenic",
+    "lc50", "ld50", "in vitro", "in vivo", "metabolite"
+]

+def tox_term_counts(text: str) -> List[Tuple[str, int]]:
+    t = (text or "").lower()
+    counts = []
+    for term in TOX_TERMS:
+        c = len(re.findall(r"\b" + re.escape(term) + r"\b", t))
+        if c > 0:
+            counts.append((term, c))
+    counts.sort(key=lambda x: x[1], reverse=True)
+    return counts


+# -----------------------------
+# Batch pipeline + reporting
+# -----------------------------
+def build_context_report(
+    filename: str,
+    title: str,
+    pages: int,
+    word_count: int,
+    sent_score: float,
+    sent_label: str,
+    keywords: List[Tuple[str, float]],
+    abstract: str,
+    headings: List[str],
+    summary: str,
+    cas: List[str],
+    tox_counts: List[Tuple[str, int]]
+) -> str:
+    kw = ", ".join([k for k, _ in keywords[:15]]) if keywords else "(none)"
+    cas_str = ", ".join(cas[:15]) + (" ..." if len(cas) > 15 else "") if cas else "(none)"
+    headings_str = "\n".join([f"- {h}" for h in headings]) if headings else "- (none detected)"
+    tox_str = "\n".join([f"- {t}: {c}" for t, c in tox_counts[:12]]) if tox_counts else "- (none detected)"
+
+    abs_block = abstract if abstract else "(abstract not detected)"
+    sum_block = summary if summary else "(summary unavailable)"
+
+    return f"""## {filename}
+
+**Title (heuristic):** {title or "(not detected)"}
+**Pages:** {pages}
+**Approx. word count:** {word_count:,}
+
+### Sentiment / Tone
+- **Average compound score:** {sent_score:.3f}
+- **Label:** **{sent_label}**
+> Interpretation note: for research papers, this is best read as *tone polarity* rather than emotion.
+
+### Keywords (TF-IDF)
+{kw}
+
+### Abstract (if detected)
+{abs_block}
+
+### Extractive summary (TextRank)
+{sum_block}
+
+### Section outline (heuristic)
+{headings_str}
+
+### CAS numbers detected
+{cas_str}
+
+### Toxicology concept coverage
+{tox_str}
+"""
+
+
+def analyze_pdfs(files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud):
+    ensure_nltk()

+    if not files:
+        return None, None, [], "", None, None, None, "Upload one or more PDFs."
+
+    top_k_keywords = int(top_k_keywords)
+    summary_sentences = int(summary_sentences)
+    chunk_chars = int(chunk_chars)
+    max_pages = int(max_pages)
+
+    results_rows = []
+    details: Dict[str, Dict] = {}
+
+    tmpdir = Path(tempfile.mkdtemp(prefix="tox_paper_nlp_"))
+
+    for f in files:
+        pdf_path = f.name
+        filename = os.path.basename(pdf_path)
+
+        raw_text, pages = extract_text_from_pdf(pdf_path, max_pages=max_pages)
+        raw_text = raw_text or ""
+        word_count = len(clean_whitespace(raw_text).split())
+
+        # sentiment
+        sent_score, sent_label, chunk_scores = vader_doc_sentiment(raw_text, chunk_chars=chunk_chars)
+
+        # keywords + summary + context
+        keywords = extract_keywords_tfidf(raw_text, top_k=top_k_keywords)
+        abstract = extract_abstract(raw_text)
+        title = detect_title(raw_text)
+        headings = extract_section_headings(raw_text)
+        summary = textrank_summary(raw_text, num_sentences=summary_sentences)
+        cas = detect_cas_numbers(raw_text)
+        tox_counts = tox_term_counts(raw_text)
+
+        report_md = build_context_report(
+            filename=filename,
+            title=title,
+            pages=pages,
+            word_count=word_count,
+            sent_score=sent_score,
+            sent_label=sent_label,
+            keywords=keywords,
+            abstract=abstract,
+            headings=headings,
+            summary=summary,
+            cas=cas,
+            tox_counts=tox_counts
         )

+        # Save extracted text + per-doc JSON for portability
+        txt_path = tmpdir / f"{Path(filename).stem}.txt"
+        txt_path.write_text(raw_text, encoding="utf-8", errors="ignore")
+
+        details[filename] = {
+            "filename": filename,
+            "pages": pages,
+            "word_count": word_count,
+            "sentiment_score": sent_score,
+            "sentiment_label": sent_label,
+            "chunk_scores": chunk_scores,
+            "keywords": keywords,
+            "abstract": abstract,
+            "title": title,
+            "headings": headings,
+            "summary": summary,
+            "cas_numbers": cas,
+            "tox_term_counts": tox_counts,
+            "report_md": report_md,
+            "text_path": str(txt_path),
+            "raw_text_preview": (raw_text[:6000] + " ...") if len(raw_text) > 6000 else raw_text
+        }
+
+        results_rows.append({
+            "file": filename,
+            "pages": pages,
+            "word_count": word_count,
+            "sentiment_score": round(sent_score, 4),
+            "sentiment_label": sent_label,
+            "top_keywords": ", ".join([k for k, _ in keywords[:10]]),
+            "cas_count": len(cas),
+        })
+
+    df = pd.DataFrame(results_rows).sort_values(["sentiment_score", "word_count"], ascending=[False, False])
+
+    # Save table as CSV for download
+    csv_path = tmpdir / "pdf_nlp_results.csv"
+    df.to_csv(csv_path, index=False)
+
+    # Populate doc selector and default view
+    doc_names = list(details.keys())
+    first = doc_names[0]
+
+    state = details
+    report_md = details[first]["report_md"]
+
+    # sentiment distribution plot for first doc
+    fig_sent = None
+    scores = details[first]["chunk_scores"]
+    if scores:
+        fig_sent = plt.figure()
+        ax = fig_sent.add_subplot(111)
+        sns.histplot(scores, kde=True, ax=ax)
+        ax.set_title(f"Chunk Sentiment Distribution: {first}")
+        ax.set_xlabel("VADER compound score")
+        ax.set_ylabel("Chunk count")
+        fig_sent.tight_layout()
+
+    fig_wc = None
+    if make_wordcloud:
+        fig_wc = make_wordcloud_figure(details[first]["raw_text_preview"])

+    return df, str(csv_path), doc_names, report_md, fig_sent, fig_wc, details[first]["raw_text_preview"], "Done."


+def render_doc(doc_name, state, make_wordcloud):
+    if not state or not doc_name or doc_name not in state:
+        return "", None, None, ""

+    d = state[doc_name]
+    report_md = d["report_md"]
+    preview = d["raw_text_preview"]

+    fig_sent = None
+    scores = d.get("chunk_scores", [])
+    if scores:
+        fig_sent = plt.figure()
+        ax = fig_sent.add_subplot(111)
+        sns.histplot(scores, kde=True, ax=ax)
+        ax.set_title(f"Chunk Sentiment Distribution: {doc_name}")
+        ax.set_xlabel("VADER compound score")
+        ax.set_ylabel("Chunk count")
+        fig_sent.tight_layout()

+    fig_wc = None
     if make_wordcloud:
+        fig_wc = make_wordcloud_figure(preview)

+    return report_md, fig_sent, fig_wc, preview


 # -----------------------------
 # Gradio UI
 # -----------------------------
+with gr.Blocks(title="Toxicology PDF NLP Analyzer") as demo:
+    gr.Markdown("# Toxicology PDF NLP Analyzer")
+
+    state = gr.State({})
+
+    with gr.Tab("Batch (Upload PDFs)"):
+        files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")

         with gr.Row():
+            top_k_keywords = gr.Slider(5, 50, value=20, step=1, label="Top keywords (TF-IDF)")
+            summary_sentences = gr.Slider(2, 12, value=6, step=1, label="Summary sentences (TextRank)")
         with gr.Row():
+            chunk_chars = gr.Slider(800, 8000, value=3000, step=100, label="Chunk size for sentiment (chars)")
+            max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
+            make_wordcloud = gr.Checkbox(label="Generate word cloud", value=True)

+        run_btn = gr.Button("Analyze PDFs")

         status = gr.Textbox(label="Status", interactive=False)

+        results_df = gr.Dataframe(label="Batch Results", interactive=False)
+        results_csv = gr.File(label="Download: results CSV")

         with gr.Row():
+            doc_selector = gr.Dropdown(label="Select a document for details", choices=[], value=None)

+        report_md = gr.Markdown()
+        sent_plot = gr.Plot(label="Sentiment Distribution (by chunk)")
+        wc_plot = gr.Plot(label="Word Cloud")
+        raw_preview = gr.Textbox(label="Extracted text preview (first ~6k chars)", lines=10)

         run_btn.click(
+            fn=analyze_pdfs,
+            inputs=[files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud],
+            outputs=[results_df, results_csv, doc_selector, report_md, sent_plot, wc_plot, raw_preview, status]
+        ).then(
+            fn=lambda d: d, inputs=None, outputs=state
         )

+        # Update details view on selection change
+        doc_selector.change(
+            fn=render_doc,
+            inputs=[doc_selector, state, make_wordcloud],
+            outputs=[report_md, sent_plot, wc_plot, raw_preview]
         )

+
 if __name__ == "__main__":
     port = int(os.environ.get("PORT", "7860"))
     demo.launch(server_name="0.0.0.0", server_port=port)
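For a quick check outside the Gradio UI, the helpers added above (extract_text_from_pdf, vader_doc_sentiment, extract_keywords_tfidf, textrank_summary, detect_cas_numbers) can be driven directly from a script. A minimal sketch, assuming the new app.py is importable as `app` and that "paper.pdf" stands in for a local, text-based (not scanned) PDF:

# Sketch only: the module name "app" and the path "paper.pdf" are assumptions; adjust to your setup.
from app import (
    ensure_nltk,
    extract_text_from_pdf,
    vader_doc_sentiment,
    extract_keywords_tfidf,
    textrank_summary,
    detect_cas_numbers,
)

ensure_nltk()  # one-time NLTK downloads (punkt, punkt_tab, vader_lexicon)

text, pages = extract_text_from_pdf("paper.pdf", max_pages=0)  # 0 = read all pages
avg, label, chunk_scores = vader_doc_sentiment(text, chunk_chars=3000)
keywords = extract_keywords_tfidf(text, top_k=10)
summary = textrank_summary(text, num_sentences=4)
cas_ids = detect_cas_numbers(text)

print(f"{pages} pages, tone {label} ({avg:+.3f}) across {len(chunk_scores)} chunks")
print("keywords:", ", ".join(term for term, _ in keywords))
print("CAS numbers:", ", ".join(cas_ids) or "(none)")
print("summary:", summary)

This is the same sequence analyze_pdfs runs for each uploaded file before it assembles the per-document Markdown report and the downloadable results CSV.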