hchevva committed on
Commit
5f37dd4
·
verified ·
1 Parent(s): 8c43de8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +414 -266
app.py CHANGED
@@ -1,21 +1,29 @@
1
  import os
2
  import re
 
3
  import tempfile
4
  from pathlib import Path
 
5
 
 
6
  import numpy as np
7
  import pandas as pd
8
- import gradio as gr
9
 
10
  import nltk
11
- from nltk.corpus import stopwords
12
- from nltk.stem import WordNetLemmatizer
13
  from nltk.sentiment import SentimentIntensityAnalyzer
14
 
 
 
 
 
15
  import matplotlib.pyplot as plt
16
  import seaborn as sns
17
  from wordcloud import WordCloud
18
 
 
 
 
 
19
 
20
  # -----------------------------
21
  # NLTK setup (downloads once)
@@ -26,338 +34,478 @@ def ensure_nltk():
26
  global _NLTK_READY
27
  if _NLTK_READY:
28
  return
29
-
30
- # Download required resources (safe to call multiple times)
31
- nltk.download("stopwords", quiet=True)
32
  nltk.download("punkt", quiet=True)
33
- nltk.download("punkt_tab", quiet=True) # some environments need this
34
- nltk.download("wordnet", quiet=True)
35
  nltk.download("vader_lexicon", quiet=True)
36
-
37
  _NLTK_READY = True
38
 
39
 
40
  # -----------------------------
41
- # Text preprocessing (close to notebook intent)
42
  # -----------------------------
43
- def extract_comment_body(text: str) -> str:
44
  """
45
- Notebook-style datasets sometimes store comment bodies inside brackets like: [...comment...]
46
- If bracketed content exists, extract it; else return the original text.
47
  """
48
- if text is None:
49
- return ""
50
- s = str(text)
51
 
52
- # Try bracket extraction: first [ ... ]
53
- m = re.search(r"\[(.*?)\]", s)
54
- if m and m.group(1).strip():
55
- return m.group(1).strip()
 
 
 
 
56
 
57
- return s.strip()
58
 
59
 
60
- def normalize_text(text: str, stop_words: set, lemmatizer: WordNetLemmatizer) -> str:
 
 
 
 
 
 
 
 
 
61
  """
62
- Basic normalization: ASCII cleanup, lowercase, remove URLs, punctuation,
63
- tokenize, remove stopwords, lemmatize, re-join.
64
  """
65
- if text is None:
66
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # keep only ascii
69
- text = text.encode("ascii", errors="ignore").decode("ascii")
70
- text = text.lower()
71
 
72
- # remove urls
73
- text = re.sub(r"http\S+|www\.\S+", " ", text)
74
 
75
- # remove punctuation / non-word
76
- text = re.sub(r"[^a-z0-9\s]", " ", text)
 
 
 
 
77
 
78
- # collapse whitespace
79
- text = re.sub(r"\s+", " ", text).strip()
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  if not text:
82
  return ""
 
83
 
84
- tokens = nltk.word_tokenize(text)
85
- tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
86
- tokens = [lemmatizer.lemmatize(t) for t in tokens]
87
-
88
- return " ".join(tokens)
89
-
90
 
91
- def vader_label(sia: SentimentIntensityAnalyzer, text: str) -> str:
92
  """
93
- Standard VADER thresholds:
94
- compound >= 0.05 => Positive
95
- compound <= -0.05 => Negative
96
- else Neutral
97
  """
98
- scores = sia.polarity_scores(text or "")
99
- c = scores.get("compound", 0.0)
100
- if c >= 0.05:
101
- return "Positive"
102
- if c <= -0.05:
103
- return "Negative"
104
- return "Neutral"
105
-
106
-
107
- # -----------------------------
108
- # Core analysis pipeline
109
- # -----------------------------
110
- def auto_detect_columns(df: pd.DataFrame):
111
  """
112
- Best-effort detection of player + text columns.
113
- Uses common names from lab-style datasets.
114
  """
115
- cols = [c.lower() for c in df.columns]
116
-
117
- # Player column candidates
118
- player_candidates = ["player", "player_name", "name", "prospect", "athlete"]
119
- player_col = None
120
- for cand in player_candidates:
121
- if cand in cols:
122
- player_col = df.columns[cols.index(cand)]
123
- break
124
 
125
- # Text column candidates
126
- text_candidates = ["text", "body", "comment", "comment_body", "content", "message"]
127
- text_col = None
128
- for cand in text_candidates:
129
- if cand in cols:
130
- text_col = df.columns[cols.index(cand)]
 
 
 
 
 
 
 
 
 
131
  break
 
 
 
 
 
 
 
 
 
132
 
133
- # Fallbacks: first object-like columns
134
- if player_col is None:
135
- obj_cols = [c for c in df.columns if df[c].dtype == "object"]
136
- if obj_cols:
137
- player_col = obj_cols[0]
138
-
139
- if text_col is None:
140
- obj_cols = [c for c in df.columns if df[c].dtype == "object"]
141
- if len(obj_cols) >= 2:
142
- text_col = obj_cols[1]
143
- elif obj_cols:
144
- text_col = obj_cols[0]
145
-
146
- return player_col, text_col
147
-
148
-
149
- def run_analysis(file_obj, player_col_in, text_col_in, max_rows, make_wordcloud):
150
  """
151
- Returns:
152
- preview_df, processed_csv_file, player_csv_file, top25_csv_file,
153
- fig_distribution, fig_top25, fig_wordcloud, status_text
154
  """
155
- ensure_nltk()
 
 
 
 
 
 
 
 
156
 
157
- if file_obj is None:
158
- return None, None, None, None, None, None, None, "Please upload a CSV file."
 
 
 
 
159
 
160
- # Load CSV
161
- df = pd.read_csv(file_obj.name)
 
 
 
 
 
 
 
162
 
163
- if df.empty:
164
- return None, None, None, None, None, None, None, "Uploaded CSV is empty."
165
 
166
- # Choose columns (manual overrides if provided)
167
- auto_player, auto_text = auto_detect_columns(df)
168
- player_col = player_col_in if player_col_in and player_col_in in df.columns else auto_player
169
- text_col = text_col_in if text_col_in and text_col_in in df.columns else auto_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
170
 
171
- if player_col is None or text_col is None:
172
- return None, None, None, None, None, None, None, (
173
- "Could not detect player/text columns. "
174
- "Please specify them in the dropdowns."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  )
176
 
177
- # Optionally limit rows for speed
178
- if max_rows and max_rows > 0:
179
- df = df.head(int(max_rows)).copy()
180
- else:
181
- df = df.copy()
182
-
183
- # Basic cleanup (match lab intent: remove possible metadata-ish rows if any)
184
- # If text_col contains a header-like row embedded, filter it out.
185
- df[text_col] = df[text_col].astype(str)
186
- df = df[~df[text_col].str.contains(r"body,score,controversiality", case=False, na=False)]
187
-
188
- # Preprocess
189
- stop_words = set(stopwords.words("english"))
190
- lemmatizer = WordNetLemmatizer()
191
- sia = SentimentIntensityAnalyzer()
192
-
193
- df["player"] = df[player_col].astype(str)
194
- df["raw_text"] = df[text_col].astype(str)
195
-
196
- # Extract bracket body (if present), then normalize
197
- df["comment_body"] = df["raw_text"].apply(extract_comment_body)
198
- df["clean_text"] = df["comment_body"].apply(lambda t: normalize_text(t, stop_words, lemmatizer))
199
-
200
- # Sentiment
201
- df["sentiment"] = df["clean_text"].apply(lambda t: vader_label(sia, t))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
202
 
203
- # Comment-level output
204
- processed_cols = ["player", "raw_text", "comment_body", "clean_text", "sentiment"]
205
- processed = df[processed_cols].copy()
206
 
207
- # Player-level aggregation
208
- counts = (
209
- processed.groupby("player")["sentiment"]
210
- .value_counts()
211
- .unstack(fill_value=0)
212
- .rename_axis(None, axis=1)
213
- )
214
 
215
- # Ensure all columns exist
216
- for c in ["Positive", "Neutral", "Negative"]:
217
- if c not in counts.columns:
218
- counts[c] = 0
219
 
220
- counts["total"] = counts[["Positive", "Neutral", "Negative"]].sum(axis=1)
221
- counts["percent_positive"] = np.where(counts["total"] > 0, (counts["Positive"] / counts["total"]) * 100, 0.0)
 
222
 
223
- # Overall sentiment score: (pos - neg) / total (range [-1, 1])
224
- counts["overall_sentiment_score"] = np.where(
225
- counts["total"] > 0,
226
- (counts["Positive"] - counts["Negative"]) / counts["total"],
227
- 0.0
228
- )
 
 
 
 
229
 
230
- # Sort top 25 by score, then by total volume
231
- top25 = counts.sort_values(["overall_sentiment_score", "total"], ascending=[False, False]).head(25).copy()
232
-
233
- # Save outputs to temp files for download
234
- tmpdir = Path(tempfile.mkdtemp(prefix="nfl_sentiment_"))
235
-
236
- processed_path = tmpdir / "NFL_reddit_sentiment_analysis.csv"
237
- players_path = tmpdir / "player_sentiment_results.csv"
238
- top25_path = tmpdir / "top_25_players.csv"
239
-
240
- processed.to_csv(processed_path, index=False)
241
- counts.reset_index().to_csv(players_path, index=False)
242
- top25.reset_index().to_csv(top25_path, index=False)
243
-
244
- # ---- Plots ----
245
- # 1) Sentiment distribution
246
- fig1 = plt.figure()
247
- ax1 = fig1.add_subplot(111)
248
- sns.countplot(data=processed, x="sentiment", ax=ax1)
249
- ax1.set_title("Overall Sentiment Distribution")
250
- ax1.set_xlabel("Sentiment")
251
- ax1.set_ylabel("Count")
252
- fig1.tight_layout()
253
-
254
- # 2) Top 25 bar plot
255
- fig2 = plt.figure(figsize=(10, 6))
256
- ax2 = fig2.add_subplot(111)
257
- top25_plot = top25.reset_index()
258
- sns.barplot(data=top25_plot, x="overall_sentiment_score", y="player", ax=ax2)
259
- ax2.set_title("Top 25 Players by Overall Sentiment Score")
260
- ax2.set_xlabel("Overall Sentiment Score")
261
- ax2.set_ylabel("Player")
262
- fig2.tight_layout()
263
-
264
- # 3) Word cloud (positive only)
265
- fig3 = None
266
  if make_wordcloud:
267
- positive_text = " ".join(processed.loc[processed["sentiment"] == "Positive", "clean_text"].dropna().astype(str).tolist())
268
- if positive_text.strip():
269
- wc = WordCloud(width=1200, height=600, background_color="white").generate(positive_text)
270
- fig3 = plt.figure(figsize=(10, 5))
271
- ax3 = fig3.add_subplot(111)
272
- ax3.imshow(wc, interpolation="bilinear")
273
- ax3.axis("off")
274
- ax3.set_title("Word Cloud (Positive Comments)")
275
- fig3.tight_layout()
276
-
277
- # Preview table
278
- preview = processed.head(25)
279
-
280
- status = (
281
- f"Loaded {len(df):,} rows. "
282
- f"Using player column: '{player_col}', text column: '{text_col}'. "
283
- f"Outputs saved for download."
284
- )
285
 
286
- return preview, str(processed_path), str(players_path), str(top25_path), fig1, fig2, fig3, status
287
-
288
-
289
- def sentiment_single_text(player_name, comment_text):
290
- ensure_nltk()
291
- sia = SentimentIntensityAnalyzer()
292
- stop_words = set(stopwords.words("english"))
293
- lemmatizer = WordNetLemmatizer()
294
-
295
- body = extract_comment_body(comment_text or "")
296
- clean = normalize_text(body, stop_words, lemmatizer)
297
- label = vader_label(sia, clean)
298
- scores = sia.polarity_scores(clean)
299
-
300
- out = {
301
- "player": player_name or "",
302
- "comment_body": body,
303
- "clean_text": clean,
304
- "sentiment": label,
305
- "vader_scores": scores
306
- }
307
- return out
308
 
309
 
310
  # -----------------------------
311
  # Gradio UI
312
  # -----------------------------
313
- with gr.Blocks(title="NFL Reddit Sentiment (NLP Lab App)") as demo:
314
- gr.Markdown("# NFL Reddit Sentiment Analysis (NLP Lab)")
 
 
 
 
 
315
 
316
- with gr.Tab("Batch Analysis (Upload CSV)"):
317
- with gr.Row():
318
- file_in = gr.File(label="Upload NFL Reddit CSV", file_types=[".csv"])
319
  with gr.Row():
320
- player_col_in = gr.Textbox(label="Player column name (optional)", placeholder="e.g., player")
321
- text_col_in = gr.Textbox(label="Text/comment column name (optional)", placeholder="e.g., text")
322
  with gr.Row():
323
- max_rows = gr.Number(label="Max rows (0 = all)", value=0, precision=0)
324
- make_wordcloud = gr.Checkbox(label="Generate word cloud (positive comments)", value=True)
 
325
 
326
- run_btn = gr.Button("Run Sentiment Analysis")
327
 
328
  status = gr.Textbox(label="Status", interactive=False)
329
 
330
- preview_df = gr.Dataframe(label="Preview (first 25 processed rows)", interactive=False)
 
331
 
332
  with gr.Row():
333
- processed_out = gr.File(label="Download: Comment-level sentiment CSV")
334
- players_out = gr.File(label="Download: Player-level sentiment results CSV")
335
- top25_out = gr.File(label="Download: Top 25 players CSV")
336
 
337
- dist_plot = gr.Plot(label="Plot: Sentiment Distribution")
338
- top25_plot = gr.Plot(label="Plot: Top 25 Players")
339
- wc_plot = gr.Plot(label="Plot: Word Cloud (Positive)")
 
340
 
341
  run_btn.click(
342
- fn=run_analysis,
343
- inputs=[file_in, player_col_in, text_col_in, max_rows, make_wordcloud],
344
- outputs=[preview_df, processed_out, players_out, top25_out, dist_plot, top25_plot, wc_plot, status]
 
 
345
  )
346
 
347
- with gr.Tab("Single Comment Sentiment"):
348
- gr.Markdown("Test sentiment on one comment using the same preprocessing + VADER logic.")
349
- player_name = gr.Textbox(label="Player name (optional)")
350
- comment_text = gr.Textbox(label="Comment text", lines=6, placeholder="Paste a Reddit comment here...")
351
- single_btn = gr.Button("Analyze Sentiment")
352
- single_out = gr.JSON(label="Result")
353
-
354
- single_btn.click(
355
- fn=sentiment_single_text,
356
- inputs=[player_name, comment_text],
357
- outputs=[single_out]
358
  )
359
 
 
360
  if __name__ == "__main__":
361
- # For local runs; on hosting platforms, PORT may be provided
362
  port = int(os.environ.get("PORT", "7860"))
363
  demo.launch(server_name="0.0.0.0", server_port=port)
 
1
  import os
2
  import re
3
+ import math
4
  import tempfile
5
  from pathlib import Path
6
+ from typing import Dict, List, Tuple
7
 
8
+ import gradio as gr
9
  import numpy as np
10
  import pandas as pd
 
11
 
12
  import nltk
 
 
13
  from nltk.sentiment import SentimentIntensityAnalyzer
14
 
15
+ from pypdf import PdfReader
16
+
17
+ from sklearn.feature_extraction.text import TfidfVectorizer
18
+
19
  import matplotlib.pyplot as plt
20
  import seaborn as sns
21
  from wordcloud import WordCloud
22
 
23
+ from sumy.parsers.plaintext import PlaintextParser
24
+ from sumy.nlp.tokenizers import Tokenizer
25
+ from sumy.summarizers.text_rank import TextRankSummarizer
26
+
27
 
28
  # -----------------------------
29
  # NLTK setup (downloads once)
 
34
  global _NLTK_READY
35
  if _NLTK_READY:
36
  return
 
 
 
37
  nltk.download("punkt", quiet=True)
38
+ nltk.download("punkt_tab", quiet=True) # some envs need this
 
39
  nltk.download("vader_lexicon", quiet=True)
 
40
  _NLTK_READY = True
41
 
42
 
43
  # -----------------------------
44
+ # PDF extraction
45
  # -----------------------------
46
def extract_text_from_pdf(pdf_path: str, max_pages: int = 0) -> Tuple[str, int]:
    """
    Pull the text layer out of a PDF.

    Returns a ``(text, page_count)`` pair. ``page_count`` is always the total
    number of pages in the file, even when fewer pages were read.
    ``max_pages`` of ``None``, 0 or a negative number means "read every page".
    Pages whose extraction raises, or that yield only whitespace, are skipped
    (best-effort), so scanned-image PDFs may produce little or no text.
    """
    pdf = PdfReader(pdf_path)
    total_pages = len(pdf.pages)

    if max_pages is None or max_pages <= 0:
        limit = total_pages
    else:
        limit = min(total_pages, max_pages)

    def _page_text(index: int) -> str:
        # Best-effort: a page that fails to extract contributes nothing.
        try:
            return pdf.pages[index].extract_text() or ""
        except Exception:
            return ""

    page_texts = (_page_text(i) for i in range(limit))
    kept = [t for t in page_texts if t.strip()]
    return "\n".join(kept).strip(), total_pages
65
 
66
 
67
+ # -----------------------------
68
+ # Utilities
69
+ # -----------------------------
70
def clean_whitespace(text: str) -> str:
    """
    Normalise whitespace in *text*.

    ``None`` is treated as an empty string, NUL characters become spaces, every
    run of whitespace collapses to a single space, and the result is trimmed.
    """
    without_nul = (text or "").replace("\x00", " ")
    return re.sub(r"\s+", " ", without_nul).strip()
75
+
76
def split_into_chunks(text: str, chunk_chars: int = 3000) -> List[str]:
    """
    Group the sentences of *text* into blocks of roughly ``chunk_chars`` characters.

    Sentences come from ``nltk.sent_tokenize`` and are never split, so a single
    sentence longer than ``chunk_chars`` becomes an oversized chunk of its own.
    Empty or whitespace-only input (or ``None``) yields an empty list.
    """
    text = text or ""
    if not text.strip():
        return []

    chunks: List[str] = []
    pending: List[str] = []
    pending_len = 0

    for sentence in (raw.strip() for raw in nltk.sent_tokenize(text)):
        if not sentence:
            continue

        # +1 accounts for the joining space.
        overflows = pending_len + len(sentence) + 1 > chunk_chars
        if pending and overflows:
            chunks.append(" ".join(pending))
            pending, pending_len = [sentence], len(sentence)
            continue

        pending.append(sentence)
        pending_len += len(sentence) + 1

    if pending:
        chunks.append(" ".join(pending))

    return chunks
105
+
106
def vader_doc_sentiment(text: str, chunk_chars: int = 3000) -> Tuple[float, str, List[float]]:
    """
    Score a whole document with VADER, chunk by chunk.

    Returns ``(avg_compound_score, label, chunk_scores)``. The label is
    "Positive" when the mean compound score is >= 0.05, "Negative" when it is
    <= -0.05, and "Neutral" otherwise. Text that produces no chunks returns
    ``(0.0, "Neutral", [])``.
    """
    ensure_nltk()
    analyzer = SentimentIntensityAnalyzer()

    pieces = split_into_chunks(text, chunk_chars=chunk_chars)
    if not pieces:
        return 0.0, "Neutral", []

    compounds = [analyzer.polarity_scores(piece).get("compound", 0.0) for piece in pieces]
    mean_compound = float(np.mean(compounds))

    if mean_compound >= 0.05:
        return mean_compound, "Positive", compounds
    if mean_compound <= -0.05:
        return mean_compound, "Negative", compounds
    return mean_compound, "Neutral", compounds
 
128
 
129
def extract_keywords_tfidf(text: str, top_k: int = 20) -> List[Tuple[str, float]]:
    """
    Rank the terms of a single document by TF-IDF weight.

    Uses English stop-word removal, unigrams + bigrams, and at most 5000
    features. Returns up to ``top_k`` (minimum 1) ``(term, score)`` pairs in
    descending score order; only terms with a positive score are included.

    Returns an empty list when *text* is empty/``None`` or when no term
    survives tokenisation (for example text made only of stop words, digits
    or punctuation), instead of letting the vectorizer's ``ValueError`` escape.
    """
    text = text or ""
    if not text.strip():
        return []

    vectorizer = TfidfVectorizer(
        stop_words="english",
        ngram_range=(1, 2),
        max_features=5000,
    )
    try:
        matrix = vectorizer.fit_transform([text])
    except ValueError:
        # scikit-learn raises "empty vocabulary" when nothing is left to
        # index; one keyword-less PDF must not abort the whole batch.
        return []

    terms = vectorizer.get_feature_names_out()
    weights = matrix.toarray().ravel()
    if weights.size == 0:
        return []

    limit = max(1, int(top_k))
    order = np.argsort(weights)[::-1][:limit]
    # str(...) turns numpy string scalars into plain Python strings.
    return [(str(terms[i]), float(weights[i])) for i in order if weights[i] > 0]
153
+
154
def make_wordcloud_figure(text: str):
    """
    Render *text* as a word-cloud matplotlib figure.

    Returns ``None`` for empty, whitespace-only or ``None`` input; otherwise a
    10x5 inch figure showing a 1200x600 white-background word cloud with the
    axes hidden.
    """
    source = text or ""
    if not source.strip():
        return None

    cloud = WordCloud(width=1200, height=600, background_color="white").generate(source)

    figure = plt.figure(figsize=(10, 5))
    axes = figure.add_subplot(111)
    axes.imshow(cloud, interpolation="bilinear")
    axes.axis("off")
    figure.tight_layout()
    return figure
165
+
166
def textrank_summary(text: str, num_sentences: int = 6) -> str:
    """
    Build an extractive summary of *text* with sumy's TextRank summarizer.

    At least one sentence is always requested. The selected sentences are
    joined with single spaces. Empty or ``None`` input returns "".
    """
    body = (text or "").strip()
    if not body:
        return ""

    wanted = max(1, int(num_sentences))
    document = PlaintextParser.from_string(body, Tokenizer("english")).document
    picked = TextRankSummarizer()(document, wanted)
    return " ".join(map(str, picked))
 
 
176
 
177
def detect_title(text: str) -> str:
    """
    Guess a document title from its first ~30 non-blank lines.

    The first line that is 8-200 characters long, does not start with
    "abstract" or "introduction", and does not look like an affiliation line
    (university / department / email / corresponding) wins. If none qualifies,
    the first non-blank line is returned; with no lines at all, "".
    """
    candidates = [ln.strip() for ln in (text or "").splitlines() if ln.strip()][:30]
    if not candidates:
        return ""

    for line in candidates:
        lowered = line.lower()
        if not 8 <= len(line) <= 200:
            continue
        if lowered.startswith(("abstract", "introduction")):
            continue
        # Skip obvious author / affiliation lines.
        if re.search(r"\b(university|department|email|corresponding)\b", lowered):
            continue
        return line

    return candidates[0]
190
+
191
def extract_abstract(text: str) -> str:
    """
    Return the text between the word "abstract" and the next "introduction"
    (or "1. introduction") heading, matched case-insensitively.

    The captured span is whitespace-normalised and cut to 2000 characters.
    Returns "" when the pattern is not found.
    """
    pattern = re.compile(
        r"\babstract\b(.*?)(\bintroduction\b|\b1\.\s*introduction\b)",
        flags=re.IGNORECASE | re.DOTALL,
    )
    found = pattern.search(text or "")
    if found is None:
        return ""

    # Keep the abstract to a reasonable length.
    return clean_whitespace(found.group(1))[:2000]
 
 
202
 
203
def extract_section_headings(text: str, max_headings: int = 20) -> List[str]:
    """
    Heuristically pick section headings out of extracted text.

    A line (after stripping, at most 120 characters) counts as a heading when:
    - it is numbered, e.g. "1. Introduction", "2 Methods", "3.1 Dosing"
      (number, optional trailing dot, whitespace, capital letter), or
    - it is all upper-case and 4-60 characters long, e.g. "RESULTS".

    Headings are de-duplicated case-insensitively in order of first
    appearance, and at most ``max_headings`` *distinct* headings are returned.
    Duplicates (such as a running page header repeated on every page) do not
    count towards the limit. A non-positive ``max_headings`` returns [].
    """
    if max_headings <= 0:
        return []

    # The optional "\.?" lets "1. Introduction" match, not only "1 Introduction".
    numbered = re.compile(r"^\d+(\.\d+)*\.?\s+[A-Z].{2,}$")

    seen = set()
    headings: List[str] = []
    for raw in (text or "").splitlines():
        line = raw.strip()
        if not line or len(line) > 120:
            continue

        is_numbered = numbered.match(line) is not None
        is_caps = line.isupper() and 4 <= len(line) <= 60
        if not (is_numbered or is_caps):
            continue

        key = line.lower()
        if key in seen:
            continue
        seen.add(key)
        headings.append(line)

        if len(headings) >= max_headings:
            break

    return headings
228
 
229
def detect_cas_numbers(text: str) -> List[str]:
    """
    Find strings shaped like CAS registry numbers (2-7 digits, 2 digits,
    1 digit, hyphen-separated) in *text*.

    Matching is purely by shape; the check digit is not verified. Each distinct
    match is returned once, in order of first appearance.
    """
    matches = re.findall(r"\b\d{2,7}-\d{2}-\d\b", text or "")
    # dict keys keep insertion order, giving an order-preserving de-dupe.
    return list(dict.fromkeys(matches))
242
 
243
# Toxicology vocabulary whose whole-word occurrences are tallied per document.
TOX_TERMS = [
    "hazard", "risk", "exposure", "dose", "response", "toxicity",
    "adverse", "noael", "loael", "benchmark dose", "bmd", "bmdl",
    "carcinogenic", "mutagen", "genotoxic", "teratogenic",
    "lc50", "ld50", "in vitro", "in vivo", "metabolite"
]


def tox_term_counts(text: str) -> List[Tuple[str, int]]:
    """
    Count whole-word, case-insensitive occurrences of each TOX_TERMS entry.

    Returns ``(term, count)`` pairs for terms that occur at least once, sorted
    by count in descending order; ties keep their TOX_TERMS order.
    """
    haystack = (text or "").lower()
    tallies = (
        (term, len(re.findall(r"\b" + re.escape(term) + r"\b", haystack)))
        for term in TOX_TERMS
    )
    present = [(term, hits) for term, hits in tallies if hits > 0]
    return sorted(present, key=lambda pair: pair[1], reverse=True)
259
 
 
 
260
 
261
+ # -----------------------------
262
+ # Batch pipeline + reporting
263
+ # -----------------------------
264
def build_context_report(
    filename: str,
    title: str,
    pages: int,
    word_count: int,
    sent_score: float,
    sent_label: str,
    keywords: List[Tuple[str, float]],
    abstract: str,
    headings: List[str],
    summary: str,
    cas: List[str],
    tox_counts: List[Tuple[str, int]]
) -> str:
    """
    Assemble the per-document Markdown report.

    Shows at most 15 keywords, 15 CAS numbers (followed by " ..." when more
    were found) and 12 toxicology term counts. Missing pieces are replaced by
    placeholder text such as "(none)" or "(abstract not detected)".
    """
    if keywords:
        kw = ", ".join(term for term, _ in keywords[:15])
    else:
        kw = "(none)"

    if cas:
        cas_str = ", ".join(cas[:15])
        if len(cas) > 15:
            cas_str += " ..."
    else:
        cas_str = "(none)"

    if headings:
        headings_str = "\n".join(f"- {h}" for h in headings)
    else:
        headings_str = "- (none detected)"

    if tox_counts:
        tox_str = "\n".join(f"- {t}: {c}" for t, c in tox_counts[:12])
    else:
        tox_str = "- (none detected)"

    abs_block = abstract or "(abstract not detected)"
    sum_block = summary or "(summary unavailable)"
    shown_title = title or "(not detected)"

    report_lines = [
        f"## {filename}",
        "",
        f"**Title (heuristic):** {shown_title}",
        f"**Pages:** {pages}",
        f"**Approx. word count:** {word_count:,}",
        "",
        "### Sentiment / Tone",
        f"- **Average compound score:** {sent_score:.3f}",
        f"- **Label:** **{sent_label}**",
        "> Interpretation note: for research papers, this is best read as *tone polarity* rather than emotion.",
        "",
        "### Keywords (TF-IDF)",
        kw,
        "",
        "### Abstract (if detected)",
        abs_block,
        "",
        "### Extractive summary (TextRank)",
        sum_block,
        "",
        "### Section outline (heuristic)",
        headings_str,
        "",
        "### CAS numbers detected",
        cas_str,
        "",
        "### Toxicology concept coverage",
        tox_str,
    ]
    return "\n".join(report_lines) + "\n"
315
+
316
+
317
def _chunk_score_figure(scores, doc_name):
    """Histogram (with KDE) of per-chunk VADER compound scores; None when there are no scores."""
    if not scores:
        return None
    fig = plt.figure()
    ax = fig.add_subplot(111)
    sns.histplot(scores, kde=True, ax=ax)
    ax.set_title(f"Chunk Sentiment Distribution: {doc_name}")
    ax.set_xlabel("VADER compound score")
    ax.set_ylabel("Chunk count")
    fig.tight_layout()
    return fig


def analyze_pdfs(files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud):
    """
    Run the full NLP pipeline over every uploaded PDF.

    Returns, in the order the UI wires its outputs:
        results DataFrame, path of the results CSV, a Dropdown update carrying
        the document names (first one selected), the first document's Markdown
        report, its chunk-sentiment figure, its word-cloud figure (or None),
        its text preview, a status string, and the per-document details dict
        that is stored in the session state for ``render_doc``.

    The details dict is returned as an output because it is the only channel
    through which the session state can receive it; without it the document
    selector has nothing to render from.
    """
    ensure_nltk()

    if not files:
        return (
            None, None, gr.update(choices=[], value=None), "",
            None, None, None, "Upload one or more PDFs.", {},
        )

    top_k_keywords = int(top_k_keywords)
    summary_sentences = int(summary_sentences)
    chunk_chars = int(chunk_chars)
    max_pages = int(max_pages)

    results_rows = []
    details: Dict[str, Dict] = {}

    tmpdir = Path(tempfile.mkdtemp(prefix="tox_paper_nlp_"))

    for f in files:
        # Uploads may arrive as file-like wrappers exposing .name or as plain
        # path strings; accept both.
        pdf_path = str(getattr(f, "name", f))
        filename = os.path.basename(pdf_path)

        raw_text, pages = extract_text_from_pdf(pdf_path, max_pages=max_pages)
        raw_text = raw_text or ""
        word_count = len(clean_whitespace(raw_text).split())

        # sentiment
        sent_score, sent_label, chunk_scores = vader_doc_sentiment(raw_text, chunk_chars=chunk_chars)

        # keywords + summary + context
        keywords = extract_keywords_tfidf(raw_text, top_k=top_k_keywords)
        abstract = extract_abstract(raw_text)
        title = detect_title(raw_text)
        headings = extract_section_headings(raw_text)
        summary = textrank_summary(raw_text, num_sentences=summary_sentences)
        cas = detect_cas_numbers(raw_text)
        tox_counts = tox_term_counts(raw_text)

        report_md = build_context_report(
            filename=filename,
            title=title,
            pages=pages,
            word_count=word_count,
            sent_score=sent_score,
            sent_label=sent_label,
            keywords=keywords,
            abstract=abstract,
            headings=headings,
            summary=summary,
            cas=cas,
            tox_counts=tox_counts
        )

        # Save extracted text for portability
        txt_path = tmpdir / f"{Path(filename).stem}.txt"
        txt_path.write_text(raw_text, encoding="utf-8", errors="ignore")

        details[filename] = {
            "filename": filename,
            "pages": pages,
            "word_count": word_count,
            "sentiment_score": sent_score,
            "sentiment_label": sent_label,
            "chunk_scores": chunk_scores,
            "keywords": keywords,
            "abstract": abstract,
            "title": title,
            "headings": headings,
            "summary": summary,
            "cas_numbers": cas,
            "tox_term_counts": tox_counts,
            "report_md": report_md,
            "text_path": str(txt_path),
            "raw_text_preview": (raw_text[:6000] + " ...") if len(raw_text) > 6000 else raw_text
        }

        results_rows.append({
            "file": filename,
            "pages": pages,
            "word_count": word_count,
            "sentiment_score": round(sent_score, 4),
            "sentiment_label": sent_label,
            "top_keywords": ", ".join([k for k, _ in keywords[:10]]),
            "cas_count": len(cas),
        })

    df = pd.DataFrame(results_rows).sort_values(["sentiment_score", "word_count"], ascending=[False, False])

    # Save table as CSV for download
    csv_path = tmpdir / "pdf_nlp_results.csv"
    df.to_csv(csv_path, index=False)

    # Populate doc selector and default view with the first uploaded document
    doc_names = list(details.keys())
    first = doc_names[0]
    entry = details[first]

    fig_sent = _chunk_score_figure(entry["chunk_scores"], first)
    fig_wc = make_wordcloud_figure(entry["raw_text_preview"]) if make_wordcloud else None

    return (
        df,
        str(csv_path),
        # An update object sets the dropdown's *choices*; a bare list would be
        # interpreted as its value.
        gr.update(choices=doc_names, value=first),
        entry["report_md"],
        fig_sent,
        fig_wc,
        entry["raw_text_preview"],
        "Done.",
        details,
    )


def render_doc(doc_name, state, make_wordcloud):
    """
    Re-render the detail view for one analysed document.

    ``state`` is the details dict produced by ``analyze_pdfs``. Returns
    ``(report_md, sentiment_figure, wordcloud_figure, text_preview)``; when the
    state is empty or ``doc_name`` is unknown, returns blanks.
    """
    if not state or not doc_name or doc_name not in state:
        return "", None, None, ""

    d = state[doc_name]
    preview = d["raw_text_preview"]

    fig_sent = _chunk_score_figure(d.get("chunk_scores", []), doc_name)
    fig_wc = make_wordcloud_figure(preview) if make_wordcloud else None

    return d["report_md"], fig_sent, fig_wc, preview


# -----------------------------
# Gradio UI
# -----------------------------
with gr.Blocks(title="Toxicology PDF NLP Analyzer") as demo:
    gr.Markdown("# Toxicology PDF NLP Analyzer")

    # Per-session store of the details dict returned by analyze_pdfs.
    state = gr.State({})

    with gr.Tab("Batch (Upload PDFs)"):
        files = gr.File(label="Upload toxicology research PDFs", file_types=[".pdf"], file_count="multiple")

        with gr.Row():
            top_k_keywords = gr.Slider(5, 50, value=20, step=1, label="Top keywords (TF-IDF)")
            summary_sentences = gr.Slider(2, 12, value=6, step=1, label="Summary sentences (TextRank)")
        with gr.Row():
            chunk_chars = gr.Slider(800, 8000, value=3000, step=100, label="Chunk size for sentiment (chars)")
            max_pages = gr.Slider(0, 200, value=0, step=1, label="Max pages to read (0 = all)")
            make_wordcloud = gr.Checkbox(label="Generate word cloud", value=True)

        run_btn = gr.Button("Analyze PDFs")

        status = gr.Textbox(label="Status", interactive=False)

        results_df = gr.Dataframe(label="Batch Results", interactive=False)
        results_csv = gr.File(label="Download: results CSV")

        with gr.Row():
            doc_selector = gr.Dropdown(label="Select a document for details", choices=[], value=None)

        report_md = gr.Markdown()
        sent_plot = gr.Plot(label="Sentiment Distribution (by chunk)")
        wc_plot = gr.Plot(label="Word Cloud")
        raw_preview = gr.Textbox(label="Extracted text preview (first ~6k chars)", lines=10)

        # The state is filled directly by analyze_pdfs' last output; a chained
        # step with no inputs has no way to receive the details dict.
        run_btn.click(
            fn=analyze_pdfs,
            inputs=[files, top_k_keywords, summary_sentences, chunk_chars, max_pages, make_wordcloud],
            outputs=[results_df, results_csv, doc_selector, report_md, sent_plot, wc_plot, raw_preview, status, state]
        )

        # Update details view on selection change
        doc_selector.change(
            fn=render_doc,
            inputs=[doc_selector, state, make_wordcloud],
            outputs=[report_md, sent_plot, wc_plot, raw_preview]
        )
507
 
508
+
509
if __name__ == "__main__":
    # Read the port from the PORT environment variable, defaulting to 7860.
    demo.launch(
        server_name="0.0.0.0",
        server_port=int(os.environ.get("PORT", "7860")),
    )