import gradio as gr from transformers import pipeline from functools import lru_cache DEFAULT_LABELS = [ "finance", "sports", "tech", "politics", "health", "entertainment", "science", "business", "travel", "education" ] @lru_cache(maxsize=1) def get_pipes(): summarizer = pipeline( "summarization", model="sshleifer/distilbart-cnn-12-6" ) zshot = pipeline( "zero-shot-classification", model="valhalla/distilbart-mnli-12-1" ) # 3-class sentiment: NEGATIVE / NEUTRAL / POSITIVE sentiment = pipeline( "sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment-latest", tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest" ) return summarizer, zshot, sentiment def chunk_text(text: str, max_chars: int = 1600): """Naive chunker to keep inputs within summarizer limits. Splits on sentences by '. ' and groups into ~max_chars chunks. """ sentences = [s.strip() for s in text.replace("\n", " ").split(". ") if s.strip()] chunks, buf = [], "" for s in sentences: add = (s + (". " if not s.endswith(".") else " ")) if len(buf) + len(add) <= max_chars: buf += add else: if buf: chunks.append(buf.strip()) buf = add if buf: chunks.append(buf.strip()) # Fallback if text had no periods if not chunks: for i in range(0, len(text), max_chars): chunks.append(text[i:i+max_chars]) return chunks def summarize_long(text: str, target_words: int = 120): summarizer, _, _ = get_pipes() # Map rough word target to token lengths max_len = min(256, max(64, int(target_words * 1.6))) min_len = max(20, int(max_len * 0.4)) pieces = [] for ch in chunk_text(text, max_chars=1600): try: out = summarizer(ch, max_length=max_len, min_length=min_len, do_sample=False) pieces.append(out[0]["summary_text"]) except Exception: # If the model complains about length, try a smaller window out = summarizer(ch[:1200], max_length=max_len, min_length=min_len, do_sample=False) pieces.append(out[0]["summary_text"]) # If multiple pieces, do a second pass to fuse fused = " ".join(pieces) if len(pieces) > 1 and len(fused.split()) > target_words: out = summarizer(fused, max_length=max_len, min_length=min_len, do_sample=False) return out[0]["summary_text"].strip() return fused.strip() def classify_topics(text: str, labels: list[str]): _, zshot, _ = get_pipes() res = zshot(text, candidate_labels=labels, multi_label=True) # Zip labels and scores, sort desc pairs = sorted(zip(res["labels"], res["scores"]), key=lambda x: x[1], reverse=True) top3 = pairs[:3] return pairs, top3 def analyze_sentiment(text: str): """3-class sentiment with chunk-aware averaging for long inputs.""" _, _, sentiment = get_pipes() # Smaller chunk for sentiment; keep first few for speed s_chunks = chunk_text(text, max_chars=300) or [text[:300]] s_chunks = s_chunks[:8] agg = {"NEGATIVE": 0.0, "NEUTRAL": 0.0, "POSITIVE": 0.0} for ch in s_chunks: scores = sentiment(ch, return_all_scores=True)[0] for s in scores: agg[s["label"].upper()] += float(s["score"]) n = float(len(s_chunks)) for k in agg: agg[k] /= n label = max(agg, key=agg.get) score = agg[label] return label, score def analyze(text, labels_csv, summary_words): text = (text or "").strip() if not text: return ( "", # summary [], # table rows "", # top topics string "", # sentiment label 0.0, # sentiment score ) # Prepare labels (CSV → list) labels_csv = (labels_csv or "").strip() labels = [l.strip() for l in labels_csv.split(",") if l.strip()] or DEFAULT_LABELS summary = summarize_long(text, target_words=int(summary_words)) pairs, top3 = classify_topics(text, labels) sent_label, sent_score = analyze_sentiment(text) # Build a friendly top-topics string top_str = ", ".join([f"{lab} ({score:.2f})" for lab, score in top3]) if top3 else "" # Convert for Dataframe: list[list] table_rows = [[lab, round(score, 4)] for lab, score in pairs] return summary, table_rows, top_str, sent_label, sent_score with gr.Blocks(title="TriScope — Text Insight Stack", css=""" :root{--radius:16px} .header {font-size: 28px; font-weight: 800;} .subtle {opacity:.8} .card {border:1px solid #e5e7eb; border-radius: var(--radius); padding:16px} """) as demo: gr.Markdown("""