TextSense / app.py
npaleti2002's picture
Update app.py
3931bcf verified
import gradio as gr
from transformers import pipeline
from functools import lru_cache
# Fallback candidate topics for zero-shot classification, used whenever the
# user leaves the label textbox empty (see analyze()).
DEFAULT_LABELS = [
    "finance", "sports", "tech", "politics", "health", "entertainment",
    "science", "business", "travel", "education"
]
@lru_cache(maxsize=1)
def get_pipes():
    """Lazily build the three Hugging Face pipelines exactly once.

    Returns a (summarizer, zero_shot, sentiment) tuple; lru_cache(maxsize=1)
    means the (slow) model downloads happen only on the first call.
    """
    sentiment_model = "cardiffnlp/twitter-roberta-base-sentiment-latest"
    return (
        pipeline(
            "summarization",
            model="sshleifer/distilbart-cnn-12-6"
        ),
        pipeline(
            "zero-shot-classification",
            model="valhalla/distilbart-mnli-12-1"
        ),
        # 3-class sentiment: NEGATIVE / NEUTRAL / POSITIVE
        pipeline(
            "sentiment-analysis",
            model=sentiment_model,
            tokenizer=sentiment_model
        ),
    )
def chunk_text(text: str, max_chars: int = 1600):
    """Naive chunker to keep inputs within summarizer limits.

    Splits on sentences by '. ' and groups them into chunks of at most
    ~max_chars characters. A single sentence longer than max_chars is
    hard-split into max_chars slices (the original code emitted it as one
    oversized chunk, which defeats the purpose of chunking).

    Returns a list of non-empty chunk strings; [] for empty/whitespace input.
    """
    sentences = [s.strip() for s in text.replace("\n", " ").split(". ") if s.strip()]
    chunks, buf = [], ""
    for s in sentences:
        # Re-attach the sentence terminator lost by split(". ").
        piece = s + (". " if not s.endswith(".") else " ")
        if len(buf) + len(piece) <= max_chars:
            buf += piece
            continue
        # Current buffer is full: flush it before starting a new one.
        if buf:
            chunks.append(buf.strip())
            buf = ""
        if len(piece) <= max_chars:
            buf = piece
        else:
            # Pathological sentence longer than the window: hard-split it so
            # no chunk can exceed max_chars.
            for i in range(0, len(piece), max_chars):
                part = piece[i:i + max_chars].strip()
                if part:
                    chunks.append(part)
    if buf:
        chunks.append(buf.strip())
    # Fallback if text had no sentences at all (e.g. whitespace-only input).
    if not chunks:
        chunks = [text[i:i + max_chars] for i in range(0, len(text), max_chars)]
    return chunks
def summarize_long(text: str, target_words: int = 120):
    """Summarize long text: summarize each chunk, then fuse if still too long.

    target_words is a rough word budget mapped onto the model's token limits.
    """
    summarizer = get_pipes()[0]
    # Map the rough word target onto min/max token lengths for the model.
    max_len = min(256, max(64, int(target_words * 1.6)))
    min_len = max(20, int(max_len * 0.4))

    def _summarize(snippet):
        # One deterministic summarization pass.
        result = summarizer(snippet, max_length=max_len, min_length=min_len, do_sample=False)
        return result[0]["summary_text"]

    pieces = []
    for chunk in chunk_text(text, max_chars=1600):
        try:
            pieces.append(_summarize(chunk))
        except Exception:
            # If the model complains about length, retry on a smaller window.
            pieces.append(_summarize(chunk[:1200]))

    fused = " ".join(pieces)
    # Multiple chunk summaries that together overshoot the budget get a
    # second fusing pass through the summarizer.
    if len(pieces) > 1 and len(fused.split()) > target_words:
        return _summarize(fused).strip()
    return fused.strip()
def classify_topics(text: str, labels: list[str]):
    """Multi-label zero-shot classification.

    Returns (ranked, top3): all (label, score) pairs sorted by score
    descending, plus the first three of them.
    """
    zshot = get_pipes()[1]
    result = zshot(text, candidate_labels=labels, multi_label=True)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked, ranked[:3]
def analyze_sentiment(text: str):
    """3-class sentiment with chunk-aware averaging for long inputs.

    Returns (label, score): the winning class name (NEGATIVE / NEUTRAL /
    POSITIVE) and its mean probability across the sampled chunks.
    """
    _, _, sentiment = get_pipes()
    # Smaller chunks for sentiment; keep only the first few for speed.
    s_chunks = chunk_text(text, max_chars=300) or [text[:300]]
    s_chunks = s_chunks[:8]
    agg = {"NEGATIVE": 0.0, "NEUTRAL": 0.0, "POSITIVE": 0.0}
    for ch in s_chunks:
        # top_k=None returns the scores for every class; it replaces the
        # deprecated return_all_scores=True argument.
        scores = sentiment(ch, top_k=None)
        # Older transformers versions nest the per-class list one level
        # deeper for single-string inputs; normalize both shapes.
        if scores and isinstance(scores[0], list):
            scores = scores[0]
        for s in scores:
            label = s["label"].upper()
            if label in agg:  # ignore unexpected labels instead of raising KeyError
                agg[label] += float(s["score"])
    n = float(len(s_chunks))
    for k in agg:
        agg[k] /= n
    label = max(agg, key=agg.get)
    score = agg[label]
    return label, score
def analyze(text, labels_csv, summary_words):
    """Run summary, topic classification and sentiment on one input.

    Returns (summary, table_rows, top_topics_str, sentiment_label,
    sentiment_score) in the order the Gradio outputs expect.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        # Blank outputs for every UI component when there is no input.
        return "", [], "", "", 0.0

    # CSV textbox -> label list, falling back to the defaults.
    raw = (labels_csv or "").strip()
    labels = [part.strip() for part in raw.split(",") if part.strip()]
    if not labels:
        labels = DEFAULT_LABELS

    summary = summarize_long(cleaned, target_words=int(summary_words))
    pairs, top3 = classify_topics(cleaned, labels)
    sent_label, sent_score = analyze_sentiment(cleaned)

    # Friendly "label (score)" string for the top topics.
    top_str = ", ".join(f"{lab} ({score:.2f})" for lab, score in top3)
    # Dataframe wants list-of-rows.
    table_rows = [[lab, round(score, 4)] for lab, score in pairs]
    return summary, table_rows, top_str, sent_label, sent_score
# ---------------------------------------------------------------------------
# UI: a Blocks layout with inputs on the left and tabbed results on the right.
# ---------------------------------------------------------------------------
with gr.Blocks(title="TriScope — Text Insight Stack", css="""
:root{--radius:16px}
.header {font-size: 28px; font-weight: 800;}
.subtle {opacity:.8}
.card {border:1px solid #e5e7eb; border-radius: var(--radius); padding:16px}
""") as demo:
    gr.Markdown("""
<div class="header">🧠 TriScope — Text Insight Stack</div>
<div class="subtle">Summarize • Topic Classify • Sentiment — powered by three open models on Hugging Face</div>
""")
    with gr.Row():
        # Left column: all inputs plus the run button.
        with gr.Column(scale=5):
            txt = gr.Textbox(
                label="Paste text",
                placeholder="Paste any article, JD, email, or paragraph...",
                lines=12,
                elem_classes=["card"],
            )
            labels = gr.Textbox(
                label="Candidate topic labels (comma-separated)",
                value=", ".join(DEFAULT_LABELS),
                elem_classes=["card"],
            )
            words = gr.Slider(
                minimum=40, maximum=200, value=120, step=10,
                label="Target summary length (words)",
                elem_classes=["card"],
            )
            run = gr.Button("Analyze", variant="primary")
        # Right column: one tab per analysis result.
        with gr.Column(scale=5):
            with gr.Tab("Summary"):
                out_summary = gr.Markdown()
            with gr.Tab("Topics"):
                out_table = gr.Dataframe(headers=["label", "score"], datatype=["str", "number"], interactive=False)
                out_top = gr.Markdown()
            with gr.Tab("Sentiment"):
                # Show 3 classes
                out_sent_label = gr.Label(num_top_classes=3)
                out_sent_score = gr.Number(label="Confidence score")
    # One canned example to let users try the app with a single click.
    gr.Examples(
        label="Try an example",
        examples=[[
            "Open-source models are transforming AI by enabling broad access to powerful capabilities. However, organizations must balance innovation with governance, ensuring that safety and compliance keep pace with deployment. This article explores how companies can adopt a pragmatic approach to evaluation, monitoring, and human oversight while still benefiting from the speed of open development."
        ]],
        inputs=[txt]
    )
    # Wire the button: outputs must match analyze()'s return order.
    run.click(
        analyze,
        inputs=[txt, labels, words],
        outputs=[out_summary, out_table, out_top, out_sent_label, out_sent_score]
    )
if __name__ == "__main__":
    # Helpful for Spaces; enables logs and proper binding
    # (0.0.0.0:7860 is the address/port Hugging Face Spaces expects).
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)