File size: 6,994 Bytes
2e13d88
 
 
 
 
 
 
 
 
3931bcf
2e13d88
 
 
 
 
 
 
 
 
 
2028d24
2e13d88
 
2028d24
 
2e13d88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2028d24
2e13d88
2028d24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2e13d88
 
 
 
 
 
2028d24
2e13d88
2028d24
 
2e13d88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2028d24
2e13d88
 
 
 
 
 
2028d24
2e13d88
 
 
 
 
 
 
 
 
 
 
 
2028d24
2e13d88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2028d24
 
2e13d88
 
2028d24
2e13d88
 
2028d24
 
2e13d88
 
 
 
 
 
 
 
 
 
2028d24
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import gradio as gr
from transformers import pipeline
from functools import lru_cache

# Fallback candidate topics for zero-shot classification, used whenever the
# user leaves the comma-separated label box empty (see analyze()).
DEFAULT_LABELS = [
    "finance", "sports", "tech", "politics", "health", "entertainment",
    "science", "business", "travel", "education"
]


@lru_cache(maxsize=1)
def get_pipes():
    """Build and cache the three Hugging Face pipelines used by the app.

    Returns:
        tuple: ``(summarizer, zshot, sentiment)`` pipelines. The
        ``lru_cache(maxsize=1)`` decorator ensures the models are
        downloaded and loaded only once per process.
    """
    # Abstractive summarizer (distilled BART checkpoint).
    summarizer = pipeline(
        "summarization",
        model="sshleifer/distilbart-cnn-12-6"
    )
    # Zero-shot topic classifier (NLI-based distilled BART).
    zshot = pipeline(
        "zero-shot-classification",
        model="valhalla/distilbart-mnli-12-1"
    )
    # 3-class sentiment: NEGATIVE / NEUTRAL / POSITIVE
    sentiment = pipeline(
        "sentiment-analysis",
        model="cardiffnlp/twitter-roberta-base-sentiment-latest",
        tokenizer="cardiffnlp/twitter-roberta-base-sentiment-latest"
    )
    return summarizer, zshot, sentiment


def chunk_text(text: str, max_chars: int = 1600):
    """Naive chunker to keep inputs within summarizer limits.

    Splits on sentences by '. ' and groups them into chunks of at most
    ``max_chars`` characters. A single sentence longer than ``max_chars``
    is hard-split so that no returned chunk ever exceeds the limit
    (the original version could emit oversized chunks in that case,
    defeating the length cap).

    Args:
        text: Arbitrary input text; newlines are treated as spaces.
        max_chars: Upper bound on the length of each returned chunk.

    Returns:
        list[str]: Non-empty chunks; empty list for empty/whitespace text
        with no fallback characters.
    """
    sentences = [s.strip() for s in text.replace("\n", " ").split(". ") if s.strip()]
    chunks, buf = [], ""
    for s in sentences:
        # Re-attach the period lost by split("."); keep a trailing space
        # as a separator between grouped sentences.
        add = (s + (". " if not s.endswith(".") else " "))
        if len(buf) + len(add) <= max_chars:
            buf += add
        else:
            if buf:
                chunks.append(buf.strip())
            buf = add
            # Bug fix: a lone sentence longer than max_chars must be
            # hard-split, otherwise the chunk exceeds the limit.
            while len(buf) > max_chars:
                chunks.append(buf[:max_chars].strip())
                buf = buf[max_chars:]
    if buf:
        chunks.append(buf.strip())
    # Fallback if text had no periods (e.g. one giant unpunctuated blob).
    if not chunks:
        for i in range(0, len(text), max_chars):
            chunks.append(text[i:i+max_chars])
    return chunks


def summarize_long(text: str, target_words: int = 120):
    """Summarize arbitrarily long text by chunking, then fusing.

    Each ~1600-char chunk is summarized independently; when several
    partial summaries together overshoot the word target, a second
    summarization pass fuses them into one.

    Args:
        text: The full input text.
        target_words: Rough desired length of the final summary in words.

    Returns:
        str: The stripped summary text.
    """
    summarizer = get_pipes()[0]
    # Translate the rough word budget into model token limits.
    max_len = min(256, max(64, int(target_words * 1.6)))
    min_len = max(20, int(max_len * 0.4))

    partials = []
    for segment in chunk_text(text, max_chars=1600):
        try:
            result = summarizer(segment, max_length=max_len, min_length=min_len, do_sample=False)
        except Exception:
            # Best-effort retry on a smaller window if the model
            # complains about input length.
            result = summarizer(segment[:1200], max_length=max_len, min_length=min_len, do_sample=False)
        partials.append(result[0]["summary_text"])

    combined = " ".join(partials)
    # Fuse with a second pass only when multiple partials overshoot.
    if len(partials) > 1 and len(combined.split()) > target_words:
        result = summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)
        return result[0]["summary_text"].strip()
    return combined.strip()


def classify_topics(text: str, labels: list[str]):
    """Rank candidate topic labels against *text* via zero-shot NLI.

    Args:
        text: Text to classify.
        labels: Candidate label strings.

    Returns:
        tuple: ``(ranked, top3)`` where ``ranked`` is all
        ``(label, score)`` pairs sorted by score descending and
        ``top3`` is the first three of them.
    """
    zshot = get_pipes()[1]
    result = zshot(text, candidate_labels=labels, multi_label=True)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked, ranked[:3]


def analyze_sentiment(text: str):
    """3-class sentiment with chunk-aware averaging for long inputs.

    Splits the text into small chunks, averages the per-class
    probabilities over the first few chunks, and returns the
    winning class.

    Args:
        text: Input text (may be long; only the first 8 small chunks
            are scored, for speed).

    Returns:
        tuple: ``(label, score)`` — the argmax class name and its mean
        probability across the scored chunks.
    """
    _, _, sentiment = get_pipes()
    # Smaller chunk for sentiment; keep first few for speed
    s_chunks = chunk_text(text, max_chars=300) or [text[:300]]
    s_chunks = s_chunks[:8]

    agg = {"NEGATIVE": 0.0, "NEUTRAL": 0.0, "POSITIVE": 0.0}
    for ch in s_chunks:
        # top_k=None returns scores for every class; it replaces the
        # deprecated return_all_scores=True pipeline argument.
        out = sentiment(ch, top_k=None)
        # Single-string calls yield a flat list of {label, score} dicts;
        # older/batched call styles nest it one level deeper — accept both.
        scores = out[0] if out and isinstance(out[0], list) else out
        for s in scores:
            # Accumulate via .get() so an unexpected label name is
            # tallied instead of raising KeyError mid-analysis.
            label = s["label"].upper()
            agg[label] = agg.get(label, 0.0) + float(s["score"])
    n = float(len(s_chunks))
    for k in agg:
        agg[k] /= n

    label = max(agg, key=agg.get)
    score = agg[label]
    return label, score


def analyze(text, labels_csv, summary_words):
    """Run summarization, topic classification, and sentiment for the UI.

    Args:
        text: Raw user text (may be None/empty).
        labels_csv: Comma-separated candidate labels; blank falls back
            to DEFAULT_LABELS.
        summary_words: Target summary length in words (slider value).

    Returns:
        tuple: ``(summary, table_rows, top_topics_str, sentiment_label,
        sentiment_score)`` matching the Gradio output components.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        # Nothing to analyze: blank out every output component.
        return "", [], "", "", 0.0

    # CSV label string -> list, with the defaults as fallback.
    raw_labels = (labels_csv or "").strip()
    labels = [part.strip() for part in raw_labels.split(",") if part.strip()] or DEFAULT_LABELS

    summary = summarize_long(cleaned, target_words=int(summary_words))
    pairs, top3 = classify_topics(cleaned, labels)
    sent_label, sent_score = analyze_sentiment(cleaned)

    # Friendly "label (score)" listing of the leading topics.
    top_str = ", ".join(f"{lab} ({score:.2f})" for lab, score in top3) if top3 else ""

    # Dataframe component expects list-of-rows.
    table_rows = [[lab, round(score, 4)] for lab, score in pairs]

    return summary, table_rows, top_str, sent_label, sent_score


# UI definition. NOTE: components render in declaration order, so the
# statement order inside this block is load-bearing — code kept verbatim.
with gr.Blocks(title="TriScope — Text Insight Stack", css="""
:root{--radius:16px}
.header {font-size: 28px; font-weight: 800;}
.subtle {opacity:.8}
.card {border:1px solid #e5e7eb; border-radius: var(--radius); padding:16px}
""") as demo:
    gr.Markdown("""
    <div class="header">🧠 TriScope — Text Insight Stack</div>
    <div class="subtle">Summarize • Topic Classify • Sentiment — powered by three open models on Hugging Face</div>
    """)

    with gr.Row():
        # Left column: all inputs plus the trigger button.
        with gr.Column(scale=5):
            txt = gr.Textbox(
                label="Paste text",
                placeholder="Paste any article, JD, email, or paragraph...",
                lines=12,
                elem_classes=["card"],
            )
            labels = gr.Textbox(
                label="Candidate topic labels (comma-separated)",
                value=", ".join(DEFAULT_LABELS),
                elem_classes=["card"],
            )
            words = gr.Slider(
                minimum=40, maximum=200, value=120, step=10,
                label="Target summary length (words)",
                elem_classes=["card"],
            )
            run = gr.Button("Analyze", variant="primary")

        # Right column: one tab per analysis result.
        with gr.Column(scale=5):
            with gr.Tab("Summary"):
                out_summary = gr.Markdown()
            with gr.Tab("Topics"):
                out_table = gr.Dataframe(headers=["label", "score"], datatype=["str", "number"], interactive=False)
                out_top = gr.Markdown()
            with gr.Tab("Sentiment"):
                # Show 3 classes
                # NOTE(review): analyze() returns a single label string, so
                # num_top_classes=3 may never display three classes — confirm
                # against gr.Label's accepted input types.
                out_sent_label = gr.Label(num_top_classes=3)
                out_sent_score = gr.Number(label="Confidence score")

    gr.Examples(
        label="Try an example",
        examples=[[
            "Open-source models are transforming AI by enabling broad access to powerful capabilities. However, organizations must balance innovation with governance, ensuring that safety and compliance keep pace with deployment. This article explores how companies can adopt a pragmatic approach to evaluation, monitoring, and human oversight while still benefiting from the speed of open development."
        ]],
        inputs=[txt]
    )

    # Wire the button: 3 inputs -> 5 outputs, in the order analyze() returns.
    run.click(
        analyze,
        inputs=[txt, labels, words],
        outputs=[out_summary, out_table, out_top, out_sent_label, out_sent_score]
    )

if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the Hugging Face Spaces default);
    # debug=True surfaces server logs and tracebacks in the console.
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)