Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,171 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from transformers import pipeline
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
import math
|
| 5 |
+
|
| 6 |
+
# Fallback candidate topics for the zero-shot classifier, used whenever the
# user leaves the comma-separated labels box empty (see analyze()).
DEFAULT_LABELS = [
    "finance", "sports", "tech", "politics", "health", "entertainment",
    "science", "business", "travel", "education"
]
|
| 10 |
+
|
| 11 |
+
@lru_cache(maxsize=1)
def get_pipes():
    """Build the three Hugging Face pipelines exactly once and memoize them.

    Returns:
        tuple: (summarizer, zero-shot classifier, sentiment analyzer).

    The lru_cache(maxsize=1) ensures the (slow) model downloads and loads
    happen only on the first call; later calls reuse the same objects.
    """
    # Distilled checkpoints keep memory and latency reasonable on CPU Spaces.
    return (
        pipeline("summarization", model="sshleifer/distilbart-cnn-12-6"),
        pipeline("zero-shot-classification", model="valhalla/distilbart-mnli-12-1"),
        pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english"),
    )
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def chunk_text(text: str, max_chars: int = 1600):
    """Naive chunker to keep inputs within summarizer limits.

    Splits on sentences by '. ' and groups them into chunks of at most
    ``max_chars`` characters.

    Args:
        text: Raw input text; newlines are flattened to spaces first.
        max_chars: Upper bound on the length of each returned chunk.

    Returns:
        list[str]: Non-empty chunks, each no longer than ``max_chars``.
    """
    sentences = [s.strip() for s in text.replace("\n", " ").split(". ") if s.strip()]
    chunks, buf = [], ""
    for s in sentences:
        # Re-attach the sentence terminator the split consumed.
        add = (s + (". " if not s.endswith(".") else " "))
        if len(buf) + len(add) <= max_chars:
            buf += add
        else:
            if buf:
                chunks.append(buf.strip())
            buf = add
            # Fix: a single sentence longer than max_chars previously became
            # one oversized chunk; hard-split it so the limit always holds.
            while len(buf) > max_chars:
                chunks.append(buf[:max_chars].strip())
                buf = buf[max_chars:]
    if buf:
        chunks.append(buf.strip())
    # Fallback if text had no periods (and nothing was chunked above).
    if not chunks:
        for i in range(0, len(text), max_chars):
            chunks.append(text[i:i + max_chars])
    return chunks
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def summarize_long(text: str, target_words: int = 120):
    """Summarize text of arbitrary length via chunked summarization.

    Args:
        text: Input text to summarize.
        target_words: Rough desired summary length in words.

    Returns:
        str: A single summary string.
    """
    summarizer, _, _ = get_pipes()
    # Map the rough word target onto token-length bounds for the model.
    max_len = min(256, max(64, int(target_words * 1.6)))
    min_len = max(20, int(max_len * 0.4))

    partials = []
    for segment in chunk_text(text, max_chars=1600):
        try:
            result = summarizer(segment, max_length=max_len, min_length=min_len, do_sample=False)
        except Exception:
            # If the model complains about length, retry on a smaller window.
            result = summarizer(segment[:1200], max_length=max_len, min_length=min_len, do_sample=False)
        partials.append(result[0]["summary_text"])

    combined = " ".join(partials)
    # Second pass: fuse multiple partial summaries if still over target.
    if len(partials) > 1 and len(combined.split()) > target_words:
        fused = summarizer(combined, max_length=max_len, min_length=min_len, do_sample=False)
        return fused[0]["summary_text"].strip()
    return combined.strip()
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def classify_topics(text: str, labels: list[str]):
    """Run multi-label zero-shot classification over candidate labels.

    Args:
        text: Text to classify.
        labels: Candidate topic labels.

    Returns:
        tuple: (all (label, score) pairs sorted by score descending,
                the top three of those pairs).
    """
    _, zshot, _ = get_pipes()
    result = zshot(text, candidate_labels=labels, multi_label=True)
    ranked = sorted(
        zip(result["labels"], result["scores"]),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked, ranked[:3]
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def analyze_sentiment(text: str):
    """Classify sentiment of the text's first 2000 characters.

    Truncation keeps latency low on long inputs.

    Returns:
        tuple[str, float]: ('POSITIVE'/'NEGATIVE', confidence score).
    """
    _, _, sentiment = get_pipes()
    first = sentiment(text[:2000])[0]
    return first["label"], float(first["score"])
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def analyze(text, labels_csv, summary_words):
    """Gradio callback: summarize, topic-classify, and sentiment-score text.

    Args:
        text: Raw user text (may be None/empty).
        labels_csv: Comma-separated candidate labels; blank falls back to
            DEFAULT_LABELS.
        summary_words: Target summary length in words (slider value).

    Returns:
        tuple: (summary markdown, [[label, score], ...] table rows,
                top-3 topics string, sentiment label, sentiment score).
    """
    text = (text or "").strip()
    if not text:
        # Blank input: clear every output component.
        return "", [], "", "", 0.0

    # CSV -> list of labels; empty box falls back to the defaults.
    labels_csv = (labels_csv or "").strip()
    labels = [part.strip() for part in labels_csv.split(",") if part.strip()]
    if not labels:
        labels = DEFAULT_LABELS

    summary = summarize_long(text, target_words=int(summary_words))
    ranked, top3 = classify_topics(text, labels)
    sent_label, sent_score = analyze_sentiment(text)

    # Human-friendly "label (score)" string for the top topics.
    top_str = ", ".join(f"{lab} ({score:.2f})" for lab, score in top3) if top3 else ""
    # Dataframe component wants list-of-lists rows.
    table_rows = [[lab, round(score, 4)] for lab, score in ranked]

    return summary, table_rows, top_str, sent_label, sent_score
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# --- UI layout ---------------------------------------------------------------
# Left column: input textbox, label CSV, summary-length slider, Analyze button.
# Right column: three tabs (Summary / Topics / Sentiment) holding the outputs.
with gr.Blocks(title="Text Insight Stack", css="""
:root{--radius:16px}
.header {font-size: 28px; font-weight: 800;}
.subtle {opacity:.8}
.card {border:1px solid #e5e7eb; border-radius: var(--radius); padding:16px}
""") as demo:
    gr.Markdown("""
<div class="header">🧠 Text Insight Stack</div>
<div class="subtle">Summarize • Topic Classify • Sentiment — powered by three open models on Hugging Face</div>
""")

    with gr.Row():
        with gr.Column(scale=5):
            # Main free-text input.
            txt = gr.Textbox(
                label="Paste text",
                placeholder="Paste any article, JD, email, or paragraph...",
                lines=12,
                elem_classes=["card"],
            )
            # Comma-separated candidate topics; pre-filled with the defaults.
            labels = gr.Textbox(
                label="Candidate topic labels (comma‑separated)",
                value=", ".join(DEFAULT_LABELS),
                elem_classes=["card"],
            )
            # Rough word target passed to summarize_long().
            words = gr.Slider(
                minimum=40, maximum=200, value=120, step=10,
                label="Target summary length (words)",
                elem_classes=["card"],
            )
            run = gr.Button("Analyze", variant="primary")

        with gr.Column(scale=5):
            with gr.Tab("Summary"):
                out_summary = gr.Markdown()
            with gr.Tab("Topics"):
                # Full ranked table plus a condensed top-3 line beneath it.
                out_table = gr.Dataframe(headers=["label", "score"], datatype=["str", "number"], interactive=False)
                out_top = gr.Markdown()
            with gr.Tab("Sentiment"):
                out_sent_label = gr.Label(num_top_classes=2)
                out_sent_score = gr.Number(label="Confidence score")

    # One-click sample input to demo the app.
    ex1 = gr.Examples(
        label="Try an example",
        examples=[[
            "Open-source models are transforming AI by enabling broad access to powerful capabilities. However, organizations must balance innovation with governance, ensuring that safety and compliance keep pace with deployment. This article explores how companies can adopt a pragmatic approach to evaluation, monitoring, and human oversight while still benefiting from the speed of open development."]],
        inputs=[txt]
    )

    # Wire the button to analyze(); output order must match analyze()'s
    # 5-tuple return: summary, table rows, top topics, label, score.
    run.click(
        analyze,
        inputs=[txt, labels, words],
        outputs=[out_summary, out_table, out_top, out_sent_label, out_sent_score]
    )
|
| 169 |
+
|
| 170 |
+
if __name__ == "__main__":
    # Start the Gradio server when this file is executed directly.
    demo.launch()
|