Spaces:
Sleeping
Sleeping
File size: 6,312 Bytes
74b4cd7 ad25769 74b4cd7 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 ad25769 48cf862 74b4cd7 48cf862 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import streamlit as st
import pdfplumber, re
from transformers import pipeline, AutoTokenizer
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="💰",  # restored from mojibake "π°" (mis-decoded UTF-8 emoji)
                   layout="wide")
# ───────────────── Cached pipelines ──────────────────────────────────
@st.cache_resource(ttl=86400)  # cache the loaded models for 24h across reruns
def load_pipes():
    """Load and return the (summarizer, tokenizer, sentiment, ner) pipelines.

    Fix: the original placed a comment line between the decorator and the
    ``def``, which is a SyntaxError in Python — the decorator must sit
    directly above the function it decorates.
    """
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment = pipeline("text-classification", model=SENT_MODEL)
    ner = pipeline("token-classification", model=NER_MODEL,
                   aggregation_strategy="simple")
    return summarizer, tokenizer, sentiment, ner
# ───────────────── Helper functions ──────────────────────────────────
# Read text from PDF
def extract_pdf(file):
    """Concatenate the extracted text of every page of an uploaded PDF."""
    pages = []
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages with no text layer
            pages.append(page.extract_text() or "")
    return "".join(pages)
# Split text into chunks that stay within the model's token limit
def split_by_tokens(text, max_tokens):
    """Yield chunks of *text*, each at most ~max_tokens tokens as measured
    by the module-level tokenizer ``TOK``.

    The regex captures the whitespace separators so chunks re-join
    losslessly (apart from the strip() at chunk edges).
    """
    words = re.split(r"(\s+)", text)
    buf, n = "", 0
    for w in words:
        ln = len(TOK(w).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            # Fix: the original yielded unconditionally here, emitting an
            # empty chunk when the very first word exceeded max_tokens.
            if buf.strip():
                yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
# Summarise the news
def summarise(text):
    """Two-pass summarisation: summarise each token-limited chunk, then
    compress the concatenation once more if it still exceeds TARGET_WORDS."""
    chunks = list(split_by_tokens(text, MAX_TOK))
    # Budget the per-chunk summary length so the combined draft lands
    # near TARGET_WORDS, clamped to a sane 25-80 word range.
    budget = TARGET_WORDS // max(1, len(chunks))
    per_len = max(25, min(80, budget))
    pieces = []
    for chunk in chunks:
        out = SUMMAR(chunk, max_length=per_len,
                     min_length=per_len // 2, do_sample=False)
        pieces.append(out[0]["summary_text"])
    draft = " ".join(pieces)
    # Second pass only when the draft is still over the word budget.
    if len(draft.split()) > TARGET_WORDS:
        draft = SUMMAR(draft, max_length=TARGET_WORDS,
                       min_length=TARGET_WORDS // 2,
                       do_sample=False)[0]["summary_text"]
    return draft
# Shorten the summary to 1-5 sentences (user-selectable)
def shorten(summary, n_sentences):
    """Return the first *n_sentences* sentences of *summary* using a naive
    '. ' split; the summary is returned unchanged when already short enough."""
    sentences = summary.split(". ")
    if len(sentences) <= n_sentences:
        return summary
    clipped = ". ".join(sentences[:n_sentences])
    return clipped.rstrip(".") + "."
# Key entity tagging
def tag_entities(text):
    """Run NER over *text* and group hits into display categories;
    categories with no hits are dropped, words de-duplicated and sorted."""
    label_to_group = {"ORG": "Organization", "PER": "Person", "LOC": "Location"}
    buckets = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for hit in NER(text):
        bucket = label_to_group.get(hit["entity_group"], "Miscellaneous")
        buckets[bucket].append(hit["word"])
    return {name: sorted(set(words)) for name, words in buckets.items() if words}
# ───────────────── Main App Logic ────────────────────────────────────
def main():
    """Render the Streamlit UI and run summarisation, sentiment, and NER
    tagging when the user clicks Analyze.

    NOTE(review): emoji glyphs below were restored from mojibake in the
    original source; the exact original emojis are ambiguous — confirm
    against the deployed app.
    """
    st.title("💰 Financial News Analyzer")
    st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")

    # Sidebar Input
    with st.sidebar:
        st.header("Input News to Analyze:")
        # Step 1: enter the news (pasted text or uploaded PDF)
        txt_input = st.text_area("Paste news article", height=150)
        pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
        # Let the user choose the summary length (1-5 sentences)
        sent_count = st.slider("Summary length (sentences)", min_value=1, max_value=5, value=3)
        run_btn = st.button("🔍 Analyze", use_container_width=True)

    # An uploaded PDF takes precedence over pasted text.
    raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()

    # Main processing
    if run_btn:
        if not raw_text:
            st.warning("Please provide text or a PDF first.")
            st.stop()
        with st.spinner("Analyzing..."):
            full_sum = summarise(raw_text)
            summary = shorten(full_sum, sent_count)

        # Step 2: Summarization
        cols = st.columns([2, 1])
        with cols[0]:
            st.subheader("📝 Summary")
            st.markdown(f"<div style='white-space: pre-wrap'>{summary}</div>", unsafe_allow_html=True)

        # Step 3: Sentiment analysis
        with cols[1]:
            result = SENT_CLF(summary)[0]
            label = LABEL_MAP.get(result["label"], result["label"])
            color = COLOR_MAP[label]
            st.subheader("📊 Sentiment")
            # Fix: the original emitted a stray closing </p> with no
            # opening <p> tag, producing malformed HTML.
            st.markdown(
                f"<h3 style='color:{color};margin-bottom:0'>{label}</h3>"
                f"<p>{result['score'] * 100:.1f}% Confidence</p>",
                unsafe_allow_html=True
            )

        # Step 4: Entity tags (the original comment repeated "Step 3")
        tags = tag_entities(summary)
        st.subheader("🏷️ Relevant Tags")
        if tags:
            # Tag pill CSS
            pill_css = """
            <style>
            .tag-pill {
                display: inline-block;
                background: #f0f2f6;
                color: #333;
                padding: 4px 10px;
                margin: 2px 4px 2px 0;
                border-radius: 12px;
                font-size: 0.9em;
            }
            .tag-cat {
                font-weight: 600;
                margin-top: 0;
                margin-bottom: 4px;
            }
            </style>
            """
            st.markdown(pill_css, unsafe_allow_html=True)
            for category, values in tags.items():
                st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
                pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in values)
                st.markdown(pills, unsafe_allow_html=True)
        else:
            st.info("No entities detected.")
# ───────────────── Main Part ─────────────────────────────────────────
# Models and other constant variables. These are module-level globals
# referenced by the functions above, so they must be defined before
# load_pipes() is invoked below.
SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"  # summarization checkpoint
SENT_MODEL = "nynn/Fintuned_Sentiment"  # sentiment-classification checkpoint
NER_MODEL = "Babelscape/wikineural-multilingual-ner"  # NER checkpoint
# Loads (and caches, via st.cache_resource) all pipelines at import time.
SUMMAR, TOK, SENT_CLF, NER = load_pipes()
MAX_TOK = 1024  # per-chunk token budget fed to split_by_tokens
TARGET_WORDS = 225  # word budget for the combined summary
# Maps the sentiment model's raw labels to display names; unknown labels
# fall through unchanged in main() via LABEL_MAP.get(...).
LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"}
if __name__ == "__main__":
    main()
|