# FinanceAnalyzer / app.py
# DWD1211's picture
# Update app.py
# 48cf862 verified
import html
import re

import pdfplumber
import streamlit as st
from transformers import AutoTokenizer, pipeline
# Page-level settings; kept ahead of every other Streamlit call in this file.
# The icon is a single emoji (the previous value was a mis-decoded byte
# sequence, not an emoji).
st.set_page_config(
    page_title="Financial News Analyzer",
    page_icon="📰",
    layout="wide",
)
# ───────────────── Cached pipelines ────────────────────────────────────
@st.cache_resource(ttl=86400)
def load_pipes():
    """Create the Hugging Face pipelines and tokenizer used by the app.

    The result is cached by ``st.cache_resource`` with a TTL of 86400
    seconds, so the objects are reused instead of being rebuilt on each call.

    Returns:
        A 4-tuple ``(summarizer, tokenizer, sentiment, ner)`` built from
        ``SUMM_MODEL``, ``SUMM_MODEL``, ``SENT_MODEL`` and ``NER_MODEL``
        respectively.
    """
    summ_pipe = pipeline("summarization", model=SUMM_MODEL)
    summ_tok = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sent_pipe = pipeline("text-classification", model=SENT_MODEL)
    ner_pipe = pipeline(
        "token-classification",
        model=NER_MODEL,
        aggregation_strategy="simple",
    )
    return summ_pipe, summ_tok, sent_pipe, ner_pipe
# ───────────────── Helper functions ────────────────────────────────────
# Read text from PDF
def extract_pdf(file):
    """Return the text of all pages of a PDF as one string.

    Args:
        file: Whatever ``pdfplumber.open`` accepts; ``main`` passes the
            object returned by ``st.file_uploader``.

    Returns:
        The text of each page joined with a newline, so the last word of one
        page is not fused with the first word of the next. Pages for which
        ``extract_text()`` returns nothing are skipped; the result is ``""``
        when no page has text.
    """
    with pdfplumber.open(file) as pdf:
        # Extract while the document is still open.
        page_texts = [page.extract_text() for page in pdf.pages]
    return "\n".join(text for text in page_texts if text)
# Split text into chunks that fit a token budget
def split_by_tokens(text, max_tokens, tokenizer=None):
    """Yield stripped, non-empty chunks of *text* that fit *max_tokens*.

    The text is cut only at whitespace boundaries. Every word and every
    whitespace run is tokenized on its own and the per-piece counts are
    summed, so the budget is an estimate and not the token count of the
    joined chunk.
    NOTE(review): the tokenizer is called with its defaults, so any special
    tokens it adds are presumably counted once per piece -- confirm whether
    that over-estimate is intended.

    Args:
        text: Text to split.
        max_tokens: Token budget for one chunk.
        tokenizer: Callable that takes a string and returns an object with an
            ``input_ids`` sequence. Defaults to the module-level ``TOK``.

    Yields:
        Chunks with surrounding whitespace removed. Empty chunks are never
        emitted. A single word whose own count exceeds *max_tokens* is not
        split; it is emitted as a chunk by itself.
    """
    tok = TOK if tokenizer is None else tokenizer
    pieces, used = [], 0
    for piece in re.split(r"(\s+)", text):
        cost = len(tok(piece).input_ids)
        if used + cost > max_tokens:
            chunk = "".join(pieces).strip()
            # The buffer can be empty or whitespace-only here (e.g. the very
            # first word is already over budget); do not emit it.
            if chunk:
                yield chunk
            pieces, used = [], 0
        pieces.append(piece)
        used += cost
    tail = "".join(pieces).strip()
    if tail:
        yield tail
# Summarise the news
def summarise(text):
    """Summarise *text* with the ``SUMMAR`` pipeline in up to two passes.

    Pass 1 splits the text with ``split_by_tokens(text, MAX_TOK)`` and
    summarises every chunk separately. Pass 2 runs only when the joined chunk
    summaries contain more than ``TARGET_WORDS`` whitespace-separated words,
    and condenses them once more.

    Args:
        text: The article text.

    Returns:
        The summary string, or ``""`` when the text produces no chunks.
    """
    parts = list(split_by_tokens(text, MAX_TOK))
    if not parts:
        return ""

    # Share the overall length target between chunks, clamped to 25..80.
    per_len = max(25, min(80, TARGET_WORDS // len(parts)))
    first_pass = [
        SUMMAR(
            p,
            max_length=per_len,
            min_length=per_len // 2,
            do_sample=False,
            # A chunk holding one oversized word can exceed the model's input
            # limit; truncate instead of failing inside the model.
            truncation=True,
        )[0]["summary_text"]
        for p in parts
    ]
    joined = " ".join(first_pass)

    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(
            joined,
            max_length=TARGET_WORDS,
            min_length=TARGET_WORDS // 2,
            do_sample=False,
            # With many chunks the joined summaries can themselves be longer
            # than the model's input limit.
            truncation=True,
        )[0]["summary_text"]
    return joined
# Trim the summary to the number of sentences chosen by the user
def shorten(summary, n_sentences):
    """Keep at most the first *n_sentences* sentences of *summary*.

    Sentences are taken to be the pieces obtained by splitting on ``". "``.

    Args:
        summary: The text to trim.
        n_sentences: Maximum number of pieces to keep.

    Returns:
        *summary* unchanged when it has no more than *n_sentences* pieces;
        otherwise the first *n_sentences* pieces re-joined with ``". "`` and
        terminated with exactly one ``"."``.
    """
    sentences = summary.split(". ")
    if len(sentences) <= n_sentences:
        return summary
    kept = ". ".join(sentences[:n_sentences])
    # Drop any trailing dots first so the result ends with a single period.
    return kept.rstrip(".") + "."
# Key entity tagging
def tag_entities(text):
    """Group the entities that ``NER`` finds in *text* by category.

    ``entity_group`` values ``ORG``, ``PER`` and ``LOC`` map to
    "Organization", "Person" and "Location"; every other value goes to
    "Miscellaneous".

    Args:
        text: Text passed to the ``NER`` pipeline.

    Returns:
        A dict from category name to the sorted, de-duplicated entity words.
        Categories with no entities are left out.
    """
    group_names = {"ORG": "Organization", "PER": "Person", "LOC": "Location"}
    buckets = {
        name: []
        for name in ("Organization", "Person", "Location", "Miscellaneous")
    }
    for ent in NER(text):
        category = group_names.get(ent["entity_group"], "Miscellaneous")
        buckets[category].append(ent["word"])

    grouped = {}
    for category, words in buckets.items():
        if words:
            grouped[category] = sorted(set(words))
    return grouped
# ───────────────── Main App Logic ────────────────────────────────────
# CSS for the entity "pills" rendered by _render_tags().
_PILL_CSS = """
<style>
.tag-pill {
display: inline-block;
background: #f0f2f6;
color: #333;
padding: 4px 10px;
margin: 2px 4px 2px 0;
border-radius: 12px;
font-size: 0.9em;
}
.tag-cat {
font-weight: 600;
margin-top: 0;
margin-bottom: 4px;
}
</style>
"""


def _render_summary(summary):
    """Render the summary text under its own subheader."""
    st.subheader("📝 Summary")
    # The summary is model output derived from user-supplied text; escape it
    # before embedding it in raw HTML.
    st.markdown(
        f"<div style='white-space: pre-wrap'>{html.escape(summary)}</div>",
        unsafe_allow_html=True,
    )


def _render_sentiment(summary):
    """Classify *summary* with ``SENT_CLF`` and render label and confidence."""
    result = SENT_CLF(summary)[0]
    label = LABEL_MAP.get(result["label"], result["label"])
    # An unmapped label falls back to the raw model label, which has no entry
    # in COLOR_MAP; use gray instead of raising KeyError.
    color = COLOR_MAP.get(label, "gray")
    st.subheader("📊 Sentiment")
    st.markdown(
        f"<h3 style='color:{color};margin-bottom:0'>{html.escape(str(label))}</h3>"
        f"<p>{result['score'] * 100:.1f}% Confidence</p>",
        unsafe_allow_html=True,
    )


def _render_tags(tags):
    """Render the entity categories in *tags* as rows of pills."""
    st.subheader("🏷️ Relevant Tags")
    if not tags:
        st.info("No entities detected.")
        return
    st.markdown(_PILL_CSS, unsafe_allow_html=True)
    for category, values in tags.items():
        st.markdown(
            f"<div class='tag-cat'>{html.escape(category)}</div>",
            unsafe_allow_html=True,
        )
        # Entity words come from the analysed text; escape them as well.
        pills = "".join(
            f"<span class='tag-pill'>{html.escape(v)}</span>" for v in values
        )
        st.markdown(pills, unsafe_allow_html=True)


def main():
    """Render the UI and, when the button is pressed, analyse the input.

    The sidebar collects pasted text or a PDF plus the desired summary
    length. On "Analyze" the input is summarised, the summary is classified
    for sentiment, and the entities found in it are shown as tags.
    """
    st.title("📰 Financial News Analyzer")
    st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")

    # Sidebar input
    with st.sidebar:
        st.header("Input News to Analyze:")
        # Step 1: enter the news
        txt_input = st.text_area("Paste news article", height=150)
        pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
        # Let the user choose the summary length (1-5 sentences)
        sent_count = st.slider("Summary length (sentences)", min_value=1, max_value=5, value=3)
        run_btn = st.button("🔍 Analyze", use_container_width=True)

    # Nothing to do until the button is pressed; in particular, do not parse
    # the uploaded PDF before then.
    if not run_btn:
        return

    # An uploaded PDF takes precedence over pasted text. Strip both so
    # whitespace-only input is treated as empty.
    raw_text = (extract_pdf(pdf_file) if pdf_file else txt_input).strip()
    if not raw_text:
        st.warning("Please provide text or a PDF first.")
        st.stop()

    # Step 2: Summarization
    with st.spinner("Analyzing..."):
        summary = shorten(summarise(raw_text), sent_count)

    cols = st.columns([2, 1])
    with cols[0]:
        _render_summary(summary)

    # Step 3: Sentiment analysis
    with cols[1]:
        _render_sentiment(summary)

    # Step 4: Entity tags
    _render_tags(tag_entities(summary))
# ───────────────── Main Part ───────────────────────────────────────
# Model checkpoints loaded by load_pipes()
SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL = "nynn/Fintuned_Sentiment"
NER_MODEL = "Babelscape/wikineural-multilingual-ner"

# Summarisation limits: token budget per chunk passed to split_by_tokens(),
# and the length target used by summarise()
MAX_TOK = 1024
TARGET_WORDS = 225

# Raw classifier label -> display label, and display label -> text colour
LABEL_MAP = {
    "LABEL_0": "Negative",
    "LABEL_1": "Positive",
    "LABEL_2": "Neutral",
}
COLOR_MAP = {
    "Positive": "green",
    "Negative": "red",
    "Neutral": "gray",
}

# Shared pipelines / tokenizer used by the helper functions above
SUMMAR, TOK, SENT_CLF, NER = load_pipes()

if __name__ == "__main__":
    main()