import streamlit as st
import pdfplumber, re
from transformers import pipeline, AutoTokenizer


st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="📰",
                   layout="wide")


# ───────────────── Cached pipelines ────────────────────────────────────
@st.cache_resource(ttl=86400)  # cache loaded models; entries expire after 24 hours
def load_pipes():
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer  = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment  = pipeline("text-classification", model=SENT_MODEL)
    ner        = pipeline("token-classification", model=NER_MODEL,
                          aggregation_strategy="simple")
    return summarizer, tokenizer, sentiment, ner


# ───────────────── Helper functions ────────────────────────────────────
# Extract plain text from an uploaded PDF
def extract_pdf(file):
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

# Split text into chunks that stay within the model's token limit
def split_by_tokens(text, max_tokens):
    words = re.split(r"(\s+)", text)
    buf, n = "", 0
    for w in words:
        ln = len(TOK(w).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
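# Example (illustrative): split a long article into summarizer-sized chunks.
#   chunks = list(split_by_tokens(raw_text, MAX_TOK))
# Each chunk's token count, as measured by the summarizer tokenizer, stays within MAX_TOK.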

# Summarise the news
def summarise(text):
    parts = list(split_by_tokens(text, MAX_TOK))
    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
    first_pass = [
        SUMMAR(p, max_length=per_len, min_length=per_len // 2, do_sample=False)[0]["summary_text"]
        for p in parts
    ]
    joined = " ".join(first_pass)
    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(joined, max_length=TARGET_WORDS,
                        min_length=TARGET_WORDS // 2,
                        do_sample=False)[0]["summary_text"]
    return joined

# Shorten the summary to the user-selected number of sentences (1-5)
def shorten(summary, n_sentences):
    s = summary.split(". ")
    return (". ".join(s[:n_sentences]).rstrip(".") + ".") if len(s) > n_sentences else summary

# Key entity tagging
def tag_entities(text):
    tag_dict = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for entity in NER(text):
        group = {"ORG": "Organization", "PER": "Person", "LOC": "Location"}.get(entity["entity_group"], "Miscellaneous")
        tag_dict[group].append(entity["word"])
    return {k: sorted(set(v)) for k, v in tag_dict.items() if v}
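# Example (illustrative; actual output depends on the NER model's predictions):
#   tag_entities("Apple CEO Tim Cook visited Berlin.")
#   -> {"Organization": ["Apple"], "Person": ["Tim Cook"], "Location": ["Berlin"]}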


# ───────────────── Main App Logic ────────────────────────────────────
def main():
    st.title("πŸ“° Financial News Analyzer")
    st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")

    # Sidebar Input
    with st.sidebar:
        st.header("Input News to Analyze:")
        # Step 1: enter the news text or upload a PDF
        txt_input = st.text_area("Paste news article", height=150)
        pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
        # Let the user choose the summary length (1-5 sentences)
        sent_count = st.slider("Summary length (sentences)", min_value=1, max_value=5, value=3)
        run_btn = st.button("πŸ” Analyze", use_container_width=True)

    raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()

    # Main processing
    if run_btn:
        if not raw_text:
            st.warning("Please provide text or a PDF first.")
            st.stop()

        with st.spinner("Analyzing..."):
            full_sum = summarise(raw_text)
            summary = shorten(full_sum, sent_count)

        # Step 2: Summarization
        cols = st.columns([2, 1])
        with cols[0]:
            st.subheader("πŸ“ Summary")
            st.markdown(f"<div style='white-space: pre-wrap'>{summary}</div>", unsafe_allow_html=True)

        # Step 3: Sentiment analysis
        with cols[1]:
            result = SENT_CLF(summary)[0]
            label = LABEL_MAP.get(result["label"], result["label"])
            color = COLOR_MAP[label]
            st.subheader("πŸ“Š Sentiment")
            st.markdown(
                f"<h3 style='color:{color};margin-bottom:0'>{label}</h3>"
                f"{result['score'] * 100:.1f}% Confidence</p>",
                unsafe_allow_html=True
            )

        # Step 4: Entity tags
        tags = tag_entities(summary)
        st.subheader("🏷️ Relevant Tags")

        if tags:
            # Tag pill CSS
            pill_css = """
            <style>
            .tag-pill {
              display: inline-block;
              background: #f0f2f6;
              color: #333;
              padding: 4px 10px;
              margin: 2px 4px 2px 0;
              border-radius: 12px;
              font-size: 0.9em;
            }
            .tag-cat {
              font-weight: 600;
              margin-top: 0;
              margin-bottom: 4px;
            }
            </style>
            """
            st.markdown(pill_css, unsafe_allow_html=True)

            for category, values in tags.items():
                st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
                pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in values)
                st.markdown(pills, unsafe_allow_html=True)
        else:
            st.info("No entities detected.")



# ───────────────── Constants and entry point ───────────────────────────
# Models and other constants
SUMM_MODEL   = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL   = "nynn/Fintuned_Sentiment"
NER_MODEL    = "Babelscape/wikineural-multilingual-ner"
SUMMAR, TOK, SENT_CLF, NER = load_pipes()

MAX_TOK      = 1024
TARGET_WORDS = 225
LABEL_MAP    = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP    = {"Positive": "green", "Negative": "red", "Neutral": "gray"}

if __name__ == "__main__":
    main()
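
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py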