import streamlit as st
import pdfplumber, re
from transformers import pipeline, AutoTokenizer


st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="📰",
                   layout="wide")


# ───────────────── Cached pipelines ────────────────────────────────────
@st.cache_resource(ttl=86400)  # cache loaded models; entries expire after 24 hours
def load_pipes():
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer  = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment  = pipeline("text-classification", model=SENT_MODEL)
    ner        = pipeline("token-classification", model=NER_MODEL,
                          aggregation_strategy="simple")
    return summarizer, tokenizer, sentiment, ner


# ───────────────── Helper functions ────────────────────────────────────
# Extract plain text from an uploaded PDF
def extract_pdf(file):
    text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            text += page.extract_text() or ""
    return text

# Split text into chunks that stay within the model's token limit
def split_by_tokens(text, max_tokens):
    words = re.split(r"(\s+)", text)
    buf, n = "", 0
    for w in words:
        ln = len(TOK(w).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
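# Example (illustrative): split a long article into summarizer-sized chunks.
#   chunks = list(split_by_tokens(raw_text, MAX_TOK))
# Each chunk's token count, as measured by the summarizer tokenizer, stays within MAX_TOK.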

# Summarise the news
def summarise(text):
    parts = list(split_by_tokens(text, MAX_TOK))
    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
    first_pass = [
        SUMMAR(p, max_length=per_len, min_length=per_len // 2, do_sample=False)[0]["summary_text"]
        for p in parts
    ]
    joined = " ".join(first_pass)
    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(joined, max_length=TARGET_WORDS,
                        min_length=TARGET_WORDS // 2,
                        do_sample=False)[0]["summary_text"]
    return joined

# Shorten the summary to the user-selected number of sentences (1-5)
def shorten(summary, n_sentences):
    s = summary.split(". ")
    return (". ".join(s[:n_sentences]).rstrip(".") + ".") if len(s) > n_sentences else summary

# Key entity tagging
def tag_entities(text):
    tag_dict = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for entity in NER(text):
        group = {"ORG": "Organization", "PER": "Person", "LOC": "Location"}.get(entity["entity_group"], "Miscellaneous")
        tag_dict[group].append(entity["word"])
    return {k: sorted(set(v)) for k, v in tag_dict.items() if v}
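# Example (illustrative; actual output depends on the NER model's predictions):
#   tag_entities("Apple CEO Tim Cook visited Berlin.")
#   -> {"Organization": ["Apple"], "Person": ["Tim Cook"], "Location": ["Berlin"]}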


# ───────────────── Main App Logic ────────────────────────────────────
def main():
    st.title("πŸ“° Financial News Analyzer")
    st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")

    # Sidebar Input
    with st.sidebar:
        st.header("Input News to Analyze:")
        # Step 1: enter the news text or upload a PDF
        txt_input = st.text_area("Paste news article", height=150)
        pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
        # Let the user choose the summary length (1-5 sentences)
        sent_count = st.slider("Summary length (sentences)", min_value=1, max_value=5, value=3)
        run_btn = st.button("πŸ” Analyze", use_container_width=True)

    raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()

    # Main processing
    if run_btn:
        if not raw_text:
            st.warning("Please provide text or a PDF first.")
            st.stop()

        with st.spinner("Analyzing..."):
            full_sum = summarise(raw_text)
            summary = shorten(full_sum, sent_count)

        # Step 2: Summarization
        cols = st.columns([2, 1])
        with cols[0]:
            st.subheader("πŸ“ Summary")
            st.markdown(f"<div style='white-space: pre-wrap'>{summary}</div>", unsafe_allow_html=True)

        # Step 3: Sentiment analysis
        with cols[1]:
            result = SENT_CLF(summary)[0]
            label = LABEL_MAP.get(result["label"], result["label"])
            color = COLOR_MAP[label]
            st.subheader("πŸ“Š Sentiment")
            st.markdown(
                f"<h3 style='color:{color};margin-bottom:0'>{label}</h3>"
                f"{result['score'] * 100:.1f}% Confidence</p>",
                unsafe_allow_html=True
            )

        # Step 4: Entity tags
        tags = tag_entities(summary)
        st.subheader("🏷️ Relevant Tags")

        if tags:
            # Tag pill CSS
            pill_css = """
            <style>
            .tag-pill {
              display: inline-block;
              background: #f0f2f6;
              color: #333;
              padding: 4px 10px;
              margin: 2px 4px 2px 0;
              border-radius: 12px;
              font-size: 0.9em;
            }
            .tag-cat {
              font-weight: 600;
              margin-top: 0;
              margin-bottom: 4px;
            }
            </style>
            """
            st.markdown(pill_css, unsafe_allow_html=True)

            for category, values in tags.items():
                st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
                pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in values)
                st.markdown(pills, unsafe_allow_html=True)
        else:
            st.info("No entities detected.")



# ───────────────── Constants and entry point ───────────────────────────
# Models and other constants
SUMM_MODEL   = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL   = "nynn/Fintuned_Sentiment"
NER_MODEL    = "Babelscape/wikineural-multilingual-ner"
SUMMAR, TOK, SENT_CLF, NER = load_pipes()

MAX_TOK      = 1024
TARGET_WORDS = 225
LABEL_MAP    = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP    = {"Positive": "green", "Negative": "red", "Neutral": "gray"}

if __name__ == "__main__":
    main()
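
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py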