File size: 5,956 Bytes
0259c99
8b425b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0259c99
8b425b2
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import streamlit as st
import pandas as pd
from datetime import datetime
from search import search_news, fill_missing_urls
from fetch_and_extract import get_companies_and_articles
from helpers import match_companies_to_articles
from config import MAX_NEWS_PER_TOPIC, MAX_TOPICS, COMPANY_CACHE_SHEET_NAME
import os

# --- PAGE CONFIGURATION ---
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="News Finder Agent", page_icon="πŸ•΅οΈ", layout="wide")

# --- SESSION STATE INITIALIZATION ---
# results_data caches the latest results DataFrame (or None) so the table
# survives the script reruns Streamlit triggers on every widget interaction.
if 'results_data' not in st.session_state:
    st.session_state.results_data = None

# --- MAIN INTERFACE ---
st.title("πŸ•΅οΈ News Finder AI Agent")
st.markdown("Enter your topics below to generate a report of companies mentioned in the news.")

# 1. TOPIC INPUT
# Free-text topic list; parsed later by splitting on commas in the button handler.
topics_input = st.text_area(
    f"1. Topics (Comma separated), maximum {MAX_TOPICS} topics",
    placeholder="e.g. Artificial Intelligence, Nvidia, Supply Chain Logistics, Green Energy...",
    help="Paste your long list of topics here. The agent will dedup and search for all of them."
)

# CHANGED: Created 3 columns to fit the new field neatly
# One layout row holding the geography / time-frame / article-limit widgets below.
col_geo, col_time, col_limit = st.columns(3)

# 2. GEOGRAPHY INPUT
# Display name -> lowercase ISO 3166-1 alpha-2 code; "any" disables the
# geographic filter. The code is passed to search_news as geo_code
# (presumably the search API's country/`gl` parameter — confirm in search.py).
# Insertion order is meaningful: it is the order shown in the selectbox.
iso_countries = {
    # --- GLOBAL & NORTH AMERICA ---
    "Global": "any",
    "United States": "us",
    "Canada": "ca",

    # --- ASIA PACIFIC ---
    "Australia": "au",
    "China": "cn",
    "India": "in",
    "Japan": "jp",
    "Malaysia": "my",
    "South Korea": "kr",
    "Singapore": "sg",
    "Taiwan": "tw",
    "Hong Kong": "hk",

    # --- EUROPE (WESTERN) ---
    "United Kingdom": "gb",
    "Germany": "de",
    "France": "fr",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Belgium": "be",
    "Switzerland": "ch",
    "Austria": "at",
    "Ireland": "ie",
    "Luxembourg": "lu",
    "Portugal": "pt",

    # --- EUROPE (NORDIC) ---
    "Sweden": "se",
    "Norway": "no",
    "Denmark": "dk",
    "Finland": "fi",
    "Iceland": "is",

    # --- EUROPE (CENTRAL & EASTERN) ---
    "Poland": "pl",
    "Czech Republic": "cz",
    "Hungary": "hu",
    "Romania": "ro",
    "Ukraine": "ua",
    "Greece": "gr",
    "Turkey": "tr",
    "Bulgaria": "bg",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Slovenia": "si",
    "Serbia": "rs",

    # --- EUROPE (BALTIC) ---
    "Estonia": "ee",
    "Latvia": "lv",
    "Lithuania": "lt",
}

with col_geo:
    # Country selector; index=0 makes "Global" the default.
    selected_country = st.selectbox(
        "2. Geography",
        options=list(iso_countries.keys()),
        index=0
    )
    # Resolve the display name to its two-letter code ("any" for Global).
    geo_code = iso_countries[selected_country]

# 3. TIME FRAME INPUT
with col_time:
    # Look-back window in days (1-30, default 7).
    days_back = st.slider(
        "3. Time Frame (Days Back)",
        min_value=1,
        max_value=30,
        value=7,
        help="How far back should we search for news?"
    )

# 4. MAX ARTICLES INPUT
with col_limit:
    # Per-topic article cap; the upper bound comes from config so the UI
    # can never exceed the configured cost limit.
    max_news = st.number_input(
        "4. Max Articles per Topic",
        min_value=10,
        max_value=MAX_NEWS_PER_TOPIC,  # Restricted by config
        value=min(50, MAX_NEWS_PER_TOPIC),
        step=10,
        help=f"Control costs by limiting articles. Max allowed: {MAX_NEWS_PER_TOPIC}"
    )

# --- ACTION BUTTON ---
# Pipeline: search news -> LLM company extraction -> match companies to
# articles -> fill missing company URLs -> cache results in session state.
if st.button("πŸš€ Find News & Extract Companies", type="primary"):
    if not topics_input:
        st.error("⚠️ Please enter at least one topic.")
    else:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        SERPER_API_KEY = os.environ.get('SERPER_API_KEY')

        # Fail fast with a clear message instead of crashing deep inside
        # the search / extraction calls when a key is missing.
        if not OPENAI_API_KEY or not SERPER_API_KEY:
            st.error("⚠️ Missing API credentials: set the OPENAI_API_KEY and SERPER_API_KEY environment variables.")
            st.stop()

        # Parse, trim, drop empties, and de-duplicate while preserving
        # input order (the topic field's help text promises deduplication).
        raw_topics = [t.strip() for t in topics_input.split(",") if t.strip()]
        topic_list = list(dict.fromkeys(raw_topics))

        # ENFORCE LIMIT ON TOPICS
        if len(topic_list) > MAX_TOPICS:
            st.warning(
                f"⚠️ Limit Reached: You entered {len(topic_list)} topics. Processing only the first {MAX_TOPICS}.")
            topic_list = topic_list[:MAX_TOPICS]

        with st.status("πŸ€– Agent is working...", expanded=True) as status:
            st.write(f"πŸ” Searching {len(topic_list)} topics in {selected_country} (Max {max_news} articles each)...")

            # 1. Search News
            articles = search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, selected_country)

            if not articles:
                status.update(label="❌ No news found!", state="error")
                st.stop()

            st.write(f"βœ… Found {len(articles)} unique articles. πŸ› οΈ Extracting companies with LLM...")

            # 2. Extract Companies (LLM)
            urls_to_process = [a['link'] for a in articles]
            articles_with_companies_from_llm = get_companies_and_articles(urls_to_process, OPENAI_API_KEY)

            st.write("βœ… Generating results...")

            # 3. Combine & Fill URLs
            matched_results = match_companies_to_articles(articles, articles_with_companies_from_llm)
            structured_results = fill_missing_urls(matched_results, COMPANY_CACHE_SHEET_NAME, SERPER_API_KEY)

            status.update(label="βœ… Search Complete!", state="complete", expanded=False)

        # SAVE RESULTS
        # Only overwrite the cached DataFrame when something was extracted,
        # so a previous successful run's table is not lost on an empty result.
        if structured_results:
            st.session_state.results_data = pd.DataFrame(structured_results)
        else:
            st.warning("No companies found in the extracted text.")

# --- RESULTS & DOWNLOAD ---
# Rendered on every rerun for as long as a results DataFrame is cached in
# session state, so the table persists across widget interactions.
if st.session_state.results_data is not None:
    results_df = st.session_state.results_data

    st.divider()
    st.subheader("πŸ“‚ Extracted Data")

    # Render both URL columns as clickable links showing the full URL.
    link_columns = {
        "company_url": st.column_config.LinkColumn("Website"),
        "article_url": st.column_config.LinkColumn("Source Article"),
    }
    st.dataframe(
        results_df,
        column_config=link_columns,
        use_container_width=True
    )

    # Offer the same table as a timestamped UTF-8 CSV download.
    csv_bytes = results_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="πŸ“₯ Download Results as CSV",
        data=csv_bytes,
        file_name=f"news_extraction_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
        mime="text/csv",
        type="primary"
    )