File size: 5,956 Bytes
0259c99
8b425b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0259c99
8b425b2
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import streamlit as st
import pandas as pd
from datetime import datetime
from search import search_news, fill_missing_urls
from fetch_and_extract import get_companies_and_articles
from helpers import match_companies_to_articles
from config import MAX_NEWS_PER_TOPIC, MAX_TOPICS, COMPANY_CACHE_SHEET_NAME
import os

# --- PAGE CONFIGURATION ---
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(page_title="News Finder Agent", page_icon="πŸ•΅οΈ", layout="wide")

# --- SESSION STATE INITIALIZATION ---
# results_data caches the latest results DataFrame (or None) so the table
# survives the script reruns Streamlit triggers on every widget interaction.
if 'results_data' not in st.session_state:
    st.session_state.results_data = None

# --- MAIN INTERFACE ---
st.title("πŸ•΅οΈ News Finder AI Agent")
st.markdown("Enter your topics below to generate a report of companies mentioned in the news.")

# 1. TOPIC INPUT
# Free-text topic list; parsed later by splitting on commas in the button handler.
topics_input = st.text_area(
    f"1. Topics (Comma separated), maximum {MAX_TOPICS} topics",
    placeholder="e.g. Artificial Intelligence, Nvidia, Supply Chain Logistics, Green Energy...",
    help="Paste your long list of topics here. The agent will dedup and search for all of them."
)

# CHANGED: Created 3 columns to fit the new field neatly
# One layout row holding the geography / time-frame / article-limit widgets below.
col_geo, col_time, col_limit = st.columns(3)

# 2. GEOGRAPHY INPUT
# Display name -> lowercase ISO 3166-1 alpha-2 code; "any" disables the
# geographic filter. The code is passed to search_news as geo_code
# (presumably the search API's country/`gl` parameter — confirm in search.py).
# Insertion order is meaningful: it is the order shown in the selectbox.
iso_countries = {
    # --- GLOBAL & NORTH AMERICA ---
    "Global": "any",
    "United States": "us",
    "Canada": "ca",

    # --- ASIA PACIFIC ---
    "Australia": "au",
    "China": "cn",
    "India": "in",
    "Japan": "jp",
    "Malaysia": "my",
    "South Korea": "kr",
    "Singapore": "sg",
    "Taiwan": "tw",
    "Hong Kong": "hk",

    # --- EUROPE (WESTERN) ---
    "United Kingdom": "gb",
    "Germany": "de",
    "France": "fr",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Belgium": "be",
    "Switzerland": "ch",
    "Austria": "at",
    "Ireland": "ie",
    "Luxembourg": "lu",
    "Portugal": "pt",

    # --- EUROPE (NORDIC) ---
    "Sweden": "se",
    "Norway": "no",
    "Denmark": "dk",
    "Finland": "fi",
    "Iceland": "is",

    # --- EUROPE (CENTRAL & EASTERN) ---
    "Poland": "pl",
    "Czech Republic": "cz",
    "Hungary": "hu",
    "Romania": "ro",
    "Ukraine": "ua",
    "Greece": "gr",
    "Turkey": "tr",
    "Bulgaria": "bg",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Slovenia": "si",
    "Serbia": "rs",

    # --- EUROPE (BALTIC) ---
    "Estonia": "ee",
    "Latvia": "lv",
    "Lithuania": "lt",
}

with col_geo:
    # Country selector; index=0 makes "Global" the default.
    selected_country = st.selectbox(
        "2. Geography",
        options=list(iso_countries.keys()),
        index=0
    )
    # Resolve the display name to its two-letter code ("any" for Global).
    geo_code = iso_countries[selected_country]

# 3. TIME FRAME INPUT
with col_time:
    # Look-back window in days (1-30, default 7).
    days_back = st.slider(
        "3. Time Frame (Days Back)",
        min_value=1,
        max_value=30,
        value=7,
        help="How far back should we search for news?"
    )

# 4. MAX ARTICLES INPUT
with col_limit:
    # Per-topic article cap; the upper bound comes from config so the UI
    # can never exceed the configured cost limit.
    max_news = st.number_input(
        "4. Max Articles per Topic",
        min_value=10,
        max_value=MAX_NEWS_PER_TOPIC,  # Restricted by config
        value=min(50, MAX_NEWS_PER_TOPIC),
        step=10,
        help=f"Control costs by limiting articles. Max allowed: {MAX_NEWS_PER_TOPIC}"
    )

# --- ACTION BUTTON ---
# Pipeline: search news -> LLM company extraction -> match companies to
# articles -> fill missing company URLs -> cache results in session state.
if st.button("πŸš€ Find News & Extract Companies", type="primary"):
    if not topics_input:
        st.error("⚠️ Please enter at least one topic.")
    else:
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        SERPER_API_KEY = os.environ.get('SERPER_API_KEY')

        # Fail fast with a clear message instead of crashing deep inside
        # the search / extraction calls when a key is missing.
        if not OPENAI_API_KEY or not SERPER_API_KEY:
            st.error("⚠️ Missing API credentials: set the OPENAI_API_KEY and SERPER_API_KEY environment variables.")
            st.stop()

        # Parse, trim, drop empties, and de-duplicate while preserving
        # input order (the topic field's help text promises deduplication).
        raw_topics = [t.strip() for t in topics_input.split(",") if t.strip()]
        topic_list = list(dict.fromkeys(raw_topics))

        # ENFORCE LIMIT ON TOPICS
        if len(topic_list) > MAX_TOPICS:
            st.warning(
                f"⚠️ Limit Reached: You entered {len(topic_list)} topics. Processing only the first {MAX_TOPICS}.")
            topic_list = topic_list[:MAX_TOPICS]

        with st.status("πŸ€– Agent is working...", expanded=True) as status:
            st.write(f"πŸ” Searching {len(topic_list)} topics in {selected_country} (Max {max_news} articles each)...")

            # 1. Search News
            articles = search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, selected_country)

            if not articles:
                status.update(label="❌ No news found!", state="error")
                st.stop()

            st.write(f"βœ… Found {len(articles)} unique articles. πŸ› οΈ Extracting companies with LLM...")

            # 2. Extract Companies (LLM)
            urls_to_process = [a['link'] for a in articles]
            articles_with_companies_from_llm = get_companies_and_articles(urls_to_process, OPENAI_API_KEY)

            st.write("βœ… Generating results...")

            # 3. Combine & Fill URLs
            matched_results = match_companies_to_articles(articles, articles_with_companies_from_llm)
            structured_results = fill_missing_urls(matched_results, COMPANY_CACHE_SHEET_NAME, SERPER_API_KEY)

            status.update(label="βœ… Search Complete!", state="complete", expanded=False)

        # SAVE RESULTS
        # Only overwrite the cached DataFrame when something was extracted,
        # so a previous successful run's table is not lost on an empty result.
        if structured_results:
            st.session_state.results_data = pd.DataFrame(structured_results)
        else:
            st.warning("No companies found in the extracted text.")

# --- RESULTS & DOWNLOAD ---
# Rendered on every rerun for as long as a results DataFrame is cached in
# session state, so the table persists across widget interactions.
if st.session_state.results_data is not None:
    results_df = st.session_state.results_data

    st.divider()
    st.subheader("πŸ“‚ Extracted Data")

    # Render both URL columns as clickable links showing the full URL.
    link_columns = {
        "company_url": st.column_config.LinkColumn("Website"),
        "article_url": st.column_config.LinkColumn("Source Article"),
    }
    st.dataframe(
        results_df,
        column_config=link_columns,
        use_container_width=True
    )

    # Offer the same table as a timestamped UTF-8 CSV download.
    csv_bytes = results_df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="πŸ“₯ Download Results as CSV",
        data=csv_bytes,
        file_name=f"news_extraction_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
        mime="text/csv",
        type="primary"
    )