chummchumm committed on
Commit
8b425b2
Β·
verified Β·
1 Parent(s): d20c82c

Upload 6 files

Browse files
Files changed (6) hide show
  1. src/cache.py +159 -0
  2. src/config.py +31 -0
  3. src/fetch_and_extract.py +122 -0
  4. src/helpers.py +76 -0
  5. src/search.py +315 -0
  6. src/streamlit_app.py +188 -38
src/cache.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gspread
2
+ from google.oauth2.service_account import Credentials
3
+ import streamlit as st
4
+ import os
5
+ import json
6
+
7
+
8
# --- 1. SETUP GOOGLE SHEETS CONNECTION ---
def connect_to_sheet(sheet_name):
    """Authenticate with Google and return the first worksheet of *sheet_name*.

    Credential resolution order:
      1. A local ``credentials.json`` service-account file (development).
      2. The ``gcp_service_account`` environment variable containing the
         service-account JSON as a string (deployment).

    Raises:
        ValueError: when neither credential source is available.
    """
    oauth_scopes = [
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]

    if os.path.exists("credentials.json"):
        # Development path: key file sitting next to the app.
        print("πŸ“‚ Using local 'credentials.json' file.")
        creds = Credentials.from_service_account_file("credentials.json", scopes=oauth_scopes)
    else:
        # Deployment path: the JSON key material is injected via the environment.
        print("☁️ 'credentials.json' not found. Checking Environment Variables...")
        raw_key = os.environ.get("gcp_service_account")
        if not raw_key:
            raise ValueError(
                "❌ Error: Could not find 'credentials.json' LOCALLY, and 'gcp_service_account' is missing from ENV vars."
            )
        creds = Credentials.from_service_account_info(json.loads(raw_key), scopes=oauth_scopes)

    # Authorize gspread and hand back the first worksheet of the spreadsheet.
    return gspread.authorize(creds).open(sheet_name).sheet1
36
+ # def connect_to_sheet(sheet_name):
37
+ # # Define the scopes (Permissions)
38
+ # scopes = [
39
+ # "https://www.googleapis.com/auth/spreadsheets",
40
+ # "https://www.googleapis.com/auth/drive"
41
+ # ]
42
+ #
43
+ # creds_json_str = os.environ.get("gcp_service_account")
44
+ #
45
+ # if not creds_json_str:
46
+ # raise ValueError(
47
+ # "❌ Error: Could not find 'gcp_service_account' in Environment Variables. Did you add the secret in Hugging Face settings?")
48
+ #
49
+ # # 2. Convert the String back into a Python Dictionary
50
+ # creds_dict = json.loads(creds_json_str)
51
+ #
52
+ # # 3. Create credentials object
53
+ # creds = Credentials.from_service_account_info(
54
+ # creds_dict,
55
+ # scopes=scopes
56
+ # )
57
+ #
58
+ # # Authorize gspread
59
+ # client = gspread.authorize(creds)
60
+ #
61
+ # # Open the sheet
62
+ # return client.open(sheet_name).sheet1
63
+
64
def load_cache_dict(sheet):
    """Read the whole cache sheet into a ``{normalized name: url}`` dict.

    Keys are lower-cased and stripped so lookups are case/whitespace
    insensitive.  Any read failure (e.g. an empty sheet) yields ``{}``.
    """
    try:
        lookup = {}
        for record in sheet.get_all_records():
            company = record['Company']
            if company:  # skip blank rows
                lookup[company.lower().strip()] = record['Website']
        return lookup
    except Exception as exc:
        print(f"⚠️ Cache read error (empty sheet?): {exc}")
        return {}
73
+
74
+
75
def append_to_cache(sheet, new_entries):
    """Persist newly discovered companies to the cache worksheet.

    Args:
        sheet: worksheet object exposing ``append_rows``.
        new_entries: list of ``{'Company': name, 'Website': url}`` dicts;
            an empty (or falsy) list is a no-op.

    Failures are logged, never raised, so caching stays best-effort.
    """
    if not new_entries:
        return

    payload = []
    for entry in new_entries:
        payload.append([entry['Company'], entry['Website']])

    try:
        sheet.append_rows(payload)
        print(f"πŸ’Ύ Cached {len(payload)} new companies.")
    except Exception as exc:
        print(f"❌ Error saving to cache: {exc}")
86
+
87
+ # # --- 2. LOAD CACHE (READ URLS FROM SHEET) ---
88
+ # def load_cache(sheet):
89
+ # """
90
+ # Reads the entire sheet and creates a dictionary:
91
+ # {'Nvidia': 'https://nvidia.com', 'Tesla': 'https://tesla.com'}
92
+ # """
93
+ # print("πŸ“‚ Reading existing data from Google Sheet...")
94
+ # data = sheet.get_all_records() # Assumes headers: "Company", "Website"
95
+ #
96
+ # # Create a quick lookup dictionary (Normalize names to lowercase to be safe)
97
+ # cache = {row['Company'].lower().strip(): row['Website'] for row in data if row['Company']}
98
+ # return cache
99
+ #
100
+ #
101
+ # # --- 3. THE "CASH SAVER" FUNCTION ---
102
+ # def save_companies_to_cache(new_rows_to_add, sheet_name):
103
+ # sheet = connect_to_sheet(sheet_name)
104
+ #
105
+ # # 1. Check if there is data
106
+ # if new_rows_to_add:
107
+ # print(f"πŸ’Ύ Saving {len(new_rows_to_add)} new companies to Sheet...")
108
+ #
109
+ # # 2. CONVERT Dicts to Lists
110
+ # values_to_upload = [
111
+ # [item.get('company_name'), item.get('company_website')]
112
+ # for item in new_rows_to_add
113
+ # ]
114
+ #
115
+ # # 3. Append to Sheet
116
+ # sheet.append_rows(values_to_upload)
117
+ # print("βœ… Save Complete.")
118
+ #
119
+ # else:
120
+ # print("πŸŽ‰ No new searches needed. Sheet is up to date!")
121
+ #
122
+ #
123
+ # # --- 3. THE LOAD CACHED COMPANIES FUNCTION ---
124
+ # def get_cached_companies(company_list, sheet_name):
125
+ # """
126
+ # Splits companies into 'Found in Cache' and 'Missing (Need to Search)'
127
+ # """
128
+ # sheet = connect_to_sheet(sheet_name)
129
+ # cache = load_cache(sheet)
130
+ #
131
+ # companies_in_cache = []
132
+ # missing_companies = []
133
+ #
134
+ # print(f"πŸ“‚ Checking Cache for {len(company_list)} companies...")
135
+ # # print(company_list)
136
+ #
137
+ # for item in company_list:
138
+ # name = item # item['company_name']
139
+ # # Normalize key for matching (must match how you save them)
140
+ # name_key = name.lower().strip()
141
+ #
142
+ # # === CHECK CACHE ===
143
+ # # We check if key exists AND value is not empty
144
+ # if name_key in cache and cache[name_key]:
145
+ # # print(f" βœ… Cache Hit: {name}") # Optional: Comment out to reduce noise
146
+ # companies_in_cache.append({
147
+ # 'company_name': name,
148
+ # 'company_website': cache[name_key]
149
+ # })
150
+ # # === MISSING ===
151
+ # else:
152
+ # # print(f" πŸ”Ž Cache Miss: {name}")
153
+ # missing_companies.append(item)
154
+ #
155
+ # # RETURN OUTSIDE THE LOOP
156
+ # return {
157
+ # 'found': companies_in_cache,
158
+ # 'missing': missing_companies
159
+ # }
src/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
#
# Central tuning constants for the news pipeline.  Import names from here
# (e.g. ``from config import MAX_TOPICS``) instead of hard-coding limits.

# --- LIMITS ---
# Maximum number of topics allowed in one run
MAX_TOPICS = 20

# Maximum number of articles to process per search (Search & Clean phase)
MAX_NEWS_PER_TOPIC = 500

# Maximum number of articles to send to GPT (Extraction phase) to save money
#MAX_ARTICLES_TO_LLM = 10000

# Number of articles to process in one LLM call
#LLM_BATCH_SIZE = 15

#SERPER_RESULTS_PER_PAGE = 20

# --- API DEFAULTS ---
DEFAULT_DAYS_BACK = 7
DEFAULT_COUNTRY = "us"

# "Reporter" plan limits https://worldnewsapi.com/pricing/
WORLD_NEWS_REQUESTS_PER_SECOND = 2.0
WORLD_NEWS_MAX_CONCURRENT_REQUESTS = 5

# Google settings
# Name of the Google Sheet used as the company-website cache (see cache.py).
COMPANY_CACHE_SHEET_NAME = "company_info_cache"
src/fetch_and_extract.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ # import aiohttp
3
+ # import os
4
+ import json
5
+ import trafilatura
6
+ from openai import AsyncOpenAI
7
+ from pydantic import BaseModel, Field
8
+ from typing import List
9
+
10
# --- CONFIGURATION ---
# Upper bound on simultaneous page downloads (thread-pool fetches).
MAX_SCRAPE_CONCURRENCY = 10
# Upper bound on in-flight OpenAI extraction calls.
MAX_AI_CONCURRENCY = 5
13
+
14
+
15
+
16
+ # --- DATA MODELS ---
17
+ class CompanyResult(BaseModel):
18
+ name: str = Field(..., description="Name of the commercial company")
19
+ url: str = Field(..., description="""Official website URL. Predict if missing.
20
+ If you are NOT 100% sure about the official website,
21
+ respond ONLY with:'SEARCH_REQUIRED'""")
22
+ article_id: int = Field(..., description="The ID of the article provided in context")
23
+
24
+
25
+ class ExtractionResponse(BaseModel):
26
+ companies: List[CompanyResult]
27
+
28
+
29
+ # --- ROBUST WORKER ---
30
+ async def process_article(url: str, article_id: int, scrape_sem, ai_sem, OPENAI_API_KEY):
31
+ loop = asyncio.get_running_loop()
32
+
33
+ # 1. Fetch & Extract (Using Trafilatura's robust fetcher)
34
+ async with scrape_sem:
35
+ try:
36
+ # Run the synchronous fetch_url in a separate thread
37
+ downloaded = await loop.run_in_executor(None, trafilatura.fetch_url, url)
38
+
39
+ if downloaded is None:
40
+ return {"url": url, "error": "Fetch failed (blocked or 404)"}
41
+
42
+ # Extract text (also CPU bound, so runs in executor)
43
+ text = await loop.run_in_executor(None, trafilatura.extract, downloaded)
44
+
45
+ if not text:
46
+ return {"url": url, "error": "No main text found"}
47
+
48
+ except Exception as e:
49
+ return {"url": url, "error": f"Scrape error: {str(e)}"}
50
+
51
+ # 2. AI Extraction
52
+ truncated_text = text[:5000] # Trim to save tokens
53
+ user_content = f"Article ID: {article_id}\n\nText:\n{truncated_text}"
54
+ client = AsyncOpenAI(api_key=OPENAI_API_KEY)
55
+ async with ai_sem:
56
+ try:
57
+ completion = await client.beta.chat.completions.parse(
58
+ model="gpt-4o-mini",
59
+ messages=[
60
+ {"role": "system", "content": "Extract commercial companies. Exclude generic entities, countries, government bodies."},
61
+ {"role": "user", "content": user_content},
62
+ ],
63
+ response_format=ExtractionResponse,
64
+ temperature=0
65
+ )
66
+
67
+ result_obj = completion.choices[0].message.parsed
68
+
69
+ return {
70
+ "url": url,
71
+ "status": "success",
72
+ "companies": [c.model_dump() for c in result_obj.companies]
73
+ }
74
+
75
+ except Exception as e:
76
+ return {"url": url, "error": f"AI error: {str(e)}"}
77
+
78
+
79
# --- MAIN ORCHESTRATOR ---
async def run_pipeline(urls: List[str], OPENAI_API_KEY):
    """Process every URL concurrently and return the successful extractions.

    Failures are only counted and reported; successes are additionally
    dumped to ``final_results.json`` for offline inspection.
    """
    scrape_sem = asyncio.Semaphore(MAX_SCRAPE_CONCURRENCY)
    ai_sem = asyncio.Semaphore(MAX_AI_CONCURRENCY)

    print(f"πŸš€ Processing {len(urls)} articles...")

    # trafilatura does its own HTTP, so no shared aiohttp session is needed here.
    outcomes = await asyncio.gather(*[
        process_article(url, idx, scrape_sem, ai_sem, OPENAI_API_KEY)
        for idx, url in enumerate(urls)
    ])

    # Split by presence of the 'error' key.
    success = [o for o in outcomes if "error" not in o]
    failures = [o for o in outcomes if "error" in o]

    print(f"\nβœ… Completed: {len(success)}")
    print(f"❌ Failed: {len(failures)}")

    if success:
        print(f"\n[Sample Output]:\n{json.dumps(success[0], indent=2)}")

    # Debug artifact: persist what was successfully extracted.
    with open("final_results.json", "w") as f:
        json.dump(success, f, indent=2)

    return success
108
+
109
+
110
def get_companies_and_articles(article_url: list, OPENAI_API_KEY):
    """Blocking facade over :func:`run_pipeline` for non-async callers."""
    return asyncio.run(run_pipeline(article_url, OPENAI_API_KEY))
113
+
114
+ # if __name__ == "__main__":
115
+ # # REAL, LIVE URLs (Checked Feb 4, 2026)
116
+ # live_urls = [
117
+ # "https://newsroom.ibm.com/2026-02-04-ibm-opens-global-rfp-for-ai-driven-solutions-shaping-the-future-of-work-and-education",
118
+ # "https://eng.lsm.lv/article/society/defence/04.02.2026-artificial-intelligence-centre-to-get-230000-euros-from-defence-budget.a633009/",
119
+ # "https://www.unesco.org/en/articles/tech-spark-africa-advances-simulation-based-learning-skills-development"
120
+ # ]
121
+ #
122
+ # companies_with_articles = asyncio.run(run_pipeline(live_urls))
src/helpers.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import gspread
2
+
3
+
4
def match_companies_to_articles(articles_metadata, ai_results):
    """Join the AI extraction output back onto the article metadata.

    Args:
        articles_metadata: dicts with at least ``link`` and ``title``.
        ai_results: per-article dicts with ``url`` and an optional
            ``companies`` list of ``{'name', 'url'}`` dicts.

    Returns:
        Flat records sorted by company name:
        ``{'company_name', 'company_url', 'article_title', 'article_url'}``.
        Articles whose URL is absent from the metadata get the title
        "Unknown Title"; companies without a URL get ``''``.
    """
    # URL -> title index so each lookup is O(1) instead of a scan.
    titles = {meta['link']: meta['title'] for meta in articles_metadata}

    flattened = [
        {
            "company_name": found['name'],
            "company_url": found.get('url', ''),  # tolerate missing URLs
            "article_title": titles.get(extraction.get('url'), "Unknown Title"),
            "article_url": extraction.get('url'),
        }
        for extraction in ai_results
        for found in extraction.get('companies', [])
    ]

    return sorted(flattened, key=lambda rec: rec['company_name'])
29
+ #
30
+ # def connect_to_sheet(json_keyfile, sheet_name):
31
+ # """Authenticates and returns the worksheet object."""
32
+ # try:
33
+ # gc = gspread.service_account(filename=json_keyfile)
34
+ # sh = gc.open(sheet_name)
35
+ # return sh.sheet1
36
+ # except Exception as e:
37
+ # print(f"❌ Error connecting to Google Sheets: {e}")
38
+ # return None
39
+ #
40
+ #
41
+ # def get_cached_websites(worksheet):
42
+ # """
43
+ # Returns a dictionary of existing companies: {'Tesla': 'tesla.com', ...}
44
+ # """
45
+ # if not worksheet: return {}
46
+ #
47
+ # print("πŸ“‚ Reading cache from Google Sheets...")
48
+ # try:
49
+ # records = worksheet.get_all_records()
50
+ # # Convert list of dicts to a lookup map
51
+ # return {
52
+ # row['company_name']: row['company_website']
53
+ # for row in records
54
+ # if row.get('company_name')
55
+ # }
56
+ # except Exception:
57
+ # return {}
58
+ #
59
+ #
60
+ # def save_new_websites(worksheet, new_data):
61
+ # """
62
+ # Appends new data to the sheet.
63
+ # Expects a list of dicts: [{'company_name': 'X', 'company_website': 'Y'}]
64
+ # """
65
+ # if not worksheet or not new_data: return
66
+ #
67
+ # print(f"πŸ’Ύ Saving {len(new_data)} new entries to Google Sheets...")
68
+ #
69
+ # # Prepare rows as list of lists: [['Name', 'URL'], ['Name', 'URL']]
70
+ # rows = [[item['company_name'], item['company_website']] for item in new_data]
71
+ #
72
+ # # Add headers if sheet is empty
73
+ # if not worksheet.get_all_values():
74
+ # worksheet.append_row(["company_name", "company_website"])
75
+ #
76
+ # worksheet.append_rows(rows)
src/search.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import time
4
+ import json
5
+ import math
6
+ import certifi
7
+ import ssl
8
+ from cache import connect_to_sheet, load_cache_dict, append_to_cache
9
+ import re
10
+ from urllib.parse import urlparse
11
+
12
# --- CONFIGURATION ---
# Serper news-search endpoint.
BASE_URL = "https://google.serper.dev/news"
RESULTS_PER_PAGE = 100  # Serper max per request
MAX_CONCURRENCY = 10  # Avoid 429 errors
16
+
17
+
18
# --- WORKER: Fetch articles for one topic ---
async def fetch_topic(session, topic, sem, geo_code, days_back, max_articles, api_key, country_name=""):
    """Page through Serper /news for one topic, up to *max_articles* items.

    Stops early on a non-200 response, an empty page, or any exception;
    whatever has been collected so far is returned.
    """
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}
    time_filter = f"qdr:d{days_back}"  # Google "last N days" filter

    # Pages needed at RESULTS_PER_PAGE results each to honour max_articles.
    pages_needed = math.ceil(max_articles / RESULTS_PER_PAGE)

    collected = []
    async with sem:
        print(f"--> Starting: {topic}")

        # Bias the query toward the chosen country unless searching globally.
        query = f"{topic} {country_name}" if country_name and country_name != "Global" else topic

        for page in range(1, pages_needed + 1):
            payload = {
                "q": query,
                "gl": geo_code,
                "tbs": time_filter,
                "num": RESULTS_PER_PAGE,
                "page": page,
            }

            try:
                async with session.post(BASE_URL, headers=headers, json=payload) as resp:
                    if resp.status != 200:
                        print(f" x Error {topic} (Page {page}): Status {resp.status}")
                        break

                    body = await resp.json()
                    page_items = body.get("news", [])
                    if not page_items:
                        break  # ran out of results

                    collected.extend(page_items)

                    # Honour the per-topic cap exactly.
                    if len(collected) >= max_articles:
                        collected = collected[:max_articles]
                        break
            except Exception as exc:
                print(f" x Exception {topic}: {exc}")
                break

    print(f"βœ… Finished: {topic} ({len(collected)} articles)")
    return collected
69
+
70
+
71
# --- MAIN ORCHESTRATOR ---
async def start_async_search(topics: list, geo_code: str, days_back: int, max_articles: int, api_key: str,
                             country_name: str):
    """Fan out one Serper news search per topic and merge the results.

    Articles are deduplicated by link across all topics; the merged list is
    also written to ``unique_articles.json`` for debugging.
    """
    started = time.time()

    # Bound concurrency and use certifi's CA bundle for TLS verification.
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    async with aiohttp.ClientSession(connector=connector) as session:
        per_topic = await asyncio.gather(*[
            fetch_topic(session, topic, limiter, geo_code, days_back, max_articles, api_key, country_name)
            for topic in topics
        ])

    # Deduplicate by URL: the first occurrence of each link wins.
    by_link = {}
    for batch in per_topic:
        for article in batch:
            link = article.get('link')
            if link and link not in by_link:
                by_link[link] = article

    deduped = list(by_link.values())

    # Debug artifact of everything that was kept.
    with open("unique_articles.json", "w", encoding="utf-8") as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 40)
    print(f"Total Time: {time.time() - started:.2f} seconds")
    print(f"Total Unique Articles: {len(deduped)}")
    print("=" * 40)

    return deduped
113
+
114
+
115
def search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, country_name):
    """Blocking wrapper around :func:`start_async_search` for sync callers."""
    found = asyncio.run(start_async_search(
        topics=topic_list,
        geo_code=geo_code,
        days_back=days_back,
        max_articles=max_news,
        api_key=SERPER_API_KEY,
        country_name=country_name,
    ))

    # Quick sanity log of what came back.
    print(f"Search_news captured {len(found)} articles.")
    if found:
        print(f"Sample title: {found[0].get('title')}")

    return found
131
+
132
+
133
# ************** Company URL part
async def fetch_url_from_serper(session, company_name, api_key):
    """Best-effort lookup of one company's official website via Serper.

    Resolution order:
      1. an organic result whose *domain* contains the company name
         (aggregator/profile sites like Wikipedia or LinkedIn are skipped),
      2. a URL embedded in a result snippet that matches the name
         (handles companies whose official site does not rank),
      3. the first non-blacklisted organic result as a fallback.

    Returns '' when nothing usable is found or the request fails.
    """
    endpoint = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{company_name} official website", "num": 1})
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}

    def clean(text):
        # Strip non-word chars and lowercase, e.g. "99 Startups" -> "99startups".
        return re.sub(r'\W+', '', text).lower()

    target_name = clean(company_name)

    # Aggregator/profile domains that are never the official site.
    blacklist = ["wikipedia", "linkedin", "bloomberg", "crunchbase", "facebook", "instagram", "youtube"]

    try:
        async with session.post(endpoint, headers=headers, data=payload) as response:
            if response.status == 200:
                data = await response.json()
                if "organic" not in data:
                    return ""

                organic = data["organic"]

                # --- Strategy 1: domain itself contains the company name ---
                for hit in organic:
                    candidate = hit.get("link", "")
                    domain = urlparse(candidate).netloc.lower()

                    if any(bad in domain for bad in blacklist):
                        continue

                    if target_name in clean(domain):
                        return candidate

                # --- Strategy 2: URL hidden inside a result snippet ---
                for hit in organic:
                    snippet = hit.get("snippet", "")
                    embedded_urls = re.findall(r'(?:www\.|https?://)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', snippet)

                    for embedded in embedded_urls:
                        if target_name in clean(embedded):
                            # Ensure the returned URL carries a schema.
                            if not embedded.startswith("http"):
                                return f"https://{embedded}"
                            return embedded

                # --- Strategy 3: first non-blacklisted result (best guess) ---
                for hit in organic:
                    candidate = hit.get("link", "")
                    if not any(bad in candidate for bad in blacklist):
                        return candidate

    except Exception as exc:
        print(f"⚠️ Serper error for {company_name}: {exc}")

    return ""
213
+
214
+
215
async def run_batch_search(company_names, api_key):
    """Resolve many company websites concurrently over one TLS session.

    Returns ``{company_name: url}`` (url is '' when the lookup failed).
    """
    # Trusted CA bundle from certifi instead of the (possibly empty) system store.
    tls = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=tls)

    async with aiohttp.ClientSession(connector=connector) as session:
        # One concurrent lookup per company, results in input order.
        found_urls = await asyncio.gather(
            *(fetch_url_from_serper(session, name, api_key) for name in company_names)
        )

    return dict(zip(company_names, found_urls))
240
+
241
+
242
def fill_missing_urls(data_list, sheet_name, serper_api_key):
    """Resolve every row whose ``company_url`` is the 'SEARCH_REQUIRED' sentinel.

    Pipeline (mutates *data_list* in place and returns it):
      1. collect the rows that still need a website,
      2. try the Google-Sheet cache first,
      3. hit Serper only for companies the cache doesn't know,
      4. write URLs back and append fresh finds to the cache.
    """
    # 1. Rows flagged by the LLM as needing a real web search.
    pending = [idx for idx, row in enumerate(data_list) if row.get('company_url') == 'SEARCH_REQUIRED']

    if not pending:
        print("βœ… No searches required.")
        return data_list

    print(f"πŸ” Processing {len(pending)} missing URLs...")

    # Deduplicate company names before any lookup.
    names_needed = {data_list[idx]['company_name'] for idx in pending}

    # 2. Cache lookup (best effort -- a failure just means we search everything).
    try:
        sheet = connect_to_sheet(sheet_name)
        cached = load_cache_dict(sheet)
    except Exception as exc:
        print(f"⚠️ Cache connection failed, skipping cache: {exc}")
        sheet = None
        cached = {}

    hits = {}
    misses = []
    for name in names_needed:
        key = name.lower().strip()  # must match how cache keys are saved
        if key in cached:
            hits[name] = cached[key]
        else:
            misses.append(name)

    print(f" - Found in Cache: {len(hits)}")
    print(f" - Need API Search: {len(misses)}")

    # 3. Serper search for whatever the cache didn't have.
    searched = {}
    if misses:
        print(f"🌍 Searching internet for {len(misses)} companies...")
        searched = asyncio.run(run_batch_search(misses, serper_api_key))

    # 4a. Write URLs back into the original rows ('' when nothing was found).
    known = {**hits, **searched}
    for idx in pending:
        data_list[idx]['company_url'] = known.get(data_list[idx]['company_name'], "")

    # 4b. Persist only the fresh, non-empty search results to the cache.
    if sheet and searched:
        append_to_cache(sheet, [
            {'Company': name, 'Website': site}
            for name, site in searched.items()
            if site  # only cache actual finds
        ])
    else:
        print('Nothing appended to cache')

    return data_list
src/streamlit_app.py CHANGED
@@ -1,40 +1,190 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
"""Streamlit UI for the News Finder agent.

Flow per button click: Serper news search -> LLM company extraction ->
join with article metadata -> resolve missing websites (cache + Serper) ->
render as a table with a CSV download.
"""
import streamlit as st
import pandas as pd
from datetime import datetime
from search import search_news, fill_missing_urls
from fetch_and_extract import get_companies_and_articles
from helpers import match_companies_to_articles
from config import MAX_NEWS_PER_TOPIC, MAX_TOPICS, COMPANY_CACHE_SHEET_NAME
import os

# --- PAGE CONFIGURATION ---
st.set_page_config(page_title="News Finder Agent", page_icon="πŸ•΅οΈ", layout="wide")

# --- SESSION STATE INITIALIZATION ---
# results_data survives Streamlit reruns, so the table and download button
# stay visible after the search finishes.
if 'results_data' not in st.session_state:
    st.session_state.results_data = None

# --- MAIN INTERFACE ---
st.title("πŸ•΅οΈ News Finder AI Agent")
st.markdown("Enter your topics below to generate a report of companies mentioned in the news.")

# 1. TOPIC INPUT (comma separated; capped at MAX_TOPICS below)
topics_input = st.text_area(
    f"1. Topics (Comma separated), maximum {MAX_TOPICS} topics",
    placeholder="e.g. Artificial Intelligence, Nvidia, Supply Chain Logistics, Green Energy...",
    help="Paste your long list of topics here. The agent will dedup and search for all of them."
)

# Three side-by-side controls: geography, time frame, article limit.
col_geo, col_time, col_limit = st.columns(3)

# 2. GEOGRAPHY INPUT
# Display name -> ISO 3166-1 alpha-2 code (passed to the search as the
# geo code; "any" is the sentinel for a global search).
iso_countries = {
    # --- GLOBAL & NORTH AMERICA ---
    "Global": "any",
    "United States": "us",
    "Canada": "ca",

    # --- ASIA PACIFIC ---
    "Australia": "au",
    "China": "cn",
    "India": "in",
    "Japan": "jp",
    "Malaysia": "my",
    "South Korea": "kr",
    "Singapore": "sg",
    "Taiwan": "tw",
    "Hong Kong": "hk",

    # --- EUROPE (WESTERN) ---
    "United Kingdom": "gb",
    "Germany": "de",
    "France": "fr",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Belgium": "be",
    "Switzerland": "ch",
    "Austria": "at",
    "Ireland": "ie",
    "Luxembourg": "lu",
    "Portugal": "pt",

    # --- EUROPE (NORDIC) ---
    "Sweden": "se",
    "Norway": "no",
    "Denmark": "dk",
    "Finland": "fi",
    "Iceland": "is",

    # --- EUROPE (CENTRAL & EASTERN) ---
    "Poland": "pl",
    "Czech Republic": "cz",
    "Hungary": "hu",
    "Romania": "ro",
    "Ukraine": "ua",
    "Greece": "gr",
    "Turkey": "tr",
    "Bulgaria": "bg",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Slovenia": "si",
    "Serbia": "rs",

    # --- EUROPE (BALTIC) ---
    "Estonia": "ee",
    "Latvia": "lv",
    "Lithuania": "lt",
}

with col_geo:
    selected_country = st.selectbox(
        "2. Geography",
        options=list(iso_countries.keys()),
        index=0
    )
    geo_code = iso_countries[selected_country]

# 3. TIME FRAME INPUT
with col_time:
    days_back = st.slider(
        "3. Time Frame (Days Back)",
        min_value=1,
        max_value=30,
        value=7,
        help="How far back should we search for news?"
    )

# 4. MAX ARTICLES INPUT
with col_limit:
    max_news = st.number_input(
        "4. Max Articles per Topic",
        min_value=10,
        max_value=MAX_NEWS_PER_TOPIC,  # Restricted by config
        value=min(50, MAX_NEWS_PER_TOPIC),
        step=10,
        help=f"Control costs by limiting articles. Max allowed: {MAX_NEWS_PER_TOPIC}"
    )

# --- ACTION BUTTON ---
if st.button("πŸš€ Find News & Extract Companies", type="primary"):
    if not topics_input:
        st.error("⚠️ Please enter at least one topic.")
    else:
        # API keys are read from the environment at click time.
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        SERPER_API_KEY = os.environ.get('SERPER_API_KEY')

        topic_list = [t.strip() for t in topics_input.split(",") if t.strip()]

        # ENFORCE LIMIT ON TOPICS
        if len(topic_list) > MAX_TOPICS:
            st.warning(
                f"⚠️ Limit Reached: You entered {len(topic_list)} topics. Processing only the first {MAX_TOPICS}.")
            topic_list = topic_list[:MAX_TOPICS]

        with st.status("πŸ€– Agent is working...", expanded=True) as status:
            st.write(f"πŸ” Searching {len(topic_list)} topics in {selected_country} (Max {max_news} articles each)...")

            # 1. Search News (deduplicated by URL inside search_news)
            articles = search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, selected_country)

            if not articles:
                status.update(label="❌ No news found!", state="error")
                st.stop()  # halts this script run here

            st.write(f"βœ… Found {len(articles)} unique articles. πŸ› οΈ Extracting companies with LLM...")

            # 2. Extract Companies (LLM)
            urls_to_process = [a['link'] for a in articles]
            articles_with_companies_from_llm = get_companies_and_articles(urls_to_process, OPENAI_API_KEY)

            st.write(f"βœ… Generating results...")

            # 3. Combine LLM output with article metadata, then resolve the
            #    'SEARCH_REQUIRED' websites via the cache / Serper.
            matched_results = match_companies_to_articles(articles, articles_with_companies_from_llm)
            structured_results = fill_missing_urls(matched_results, COMPANY_CACHE_SHEET_NAME, SERPER_API_KEY)

            status.update(label="βœ… Search Complete!", state="complete", expanded=False)

        # SAVE RESULTS into session state for rendering below
        if structured_results:
            st.session_state.results_data = pd.DataFrame(structured_results)
        else:
            st.warning("No companies found in the extracted text.")

# --- RESULTS & DOWNLOAD ---
if st.session_state.results_data is not None:
    st.divider()
    st.subheader("πŸ“‚ Extracted Data")

    st.dataframe(
        st.session_state.results_data,
        column_config={
            "company_url": st.column_config.LinkColumn(
                "Website"  # Full URL shown, clickable
            ),
            "article_url": st.column_config.LinkColumn(
                "Source Article"  # Full URL shown, clickable
            ),
        },
        use_container_width=True
    )

    csv = st.session_state.results_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="πŸ“₯ Download Results as CSV",
        data=csv,
        file_name=f"news_extraction_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
        mime="text/csv",
        type="primary"
    )