chummchumm committed on
Commit
8b425b2
Β·
verified Β·
1 Parent(s): d20c82c

Upload 6 files

Browse files
Files changed (6) hide show
  1. src/cache.py +159 -0
  2. src/config.py +31 -0
  3. src/fetch_and_extract.py +122 -0
  4. src/helpers.py +76 -0
  5. src/search.py +315 -0
  6. src/streamlit_app.py +188 -38
src/cache.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gspread
2
+ from google.oauth2.service_account import Credentials
3
+ import streamlit as st
4
+ import os
5
+ import json
6
+
7
+
8
# --- 1. SETUP GOOGLE SHEETS CONNECTION ---
def connect_to_sheet(sheet_name):
    """Authenticate with Google and return the first worksheet of *sheet_name*.

    Credential resolution order:
      1. A local ``credentials.json`` service-account file (development).
      2. The ``gcp_service_account`` environment variable containing the
         service-account JSON as a string (deployment).

    Raises:
        ValueError: when neither credential source is available.
    """
    oauth_scopes = [
        "https://www.googleapis.com/auth/spreadsheets",
        "https://www.googleapis.com/auth/drive",
    ]

    if os.path.exists("credentials.json"):
        # Development path: key file sitting next to the app.
        print("πŸ“‚ Using local 'credentials.json' file.")
        creds = Credentials.from_service_account_file("credentials.json", scopes=oauth_scopes)
    else:
        # Deployment path: the JSON key material is injected via the environment.
        print("☁️ 'credentials.json' not found. Checking Environment Variables...")
        raw_key = os.environ.get("gcp_service_account")
        if not raw_key:
            raise ValueError(
                "❌ Error: Could not find 'credentials.json' LOCALLY, and 'gcp_service_account' is missing from ENV vars."
            )
        creds = Credentials.from_service_account_info(json.loads(raw_key), scopes=oauth_scopes)

    # Authorize gspread and hand back the first worksheet of the spreadsheet.
    return gspread.authorize(creds).open(sheet_name).sheet1
36
+ # def connect_to_sheet(sheet_name):
37
+ # # Define the scopes (Permissions)
38
+ # scopes = [
39
+ # "https://www.googleapis.com/auth/spreadsheets",
40
+ # "https://www.googleapis.com/auth/drive"
41
+ # ]
42
+ #
43
+ # creds_json_str = os.environ.get("gcp_service_account")
44
+ #
45
+ # if not creds_json_str:
46
+ # raise ValueError(
47
+ # "❌ Error: Could not find 'gcp_service_account' in Environment Variables. Did you add the secret in Hugging Face settings?")
48
+ #
49
+ # # 2. Convert the String back into a Python Dictionary
50
+ # creds_dict = json.loads(creds_json_str)
51
+ #
52
+ # # 3. Create credentials object
53
+ # creds = Credentials.from_service_account_info(
54
+ # creds_dict,
55
+ # scopes=scopes
56
+ # )
57
+ #
58
+ # # Authorize gspread
59
+ # client = gspread.authorize(creds)
60
+ #
61
+ # # Open the sheet
62
+ # return client.open(sheet_name).sheet1
63
+
64
def load_cache_dict(sheet):
    """Read the whole cache sheet into a ``{normalized name: url}`` dict.

    Keys are lower-cased and stripped so lookups are case/whitespace
    insensitive.  Any read failure (e.g. an empty sheet) yields ``{}``.
    """
    try:
        lookup = {}
        for record in sheet.get_all_records():
            company = record['Company']
            if company:  # skip blank rows
                lookup[company.lower().strip()] = record['Website']
        return lookup
    except Exception as exc:
        print(f"⚠️ Cache read error (empty sheet?): {exc}")
        return {}
73
+
74
+
75
def append_to_cache(sheet, new_entries):
    """Persist newly discovered companies to the cache worksheet.

    Args:
        sheet: worksheet object exposing ``append_rows``.
        new_entries: list of ``{'Company': name, 'Website': url}`` dicts;
            an empty (or falsy) list is a no-op.

    Failures are logged, never raised, so caching stays best-effort.
    """
    if not new_entries:
        return

    payload = []
    for entry in new_entries:
        payload.append([entry['Company'], entry['Website']])

    try:
        sheet.append_rows(payload)
        print(f"πŸ’Ύ Cached {len(payload)} new companies.")
    except Exception as exc:
        print(f"❌ Error saving to cache: {exc}")
86
+
87
+ # # --- 2. LOAD CACHE (READ URLS FROM SHEET) ---
88
+ # def load_cache(sheet):
89
+ # """
90
+ # Reads the entire sheet and creates a dictionary:
91
+ # {'Nvidia': 'https://nvidia.com', 'Tesla': 'https://tesla.com'}
92
+ # """
93
+ # print("πŸ“‚ Reading existing data from Google Sheet...")
94
+ # data = sheet.get_all_records() # Assumes headers: "Company", "Website"
95
+ #
96
+ # # Create a quick lookup dictionary (Normalize names to lowercase to be safe)
97
+ # cache = {row['Company'].lower().strip(): row['Website'] for row in data if row['Company']}
98
+ # return cache
99
+ #
100
+ #
101
+ # # --- 3. THE "CASH SAVER" FUNCTION ---
102
+ # def save_companies_to_cache(new_rows_to_add, sheet_name):
103
+ # sheet = connect_to_sheet(sheet_name)
104
+ #
105
+ # # 1. Check if there is data
106
+ # if new_rows_to_add:
107
+ # print(f"πŸ’Ύ Saving {len(new_rows_to_add)} new companies to Sheet...")
108
+ #
109
+ # # 2. CONVERT Dicts to Lists
110
+ # values_to_upload = [
111
+ # [item.get('company_name'), item.get('company_website')]
112
+ # for item in new_rows_to_add
113
+ # ]
114
+ #
115
+ # # 3. Append to Sheet
116
+ # sheet.append_rows(values_to_upload)
117
+ # print("βœ… Save Complete.")
118
+ #
119
+ # else:
120
+ # print("πŸŽ‰ No new searches needed. Sheet is up to date!")
121
+ #
122
+ #
123
+ # # --- 3. THE LOAD CACHED COMPANIES FUNCTION ---
124
+ # def get_cached_companies(company_list, sheet_name):
125
+ # """
126
+ # Splits companies into 'Found in Cache' and 'Missing (Need to Search)'
127
+ # """
128
+ # sheet = connect_to_sheet(sheet_name)
129
+ # cache = load_cache(sheet)
130
+ #
131
+ # companies_in_cache = []
132
+ # missing_companies = []
133
+ #
134
+ # print(f"πŸ“‚ Checking Cache for {len(company_list)} companies...")
135
+ # # print(company_list)
136
+ #
137
+ # for item in company_list:
138
+ # name = item # item['company_name']
139
+ # # Normalize key for matching (must match how you save them)
140
+ # name_key = name.lower().strip()
141
+ #
142
+ # # === CHECK CACHE ===
143
+ # # We check if key exists AND value is not empty
144
+ # if name_key in cache and cache[name_key]:
145
+ # # print(f" βœ… Cache Hit: {name}") # Optional: Comment out to reduce noise
146
+ # companies_in_cache.append({
147
+ # 'company_name': name,
148
+ # 'company_website': cache[name_key]
149
+ # })
150
+ # # === MISSING ===
151
+ # else:
152
+ # # print(f" πŸ”Ž Cache Miss: {name}")
153
+ # missing_companies.append(item)
154
+ #
155
+ # # RETURN OUTSIDE THE LOOP
156
+ # return {
157
+ # 'found': companies_in_cache,
158
+ # 'missing': missing_companies
159
+ # }
src/config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# config.py
#
# Central tuning constants for the news pipeline.  Import names from here
# (e.g. ``from config import MAX_TOPICS``) instead of hard-coding limits.

# --- LIMITS ---
# Maximum number of topics allowed in one run
MAX_TOPICS = 20

# Maximum number of articles to process per search (Search & Clean phase)
MAX_NEWS_PER_TOPIC = 500

# Maximum number of articles to send to GPT (Extraction phase) to save money
#MAX_ARTICLES_TO_LLM = 10000

# Number of articles to process in one LLM call
#LLM_BATCH_SIZE = 15

#SERPER_RESULTS_PER_PAGE = 20

# --- API DEFAULTS ---
DEFAULT_DAYS_BACK = 7
DEFAULT_COUNTRY = "us"

# "Reporter" plan limits https://worldnewsapi.com/pricing/
WORLD_NEWS_REQUESTS_PER_SECOND = 2.0
WORLD_NEWS_MAX_CONCURRENT_REQUESTS = 5

# Google settings
# Name of the Google Sheet used as the company-website cache (see cache.py).
COMPANY_CACHE_SHEET_NAME = "company_info_cache"
src/fetch_and_extract.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ # import aiohttp
3
+ # import os
4
+ import json
5
+ import trafilatura
6
+ from openai import AsyncOpenAI
7
+ from pydantic import BaseModel, Field
8
+ from typing import List
9
+
10
# --- CONFIGURATION ---
# Upper bound on simultaneous page downloads (thread-pool fetches).
MAX_SCRAPE_CONCURRENCY = 10
# Upper bound on in-flight OpenAI extraction calls.
MAX_AI_CONCURRENCY = 5
13
+
14
+
15
+
16
+ # --- DATA MODELS ---
17
+ class CompanyResult(BaseModel):
18
+ name: str = Field(..., description="Name of the commercial company")
19
+ url: str = Field(..., description="""Official website URL. Predict if missing.
20
+ If you are NOT 100% sure about the official website,
21
+ respond ONLY with:'SEARCH_REQUIRED'""")
22
+ article_id: int = Field(..., description="The ID of the article provided in context")
23
+
24
+
25
+ class ExtractionResponse(BaseModel):
26
+ companies: List[CompanyResult]
27
+
28
+
29
+ # --- ROBUST WORKER ---
30
+ async def process_article(url: str, article_id: int, scrape_sem, ai_sem, OPENAI_API_KEY):
31
+ loop = asyncio.get_running_loop()
32
+
33
+ # 1. Fetch & Extract (Using Trafilatura's robust fetcher)
34
+ async with scrape_sem:
35
+ try:
36
+ # Run the synchronous fetch_url in a separate thread
37
+ downloaded = await loop.run_in_executor(None, trafilatura.fetch_url, url)
38
+
39
+ if downloaded is None:
40
+ return {"url": url, "error": "Fetch failed (blocked or 404)"}
41
+
42
+ # Extract text (also CPU bound, so runs in executor)
43
+ text = await loop.run_in_executor(None, trafilatura.extract, downloaded)
44
+
45
+ if not text:
46
+ return {"url": url, "error": "No main text found"}
47
+
48
+ except Exception as e:
49
+ return {"url": url, "error": f"Scrape error: {str(e)}"}
50
+
51
+ # 2. AI Extraction
52
+ truncated_text = text[:5000] # Trim to save tokens
53
+ user_content = f"Article ID: {article_id}\n\nText:\n{truncated_text}"
54
+ client = AsyncOpenAI(api_key=OPENAI_API_KEY)
55
+ async with ai_sem:
56
+ try:
57
+ completion = await client.beta.chat.completions.parse(
58
+ model="gpt-4o-mini",
59
+ messages=[
60
+ {"role": "system", "content": "Extract commercial companies. Exclude generic entities, countries, government bodies."},
61
+ {"role": "user", "content": user_content},
62
+ ],
63
+ response_format=ExtractionResponse,
64
+ temperature=0
65
+ )
66
+
67
+ result_obj = completion.choices[0].message.parsed
68
+
69
+ return {
70
+ "url": url,
71
+ "status": "success",
72
+ "companies": [c.model_dump() for c in result_obj.companies]
73
+ }
74
+
75
+ except Exception as e:
76
+ return {"url": url, "error": f"AI error: {str(e)}"}
77
+
78
+
79
# --- MAIN ORCHESTRATOR ---
async def run_pipeline(urls: List[str], OPENAI_API_KEY):
    """Process every URL concurrently and return the successful extractions.

    Failures are only counted and reported; successes are additionally
    dumped to ``final_results.json`` for offline inspection.
    """
    scrape_sem = asyncio.Semaphore(MAX_SCRAPE_CONCURRENCY)
    ai_sem = asyncio.Semaphore(MAX_AI_CONCURRENCY)

    print(f"πŸš€ Processing {len(urls)} articles...")

    # trafilatura does its own HTTP, so no shared aiohttp session is needed here.
    outcomes = await asyncio.gather(*[
        process_article(url, idx, scrape_sem, ai_sem, OPENAI_API_KEY)
        for idx, url in enumerate(urls)
    ])

    # Split by presence of the 'error' key.
    success = [o for o in outcomes if "error" not in o]
    failures = [o for o in outcomes if "error" in o]

    print(f"\nβœ… Completed: {len(success)}")
    print(f"❌ Failed: {len(failures)}")

    if success:
        print(f"\n[Sample Output]:\n{json.dumps(success[0], indent=2)}")

    # Debug artifact: persist what was successfully extracted.
    with open("final_results.json", "w") as f:
        json.dump(success, f, indent=2)

    return success
108
+
109
+
110
def get_companies_and_articles(article_url: list, OPENAI_API_KEY):
    """Blocking facade over :func:`run_pipeline` for non-async callers."""
    return asyncio.run(run_pipeline(article_url, OPENAI_API_KEY))
113
+
114
+ # if __name__ == "__main__":
115
+ # # REAL, LIVE URLs (Checked Feb 4, 2026)
116
+ # live_urls = [
117
+ # "https://newsroom.ibm.com/2026-02-04-ibm-opens-global-rfp-for-ai-driven-solutions-shaping-the-future-of-work-and-education",
118
+ # "https://eng.lsm.lv/article/society/defence/04.02.2026-artificial-intelligence-centre-to-get-230000-euros-from-defence-budget.a633009/",
119
+ # "https://www.unesco.org/en/articles/tech-spark-africa-advances-simulation-based-learning-skills-development"
120
+ # ]
121
+ #
122
+ # companies_with_articles = asyncio.run(run_pipeline(live_urls))
src/helpers.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #import gspread
2
+
3
+
4
def match_companies_to_articles(articles_metadata, ai_results):
    """Join the AI extraction output back onto the article metadata.

    Args:
        articles_metadata: dicts with at least ``link`` and ``title``.
        ai_results: per-article dicts with ``url`` and an optional
            ``companies`` list of ``{'name', 'url'}`` dicts.

    Returns:
        Flat records sorted by company name:
        ``{'company_name', 'company_url', 'article_title', 'article_url'}``.
        Articles whose URL is absent from the metadata get the title
        "Unknown Title"; companies without a URL get ``''``.
    """
    # URL -> title index so each lookup is O(1) instead of a scan.
    titles = {meta['link']: meta['title'] for meta in articles_metadata}

    flattened = [
        {
            "company_name": found['name'],
            "company_url": found.get('url', ''),  # tolerate missing URLs
            "article_title": titles.get(extraction.get('url'), "Unknown Title"),
            "article_url": extraction.get('url'),
        }
        for extraction in ai_results
        for found in extraction.get('companies', [])
    ]

    return sorted(flattened, key=lambda rec: rec['company_name'])
29
+ #
30
+ # def connect_to_sheet(json_keyfile, sheet_name):
31
+ # """Authenticates and returns the worksheet object."""
32
+ # try:
33
+ # gc = gspread.service_account(filename=json_keyfile)
34
+ # sh = gc.open(sheet_name)
35
+ # return sh.sheet1
36
+ # except Exception as e:
37
+ # print(f"❌ Error connecting to Google Sheets: {e}")
38
+ # return None
39
+ #
40
+ #
41
+ # def get_cached_websites(worksheet):
42
+ # """
43
+ # Returns a dictionary of existing companies: {'Tesla': 'tesla.com', ...}
44
+ # """
45
+ # if not worksheet: return {}
46
+ #
47
+ # print("πŸ“‚ Reading cache from Google Sheets...")
48
+ # try:
49
+ # records = worksheet.get_all_records()
50
+ # # Convert list of dicts to a lookup map
51
+ # return {
52
+ # row['company_name']: row['company_website']
53
+ # for row in records
54
+ # if row.get('company_name')
55
+ # }
56
+ # except Exception:
57
+ # return {}
58
+ #
59
+ #
60
+ # def save_new_websites(worksheet, new_data):
61
+ # """
62
+ # Appends new data to the sheet.
63
+ # Expects a list of dicts: [{'company_name': 'X', 'company_website': 'Y'}]
64
+ # """
65
+ # if not worksheet or not new_data: return
66
+ #
67
+ # print(f"πŸ’Ύ Saving {len(new_data)} new entries to Google Sheets...")
68
+ #
69
+ # # Prepare rows as list of lists: [['Name', 'URL'], ['Name', 'URL']]
70
+ # rows = [[item['company_name'], item['company_website']] for item in new_data]
71
+ #
72
+ # # Add headers if sheet is empty
73
+ # if not worksheet.get_all_values():
74
+ # worksheet.append_row(["company_name", "company_website"])
75
+ #
76
+ # worksheet.append_rows(rows)
src/search.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import aiohttp
3
+ import time
4
+ import json
5
+ import math
6
+ import certifi
7
+ import ssl
8
+ from cache import connect_to_sheet, load_cache_dict, append_to_cache
9
+ import re
10
+ from urllib.parse import urlparse
11
+
12
# --- CONFIGURATION ---
# Serper news-search endpoint.
BASE_URL = "https://google.serper.dev/news"
RESULTS_PER_PAGE = 100  # Serper max per request
MAX_CONCURRENCY = 10  # Avoid 429 errors
16
+
17
+
18
# --- WORKER: Fetch articles for one topic ---
async def fetch_topic(session, topic, sem, geo_code, days_back, max_articles, api_key, country_name=""):
    """Page through Serper /news for one topic, up to *max_articles* items.

    Stops early on a non-200 response, an empty page, or any exception;
    whatever has been collected so far is returned.
    """
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}
    time_filter = f"qdr:d{days_back}"  # Google "last N days" filter

    # Pages needed at RESULTS_PER_PAGE results each to honour max_articles.
    pages_needed = math.ceil(max_articles / RESULTS_PER_PAGE)

    collected = []
    async with sem:
        print(f"--> Starting: {topic}")

        # Bias the query toward the chosen country unless searching globally.
        query = f"{topic} {country_name}" if country_name and country_name != "Global" else topic

        for page in range(1, pages_needed + 1):
            payload = {
                "q": query,
                "gl": geo_code,
                "tbs": time_filter,
                "num": RESULTS_PER_PAGE,
                "page": page,
            }

            try:
                async with session.post(BASE_URL, headers=headers, json=payload) as resp:
                    if resp.status != 200:
                        print(f" x Error {topic} (Page {page}): Status {resp.status}")
                        break

                    body = await resp.json()
                    page_items = body.get("news", [])
                    if not page_items:
                        break  # ran out of results

                    collected.extend(page_items)

                    # Honour the per-topic cap exactly.
                    if len(collected) >= max_articles:
                        collected = collected[:max_articles]
                        break
            except Exception as exc:
                print(f" x Exception {topic}: {exc}")
                break

    print(f"βœ… Finished: {topic} ({len(collected)} articles)")
    return collected
69
+
70
+
71
# --- MAIN ORCHESTRATOR ---
async def start_async_search(topics: list, geo_code: str, days_back: int, max_articles: int, api_key: str,
                             country_name: str):
    """Fan out one Serper news search per topic and merge the results.

    Articles are deduplicated by link across all topics; the merged list is
    also written to ``unique_articles.json`` for debugging.
    """
    started = time.time()

    # Bound concurrency and use certifi's CA bundle for TLS verification.
    limiter = asyncio.Semaphore(MAX_CONCURRENCY)
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    async with aiohttp.ClientSession(connector=connector) as session:
        per_topic = await asyncio.gather(*[
            fetch_topic(session, topic, limiter, geo_code, days_back, max_articles, api_key, country_name)
            for topic in topics
        ])

    # Deduplicate by URL: the first occurrence of each link wins.
    by_link = {}
    for batch in per_topic:
        for article in batch:
            link = article.get('link')
            if link and link not in by_link:
                by_link[link] = article

    deduped = list(by_link.values())

    # Debug artifact of everything that was kept.
    with open("unique_articles.json", "w", encoding="utf-8") as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 40)
    print(f"Total Time: {time.time() - started:.2f} seconds")
    print(f"Total Unique Articles: {len(deduped)}")
    print("=" * 40)

    return deduped
113
+
114
+
115
def search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, country_name):
    """Blocking wrapper around :func:`start_async_search` for sync callers."""
    found = asyncio.run(start_async_search(
        topics=topic_list,
        geo_code=geo_code,
        days_back=days_back,
        max_articles=max_news,
        api_key=SERPER_API_KEY,
        country_name=country_name,
    ))

    # Quick sanity log of what came back.
    print(f"Search_news captured {len(found)} articles.")
    if found:
        print(f"Sample title: {found[0].get('title')}")

    return found
131
+
132
+
133
# ************** Company URL part
async def fetch_url_from_serper(session, company_name, api_key):
    """Best-effort lookup of one company's official website via Serper.

    Resolution order:
      1. an organic result whose *domain* contains the company name
         (aggregator/profile sites like Wikipedia or LinkedIn are skipped),
      2. a URL embedded in a result snippet that matches the name
         (handles companies whose official site does not rank),
      3. the first non-blacklisted organic result as a fallback.

    Returns '' when nothing usable is found or the request fails.
    """
    endpoint = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{company_name} official website", "num": 1})
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}

    def clean(text):
        # Strip non-word chars and lowercase, e.g. "99 Startups" -> "99startups".
        return re.sub(r'\W+', '', text).lower()

    target_name = clean(company_name)

    # Aggregator/profile domains that are never the official site.
    blacklist = ["wikipedia", "linkedin", "bloomberg", "crunchbase", "facebook", "instagram", "youtube"]

    try:
        async with session.post(endpoint, headers=headers, data=payload) as response:
            if response.status == 200:
                data = await response.json()
                if "organic" not in data:
                    return ""

                organic = data["organic"]

                # --- Strategy 1: domain itself contains the company name ---
                for hit in organic:
                    candidate = hit.get("link", "")
                    domain = urlparse(candidate).netloc.lower()

                    if any(bad in domain for bad in blacklist):
                        continue

                    if target_name in clean(domain):
                        return candidate

                # --- Strategy 2: URL hidden inside a result snippet ---
                for hit in organic:
                    snippet = hit.get("snippet", "")
                    embedded_urls = re.findall(r'(?:www\.|https?://)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', snippet)

                    for embedded in embedded_urls:
                        if target_name in clean(embedded):
                            # Ensure the returned URL carries a schema.
                            if not embedded.startswith("http"):
                                return f"https://{embedded}"
                            return embedded

                # --- Strategy 3: first non-blacklisted result (best guess) ---
                for hit in organic:
                    candidate = hit.get("link", "")
                    if not any(bad in candidate for bad in blacklist):
                        return candidate

    except Exception as exc:
        print(f"⚠️ Serper error for {company_name}: {exc}")

    return ""
213
+
214
+
215
async def run_batch_search(company_names, api_key):
    """Resolve many company websites concurrently over one TLS session.

    Returns ``{company_name: url}`` (url is '' when the lookup failed).
    """
    # Trusted CA bundle from certifi instead of the (possibly empty) system store.
    tls = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=tls)

    async with aiohttp.ClientSession(connector=connector) as session:
        # One concurrent lookup per company, results in input order.
        found_urls = await asyncio.gather(
            *(fetch_url_from_serper(session, name, api_key) for name in company_names)
        )

    return dict(zip(company_names, found_urls))
240
+
241
+
242
def fill_missing_urls(data_list, sheet_name, serper_api_key):
    """Resolve every row whose ``company_url`` is the 'SEARCH_REQUIRED' sentinel.

    Pipeline (mutates *data_list* in place and returns it):
      1. collect the rows that still need a website,
      2. try the Google-Sheet cache first,
      3. hit Serper only for companies the cache doesn't know,
      4. write URLs back and append fresh finds to the cache.
    """
    # 1. Rows flagged by the LLM as needing a real web search.
    pending = [idx for idx, row in enumerate(data_list) if row.get('company_url') == 'SEARCH_REQUIRED']

    if not pending:
        print("βœ… No searches required.")
        return data_list

    print(f"πŸ” Processing {len(pending)} missing URLs...")

    # Deduplicate company names before any lookup.
    names_needed = {data_list[idx]['company_name'] for idx in pending}

    # 2. Cache lookup (best effort -- a failure just means we search everything).
    try:
        sheet = connect_to_sheet(sheet_name)
        cached = load_cache_dict(sheet)
    except Exception as exc:
        print(f"⚠️ Cache connection failed, skipping cache: {exc}")
        sheet = None
        cached = {}

    hits = {}
    misses = []
    for name in names_needed:
        key = name.lower().strip()  # must match how cache keys are saved
        if key in cached:
            hits[name] = cached[key]
        else:
            misses.append(name)

    print(f" - Found in Cache: {len(hits)}")
    print(f" - Need API Search: {len(misses)}")

    # 3. Serper search for whatever the cache didn't have.
    searched = {}
    if misses:
        print(f"🌍 Searching internet for {len(misses)} companies...")
        searched = asyncio.run(run_batch_search(misses, serper_api_key))

    # 4a. Write URLs back into the original rows ('' when nothing was found).
    known = {**hits, **searched}
    for idx in pending:
        data_list[idx]['company_url'] = known.get(data_list[idx]['company_name'], "")

    # 4b. Persist only the fresh, non-empty search results to the cache.
    if sheet and searched:
        append_to_cache(sheet, [
            {'Company': name, 'Website': site}
            for name, site in searched.items()
            if site  # only cache actual finds
        ])
    else:
        print('Nothing appended to cache')

    return data_list
src/streamlit_app.py CHANGED
@@ -1,40 +1,190 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
1
"""Streamlit UI for the News Finder agent.

Flow per button click: Serper news search -> LLM company extraction ->
join with article metadata -> resolve missing websites (cache + Serper) ->
render as a table with a CSV download.
"""
import streamlit as st
import pandas as pd
from datetime import datetime
from search import search_news, fill_missing_urls
from fetch_and_extract import get_companies_and_articles
from helpers import match_companies_to_articles
from config import MAX_NEWS_PER_TOPIC, MAX_TOPICS, COMPANY_CACHE_SHEET_NAME
import os

# --- PAGE CONFIGURATION ---
st.set_page_config(page_title="News Finder Agent", page_icon="πŸ•΅οΈ", layout="wide")

# --- SESSION STATE INITIALIZATION ---
# results_data survives Streamlit reruns, so the table and download button
# stay visible after the search finishes.
if 'results_data' not in st.session_state:
    st.session_state.results_data = None

# --- MAIN INTERFACE ---
st.title("πŸ•΅οΈ News Finder AI Agent")
st.markdown("Enter your topics below to generate a report of companies mentioned in the news.")

# 1. TOPIC INPUT (comma separated; capped at MAX_TOPICS below)
topics_input = st.text_area(
    f"1. Topics (Comma separated), maximum {MAX_TOPICS} topics",
    placeholder="e.g. Artificial Intelligence, Nvidia, Supply Chain Logistics, Green Energy...",
    help="Paste your long list of topics here. The agent will dedup and search for all of them."
)

# Three side-by-side controls: geography, time frame, article limit.
col_geo, col_time, col_limit = st.columns(3)

# 2. GEOGRAPHY INPUT
# Display name -> ISO 3166-1 alpha-2 code (passed to the search as the
# geo code; "any" is the sentinel for a global search).
iso_countries = {
    # --- GLOBAL & NORTH AMERICA ---
    "Global": "any",
    "United States": "us",
    "Canada": "ca",

    # --- ASIA PACIFIC ---
    "Australia": "au",
    "China": "cn",
    "India": "in",
    "Japan": "jp",
    "Malaysia": "my",
    "South Korea": "kr",
    "Singapore": "sg",
    "Taiwan": "tw",
    "Hong Kong": "hk",

    # --- EUROPE (WESTERN) ---
    "United Kingdom": "gb",
    "Germany": "de",
    "France": "fr",
    "Italy": "it",
    "Spain": "es",
    "Netherlands": "nl",
    "Belgium": "be",
    "Switzerland": "ch",
    "Austria": "at",
    "Ireland": "ie",
    "Luxembourg": "lu",
    "Portugal": "pt",

    # --- EUROPE (NORDIC) ---
    "Sweden": "se",
    "Norway": "no",
    "Denmark": "dk",
    "Finland": "fi",
    "Iceland": "is",

    # --- EUROPE (CENTRAL & EASTERN) ---
    "Poland": "pl",
    "Czech Republic": "cz",
    "Hungary": "hu",
    "Romania": "ro",
    "Ukraine": "ua",
    "Greece": "gr",
    "Turkey": "tr",
    "Bulgaria": "bg",
    "Croatia": "hr",
    "Slovakia": "sk",
    "Slovenia": "si",
    "Serbia": "rs",

    # --- EUROPE (BALTIC) ---
    "Estonia": "ee",
    "Latvia": "lv",
    "Lithuania": "lt",
}

with col_geo:
    selected_country = st.selectbox(
        "2. Geography",
        options=list(iso_countries.keys()),
        index=0
    )
    geo_code = iso_countries[selected_country]

# 3. TIME FRAME INPUT
with col_time:
    days_back = st.slider(
        "3. Time Frame (Days Back)",
        min_value=1,
        max_value=30,
        value=7,
        help="How far back should we search for news?"
    )

# 4. MAX ARTICLES INPUT
with col_limit:
    max_news = st.number_input(
        "4. Max Articles per Topic",
        min_value=10,
        max_value=MAX_NEWS_PER_TOPIC,  # Restricted by config
        value=min(50, MAX_NEWS_PER_TOPIC),
        step=10,
        help=f"Control costs by limiting articles. Max allowed: {MAX_NEWS_PER_TOPIC}"
    )

# --- ACTION BUTTON ---
if st.button("πŸš€ Find News & Extract Companies", type="primary"):
    if not topics_input:
        st.error("⚠️ Please enter at least one topic.")
    else:
        # API keys are read from the environment at click time.
        OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
        SERPER_API_KEY = os.environ.get('SERPER_API_KEY')

        topic_list = [t.strip() for t in topics_input.split(",") if t.strip()]

        # ENFORCE LIMIT ON TOPICS
        if len(topic_list) > MAX_TOPICS:
            st.warning(
                f"⚠️ Limit Reached: You entered {len(topic_list)} topics. Processing only the first {MAX_TOPICS}.")
            topic_list = topic_list[:MAX_TOPICS]

        with st.status("πŸ€– Agent is working...", expanded=True) as status:
            st.write(f"πŸ” Searching {len(topic_list)} topics in {selected_country} (Max {max_news} articles each)...")

            # 1. Search News (deduplicated by URL inside search_news)
            articles = search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, selected_country)

            if not articles:
                status.update(label="❌ No news found!", state="error")
                st.stop()  # halts this script run here

            st.write(f"βœ… Found {len(articles)} unique articles. πŸ› οΈ Extracting companies with LLM...")

            # 2. Extract Companies (LLM)
            urls_to_process = [a['link'] for a in articles]
            articles_with_companies_from_llm = get_companies_and_articles(urls_to_process, OPENAI_API_KEY)

            st.write(f"βœ… Generating results...")

            # 3. Combine LLM output with article metadata, then resolve the
            #    'SEARCH_REQUIRED' websites via the cache / Serper.
            matched_results = match_companies_to_articles(articles, articles_with_companies_from_llm)
            structured_results = fill_missing_urls(matched_results, COMPANY_CACHE_SHEET_NAME, SERPER_API_KEY)

            status.update(label="βœ… Search Complete!", state="complete", expanded=False)

        # SAVE RESULTS into session state for rendering below
        if structured_results:
            st.session_state.results_data = pd.DataFrame(structured_results)
        else:
            st.warning("No companies found in the extracted text.")

# --- RESULTS & DOWNLOAD ---
if st.session_state.results_data is not None:
    st.divider()
    st.subheader("πŸ“‚ Extracted Data")

    st.dataframe(
        st.session_state.results_data,
        column_config={
            "company_url": st.column_config.LinkColumn(
                "Website"  # Full URL shown, clickable
            ),
            "article_url": st.column_config.LinkColumn(
                "Source Article"  # Full URL shown, clickable
            ),
        },
        use_container_width=True
    )

    csv = st.session_state.results_data.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="πŸ“₯ Download Results as CSV",
        data=csv,
        file_name=f"news_extraction_{datetime.now().strftime('%Y%m%d_%H%M')}.csv",
        mime="text/csv",
        type="primary"
    )