# newFinderAgent_v2 / src / search.py
# Uploaded by chummchumm ("Upload 6 files", commit 8b425b2, verified)
import asyncio
import aiohttp
import time
import json
import math
import certifi
import ssl
from cache import connect_to_sheet, load_cache_dict, append_to_cache
import re
from urllib.parse import urlparse
# --- CONFIGURATION ---
BASE_URL = "https://google.serper.dev/news"  # Serper Google-News search endpoint
RESULTS_PER_PAGE = 100 # Serper max per request
MAX_CONCURRENCY = 10 # Avoid 429 errors
# --- WORKER: Fetch articles for one topic ---
async def fetch_topic(session, topic, sem, geo_code, days_back, max_articles, api_key, country_name=""):
    """Fetch up to ``max_articles`` news results for one topic from Serper.

    Pages through the /news endpoint under the shared semaphore ``sem`` until
    the article limit is reached, a page comes back empty, or an error occurs.
    Returns the list of article dicts collected (possibly empty).
    """
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}
    time_filter = f"qdr:d{days_back}"
    # Pages needed to satisfy max_articles, given Serper's 100-per-page cap
    # (e.g. 50 articles -> 1 page, 150 articles -> 2 pages).
    pages_needed = math.ceil(max_articles / RESULTS_PER_PAGE)
    collected = []
    async with sem:
        print(f"--> Starting: {topic}")
        # Narrow the query by country unless searching globally.
        query = f"{topic} {country_name}" if country_name and country_name != "Global" else topic
        for page_num in range(1, pages_needed + 1):
            payload = {
                "q": query,
                "gl": geo_code,
                "tbs": time_filter,
                "num": RESULTS_PER_PAGE,
                "page": page_num
            }
            try:
                async with session.post(BASE_URL, headers=headers, json=payload) as resp:
                    if resp.status != 200:
                        print(f" x Error {topic} (Page {page_num}): Status {resp.status}")
                        break
                    data = await resp.json()
                    batch = data.get("news", [])
                    if not batch:
                        # Serper returned nothing — no further pages exist.
                        break
                    collected.extend(batch)
                    if len(collected) >= max_articles:
                        # Trim overshoot from the last page and stop paging.
                        collected = collected[:max_articles]
                        break
            except Exception as e:
                print(f" x Exception {topic}: {e}")
                break
        print(f"โœ… Finished: {topic} ({len(collected)} articles)")
    return collected
# --- MAIN ORCHESTRATOR ---
async def start_async_search(topics: list, geo_code: str, days_back: int, max_articles: int, api_key: str,
                             country_name: str):
    """Run a concurrent Serper news search over all topics and deduplicate.

    Fans out one fetch_topic task per topic (bounded by MAX_CONCURRENCY via a
    shared semaphore), merges all results, drops duplicate links, dumps the
    final list to unique_articles.json for debugging, and returns it.
    """
    started_at = time.time()

    # 1. Concurrency limit + TLS verified against certifi's CA bundle.
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=ssl_context)
    # connector = aiohttp.TCPConnector(ssl=False) # Ignore SSL errors if necessary

    # 2. One task per topic, all awaited together.
    async with aiohttp.ClientSession(connector=connector) as session:
        per_topic = await asyncio.gather(*(
            fetch_topic(session, t, sem, geo_code, days_back, max_articles, api_key, country_name)
            for t in topics
        ))

    # 3. Deduplicate: key by article URL so each link appears exactly once.
    seen_by_link = {}
    for batch in per_topic:
        for article in batch:
            link = article.get('link')
            if link and link not in seen_by_link:
                seen_by_link[link] = article
    deduped = list(seen_by_link.values())

    # 4. Debug aid: persist the merged result set to disk.
    with open("unique_articles.json", "w", encoding="utf-8") as f:
        json.dump(deduped, f, indent=2, ensure_ascii=False)

    print("\n" + "=" * 40)
    print(f"Total Time: {time.time() - started_at:.2f} seconds")
    print(f"Total Unique Articles: {len(deduped)}")
    print("=" * 40)
    return deduped
def search_news(topic_list, geo_code, days_back, max_news, SERPER_API_KEY, country_name):
    """Synchronous entry point: run the async news search and return its articles."""
    found = asyncio.run(start_async_search(
        topics=topic_list,
        geo_code=geo_code,
        days_back=days_back,
        max_articles=max_news,
        api_key=SERPER_API_KEY,
        country_name=country_name
    ))
    # Quick sanity log of what came back.
    print(f"Search_news captured {len(found)} articles.")
    if found:
        print(f"Sample title: {found[0].get('title')}")
    return found
# ************** Company URL part
async def fetch_url_from_serper(session, company_name, api_key):
    """Resolve a company's official website URL via a Serper web search.

    Strategy (first hit wins):
      1. A result whose *domain* contains the normalized company name
         (handles generic names when the official site ranks high).
      2. A URL embedded in a result *snippet* that contains the name
         (the "99 Startups" fix — e.g. "Website: www.99startups.com").
      3. Fallback: the first result whose link is not a blacklisted
         profile/aggregator site.

    Returns "" when nothing suitable is found, the response is non-200,
    or the request raises.
    """
    url = "https://google.serper.dev/search"
    payload = json.dumps({"q": f"{company_name} official website", "num": 1})
    headers = {'X-API-KEY': api_key, 'Content-Type': 'application/json'}

    # Normalize names for comparison (e.g. "99 Startups" -> "99startups").
    def clean(text):
        return re.sub(r'\W+', '', text).lower()

    target_name = clean(company_name)
    # Profile/aggregator domains that are never the official site.
    blacklist = ["wikipedia", "linkedin", "bloomberg", "crunchbase", "facebook", "instagram", "youtube"]
    try:
        async with session.post(url, headers=headers, data=payload) as response:
            if response.status == 200:
                data = await response.json()
                if "organic" not in data:
                    return ""
                results = data["organic"]
                # --- STRATEGY 1: domain contains the company name ---
                for res in results:
                    link = res.get("link", "")
                    domain = urlparse(link).netloc.lower()
                    # Skip blacklisted profile sites.
                    if any(b in domain for b in blacklist):
                        continue
                    # e.g. 'sabre.com' contains 'sabre'.
                    if target_name in clean(domain):
                        return link
                # --- STRATEGY 2: snippet hunting ---
                # Strategy 1 failed; look for URLs hidden inside snippets.
                for res in results:
                    snippet = res.get("snippet", "")
                    hidden_urls = re.findall(r'(?:www\.|https?://)[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', snippet)
                    for hidden in hidden_urls:
                        # A hidden URL matching the name is likely the real one.
                        if target_name in clean(hidden):
                            # Ensure the URL carries a scheme.
                            if not hidden.startswith("http"):
                                return f"https://{hidden}"
                            return hidden
                # --- STRATEGY 3: fallback (best guess) ---
                # No confident match: first non-blacklisted result.
                for res in results:
                    link = res.get("link", "")
                    if not any(b in link for b in blacklist):
                        return link
    except Exception as e:
        print(f"โš ๏ธ Serper error for {company_name}: {e}")
    return ""
async def run_batch_search(company_names, api_key):
    """
    Orchestrator: resolve every company name to a URL in parallel.

    Uses one aiohttp session with a certifi-backed SSL context and returns a
    {company_name: url} mapping (url is "" when a lookup fails).
    """
    # --- SSL FIX: trust certifi's CA bundle instead of the system default ---
    ssl_context = ssl.create_default_context(cafile=certifi.where())
    connector = aiohttp.TCPConnector(ssl=ssl_context)

    # Single shared session; one lookup task per company, gathered at once.
    async with aiohttp.ClientSession(connector=connector) as session:
        lookups = [fetch_url_from_serper(session, name, api_key) for name in company_names]
        resolved = await asyncio.gather(*lookups)

    return dict(zip(company_names, resolved))
def fill_missing_urls(data_list, sheet_name, serper_api_key):
    """
    Fill in 'company_url' for every row marked 'SEARCH_REQUIRED'.

    Order of operations:
      1. Check the spreadsheet cache for known companies.
      2. Search Serper for the cache misses.
      3. Write resolved URLs back into data_list in place.
      4. Append newly found URLs to the cache.

    Returns the (mutated) data_list.
    """
    # A. Rows that still need a URL.
    pending = [i for i, row in enumerate(data_list) if row.get('company_url') == 'SEARCH_REQUIRED']
    if not pending:
        print("โœ… No searches required.")
        return data_list
    print(f"๐Ÿ” Processing {len(pending)} missing URLs...")

    # Deduplicate company names before hitting cache/API.
    names_needed = {data_list[i]['company_name'] for i in pending}

    # B. Try the cache first; degrade gracefully if the sheet is unreachable.
    try:
        sheet = connect_to_sheet(sheet_name)
        cache_dict = load_cache_dict(sheet)
    except Exception as e:
        print(f"โš ๏ธ Cache connection failed, skipping cache: {e}")
        sheet = None
        cache_dict = {}

    # Split into cache hits vs. names that need an API search.
    cached_hits = {}
    to_search = []
    for name in names_needed:
        key = name.lower().strip()
        if key in cache_dict:
            cached_hits[name] = cache_dict[key]
        else:
            to_search.append(name)
    print(f" - Found in Cache: {len(cached_hits)}")
    print(f" - Need API Search: {len(to_search)}")

    # C. Hit the API only for cache misses.
    searched = {}
    if to_search:
        print(f"๐ŸŒ Searching internet for {len(to_search)} companies...")
        searched = asyncio.run(run_batch_search(to_search, serper_api_key))

    # D. Write every resolved URL back into the original rows.
    known = {**cached_hits, **searched}
    for i in pending:
        name = data_list[i]['company_name']
        # Defaults to "" when the search came up empty.
        data_list[i]['company_url'] = known.get(name, "")

    # E. Persist only the freshly searched, successful lookups.
    if sheet and searched:
        # GSheet row shape: [{'Company': 'Name', 'Website': 'URL'}]
        new_rows = [
            {'Company': name, 'Website': url}
            for name, url in searched.items()
            if url  # Only cache if we actually found a URL
        ]
        append_to_cache(sheet, new_rows)
    else:
        print('Nothing appended to cache')
    return data_list