# Source: testbed/ai_api/library/websearch.py
# Author: xspinners — commit "initial" (090987a)
"""
run_web_search.py
Module for running web searches and saving results
"""
import pandas as pd
from datetime import datetime
import os
def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True, use_duckduckgo=True, full_claim=None):
    """
    Run web searches for a claim and/or keywords and save merged results to CSV.

    Query strategy:
      1. If ``full_claim`` is given, search it verbatim on SerpApi/Serper.dev;
         for Malay crime-related claims about Kelantan, also run a set of
         hand-tuned targeted queries.
      2. Otherwise, search the quoted claim first (if any) and fall back to an
         OR-joined keyword query (multi-word phrases quoted).
      3. Optionally add DuckDuckGo results and fetch Google Trends data.

    Results are de-duplicated by URL, normalized to the column layout expected
    by the downstream sentiment analyzer, and written to ``output_path``.

    Args:
        keywords (list): List of keywords to search for
        output_path (str): Path to save results
        num_results (int): Number of results per keyword
        use_serpapi (bool): Whether to use SerpApi
        use_serper (bool): Whether to use Serper.dev
        use_duckduckgo (bool): Whether to use DuckDuckGo
        full_claim (str): The full claim text to use as a search query

    Returns:
        int: Number of results saved (0 if the search module is missing or
        no results were found)
    """
    # Import search functions lazily so a missing web_search module degrades
    # to a no-op (return 0) instead of breaking import of this module.
    try:
        from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
    except ImportError:
        print("Error importing web_search module. Make sure it exists and is accessible.")
        return 0
    all_results = []
    # Always use the full claim directly if available
    if full_claim:
        print(f"Using full claim as direct search query: '{full_claim}'")
        # Search using SerpApi with the exact claim
        if use_serpapi:
            print("Searching with SerpApi (exact claim)...")
            serpapi_results = search_serpapi(full_claim, num_results=num_results)
            if serpapi_results:
                print(f"Found {len(serpapi_results)} results from SerpApi (exact claim)")
                all_results.extend(serpapi_results)
            else:
                print("No results from SerpApi (exact claim)")
        # Search using Serper.dev with the exact claim
        if use_serper:
            print("Searching with Serper.dev (exact claim)...")
            serper_results = search_serper(full_claim, num_results=num_results)
            if serper_results:
                print(f"Found {len(serper_results)} results from Serper.dev (exact claim)")
                all_results.extend(serper_results)
            else:
                print("No results from Serper.dev (exact claim)")
        # Hand-tuned extra queries for Malay-language crime claims about
        # Kelantan. Hoist the lowercased claim instead of recomputing it.
        claim_lower = full_claim.lower()
        crime_related = any(term in claim_lower for term in ["polis", "jenayah", "kes", "rogol", "sumbang mahram"])
        kelantan_related = "kelantan" in claim_lower
        if crime_related and kelantan_related:
            # Check if this is about sexual crimes or ammunition
            ammunition_related = any(term in claim_lower for term in ["kelongsong", "peluru", "senjata", "tan"])
            if ammunition_related:
                targeted_queries = [
                    "50 tan kelongsong peluru ditemui",
                    "kilang haram proses kelongsong peluru",
                    "penemuan kelongsong peluru di kilang",
                    "kelongsong peluru musuh negara"
                ]
            else:
                # Default to sexual crime queries
                targeted_queries = [
                    "statistik jenayah seksual di kelantan",
                    "kes rogol dan sumbang mahram di kelantan meningkat",
                    "pdrm kelantan lapor kes rogol"
                ]
            # Use fewer results per targeted query; max(1, ...) guards against
            # requesting 0 results when num_results == 1 (the original bare
            # floor division could ask providers for zero hits).
            per_targeted_query = max(1, num_results // 2)
            for query in targeted_queries:
                print(f"Using targeted query: '{query}'")
                # Search using SerpApi
                if use_serpapi:
                    print(f"Searching with SerpApi (targeted query: {query})...")
                    serpapi_results = search_serpapi(query, num_results=per_targeted_query)
                    if serpapi_results:
                        print(f"Found {len(serpapi_results)} results from SerpApi (targeted query)")
                        all_results.extend(serpapi_results)
                    else:
                        print(f"No results from SerpApi (targeted query: {query})")
                # Search using Serper.dev
                if use_serper:
                    print(f"Searching with Serper.dev (targeted query: {query})...")
                    serper_results = search_serper(query, num_results=per_targeted_query)
                    if serper_results:
                        print(f"Found {len(serper_results)} results from Serper.dev (targeted query)")
                        all_results.extend(serper_results)
                    else:
                        print(f"No results from Serper.dev (targeted query: {query})")
    else:
        # For other claims, use the original approach with keywords
        # 1. Full claim query (if available)
        full_claim_query = f'"{full_claim}"' if full_claim else None
        # 2. Keyword-based query: quote multi-word phrases for exact match,
        # leave single words unquoted for broader results.
        search_terms = []
        for kw in keywords:
            if " " in kw:
                search_terms.append(f'"{kw}"')
            else:
                search_terms.append(kw)
        keyword_query = " OR ".join(search_terms)
        # Search using full claim first (if available)
        if full_claim_query:
            print(f"Searching with full claim: {full_claim_query}")
            # Search using SerpApi
            if use_serpapi:
                print("Searching with SerpApi (full claim)...")
                serpapi_results = search_serpapi(full_claim, num_results=num_results)
                if serpapi_results:
                    print(f"Found {len(serpapi_results)} results from SerpApi (full claim)")
                    all_results.extend(serpapi_results)
                else:
                    print("No results from SerpApi (full claim)")
            # Search using Serper.dev
            if use_serper:
                print("Searching with Serper.dev (full claim)...")
                serper_results = search_serper(full_claim, num_results=num_results)
                if serper_results:
                    print(f"Found {len(serper_results)} results from Serper.dev (full claim)")
                    all_results.extend(serper_results)
                else:
                    print("No results from Serper.dev (full claim)")
        # Search using keyword query as fallback when the claim search came
        # back short.
        if not all_results or len(all_results) < num_results:
            print(f"Searching with keyword query: {keyword_query}")
            # Search using SerpApi
            if use_serpapi:
                print("Searching with SerpApi (keywords)...")
                serpapi_results = search_serpapi(keyword_query, num_results=num_results)
                if serpapi_results:
                    print(f"Found {len(serpapi_results)} results from SerpApi (keywords)")
                    all_results.extend(serpapi_results)
                else:
                    print("No results from SerpApi (keywords)")
            # Search using Serper.dev
            if use_serper:
                print("Searching with Serper.dev (keywords)...")
                serper_results = search_serper(keyword_query, num_results=num_results)
                if serper_results:
                    print(f"Found {len(serper_results)} results from Serper.dev (keywords)")
                    all_results.extend(serper_results)
                else:
                    print("No results from Serper.dev (keywords)")
    # Add DuckDuckGo results (uses the claim when available, otherwise the
    # keyword query built in the else-branch above)
    if use_duckduckgo:
        query_to_use = full_claim if full_claim else keyword_query
        print(f"Searching with DuckDuckGo using: {query_to_use}")
        duckduckgo_results = search_duckduckgo(query_to_use, num_results=num_results)
        if duckduckgo_results:
            print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
            all_results.extend(duckduckgo_results)
        else:
            print("No results from DuckDuckGo")
    # Add Google Trends data
    # NOTE(review): the return value is never used below — presumably called
    # for its side effects (caching/logging); confirm against web_search.
    trends_data = get_google_trends(keywords)
    # Convert to DataFrame
    if all_results:
        # Remove duplicates based on URL (first occurrence wins, order kept)
        unique_results = []
        seen_urls = set()
        for result in all_results:
            url = result.get('link', '')
            if url and url not in seen_urls:
                seen_urls.add(url)
                unique_results.append(result)
        print(f"Removed {len(all_results) - len(unique_results)} duplicate results")
        df = pd.DataFrame(unique_results)
        # Add additional columns to match the format expected by the sentiment
        # analyzer. Assumes every result dict carries 'source', 'snippet',
        # 'link' and 'title' keys — TODO confirm against web_search providers.
        df['platform'] = 'web'
        df['username'] = df['source']
        df['post_text'] = df['snippet']
        df['post_url'] = df['link']
        df['likes'] = 0
        df['shares'] = 0
        df['comments_count'] = 0
        df['comment_text'] = ''
        df['combined_text'] = df['title'] + ' ' + df['snippet']
        df['date'] = datetime.now().strftime('%Y-%m-%d')
        # Create output directory if it doesn't exist. Guard against a bare
        # filename: os.makedirs("") raises FileNotFoundError.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # Save to CSV
        df.to_csv(output_path, index=False)
        print(f"Saved {len(df)} web search results to {output_path}")
        return len(df)
    else:
        print("No web search results found")
        return 0
# Test the module
if __name__ == "__main__":
    import sys
    # Get keywords from command line or use the built-in sample claim
    if len(sys.argv) > 1:
        keywords = sys.argv[1:]
        full_claim = " ".join(sys.argv[1:])
    else:
        keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
        full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"
    # Run web search (bug fix: this module defines `run`, not
    # `run_web_search` — the original call raised NameError)
    output_path = "output/web_search_results.csv"
    run(keywords, output_path, num_results=10, full_claim=full_claim)