"""
run_web_search.py
Module for running web searches and saving results
"""

import os
import re
from datetime import datetime

import pandas as pd

# Substrings (matched case-insensitively) that mark a claim as crime-related.
_CRIME_TERMS = ("polis", "jenayah", "kes", "rogol", "sumbang mahram")

# Substrings that mark a crime claim as being about ammunition.
_AMMUNITION_TERMS = ("kelongsong", "peluru", "senjata")

# "tan" has to be matched as a whole word: as a plain substring it is contained
# in "kelantan", which would classify every Kelantan claim as ammunition-related.
_TAN_WORD = re.compile(r"\btan\b")

_AMMUNITION_QUERIES = (
    "50 tan kelongsong peluru ditemui",
    "kilang haram proses kelongsong peluru",
    "penemuan kelongsong peluru di kilang",
    "kelongsong peluru musuh negara",
)

_SEXUAL_CRIME_QUERIES = (
    "statistik jenayah seksual di kelantan",
    "kes rogol dan sumbang mahram di kelantan meningkat",
    "pdrm kelantan lapor kes rogol",
)

# Result fields the derived CSV columns are built from.
_RESULT_COLUMNS = ("title", "snippet", "link", "source")


def _build_keyword_query(keywords):
    """Join keywords with OR, quoting multi-word phrases so they match as phrases."""
    # Single words stay unquoted to get broader results.
    return " OR ".join(f'"{kw}"' if " " in kw else kw for kw in keywords)


def _targeted_queries(full_claim):
    """Return the extra queries for a Kelantan crime claim, or () for any other claim."""
    claim = full_claim.lower()
    if "kelantan" not in claim:
        return ()
    if not any(term in claim for term in _CRIME_TERMS):
        return ()

    ammunition_related = (
        any(term in claim for term in _AMMUNITION_TERMS)
        or _TAN_WORD.search(claim) is not None
    )
    # Anything that is not about ammunition defaults to the sexual crime queries.
    return _AMMUNITION_QUERIES if ammunition_related else _SEXUAL_CRIME_QUERIES


def _search_providers(providers, query, num_results, label, found_label=None):
    """Run ``query`` on each (name, search_function) pair and return the combined results.

    ``label`` is shown in the "Searching"/"No results" messages and
    ``found_label`` (defaults to ``label``) in the "Found" message.
    """
    if found_label is None:
        found_label = label

    collected = []
    for name, search in providers:
        print(f"Searching with {name} ({label})...")
        results = search(query, num_results=num_results)
        if results:
            print(f"Found {len(results)} results from {name} ({found_label})")
            collected.extend(results)
        else:
            print(f"No results from {name} ({label})")
    return collected


def _dedupe_by_link(results):
    """Keep the first result for each non-empty 'link'; results without a link are dropped."""
    seen_urls = set()
    unique_results = []
    for result in results:
        url = result.get('link', '')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    return unique_results


def _save_results(results, output_path):
    """Write the results to ``output_path`` as CSV and return the number of rows written."""
    df = pd.DataFrame(results)

    # A provider may omit a field; make sure every source column exists and holds
    # text so the derived columns below neither raise KeyError nor become NaN.
    for column in _RESULT_COLUMNS:
        if column in df.columns:
            df[column] = df[column].fillna('')
        else:
            df[column] = ''

    # Add additional columns to match the format expected by the sentiment analyzer
    df['platform'] = 'web'
    df['username'] = df['source']
    df['post_text'] = df['snippet']
    df['post_url'] = df['link']
    df['likes'] = 0
    df['shares'] = 0
    df['comments_count'] = 0
    df['comment_text'] = ''
    df['combined_text'] = df['title'] + ' ' + df['snippet']
    df['date'] = datetime.now().strftime('%Y-%m-%d')

    # dirname() is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    return len(df)


def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True,
        use_duckduckgo=True, full_claim=None):
    """
    Run web search for keywords and save results to CSV

    When a full claim is given it is searched verbatim first. Claims that mention
    Kelantan together with a crime term are additionally searched with a fixed set
    of targeted queries; every other case falls back to an OR-query built from the
    keywords whenever fewer than ``num_results`` results have been collected.
    Results are de-duplicated by their 'link' before being written.

    Args:
        keywords (list): List of keywords to search for
        output_path (str): Path to save results
        num_results (int): Number of results per keyword
        use_serpapi (bool): Whether to use SerpApi
        use_serper (bool): Whether to use Serper.dev
        use_duckduckgo (bool): Whether to use DuckDuckGo
        full_claim (str): The full claim text to use as a search query

    Returns:
        int: Number of results saved (0 if the web_search module cannot be
        imported or nothing was found)
    """
    # Imported lazily so a missing web_search module is reported, not fatal.
    try:
        from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
    except ImportError:
        print("Error importing web_search module. Make sure it exists and is accessible.")
        return 0

    providers = []
    if use_serpapi:
        providers.append(("SerpApi", search_serpapi))
    if use_serper:
        providers.append(("Serper.dev", search_serper))

    # Always built up front: the DuckDuckGo step needs it when there is no claim.
    keyword_query = _build_keyword_query(keywords or [])

    all_results = []
    targeted_queries = ()

    if full_claim:
        print(f"Using full claim as direct search query: '{full_claim}'")
        all_results.extend(
            _search_providers(providers, full_claim, num_results, "exact claim")
        )

        targeted_queries = _targeted_queries(full_claim)
        # Fewer results per targeted query, but never zero.
        per_query = max(1, num_results // 2)
        for query in targeted_queries:
            print(f"Using targeted query: '{query}'")
            all_results.extend(
                _search_providers(
                    providers, query, per_query,
                    f"targeted query: {query}", "targeted query",
                )
            )

    # Keyword fallback for everything that did not get targeted queries.
    needs_more = not all_results or len(all_results) < num_results
    if not targeted_queries and keyword_query and needs_more:
        print(f"Searching with keyword query: {keyword_query}")
        all_results.extend(
            _search_providers(providers, keyword_query, num_results, "keywords")
        )

    # Add DuckDuckGo results
    if use_duckduckgo:
        query_to_use = full_claim if full_claim else keyword_query
        if query_to_use:
            print(f"Searching with DuckDuckGo using: {query_to_use}")
            duckduckgo_results = search_duckduckgo(query_to_use, num_results=num_results)
            if duckduckgo_results:
                print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
                all_results.extend(duckduckgo_results)
            else:
                print("No results from DuckDuckGo")

    # The return value is not used here; the call itself is kept as in the original flow.
    get_google_trends(keywords)

    if not all_results:
        print("No web search results found")
        return 0

    unique_results = _dedupe_by_link(all_results)
    print(f"Removed {len(all_results) - len(unique_results)} duplicate results")
    if not unique_results:
        # Every result lacked a link, so there is nothing to write.
        print("No web search results found")
        return 0

    saved = _save_results(unique_results, output_path)
    print(f"Saved {saved} web search results to {output_path}")
    return saved


def _main():
    """Run a search from the command line, or with a built-in sample claim."""
    import sys

    # Get keywords from command line or use default
    if len(sys.argv) > 1:
        keywords = sys.argv[1:]
        full_claim = " ".join(sys.argv[1:])
    else:
        keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
        full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"

    output_path = "output/web_search_results.csv"
    run(keywords, output_path, num_results=10, full_claim=full_claim)


# Test the module
if __name__ == "__main__":
    _main()