"""
run_web_search.py

Module for running web searches and saving results.
"""
|
|
|
|
|
import os
import re
from datetime import datetime

import pandas as pd
|
|
|
|
|
# Substrings that mark a claim as crime-related (matched case-insensitively).
_CRIME_TERMS = ("polis", "jenayah", "kes", "rogol", "sumbang mahram")

# Substrings that mark a claim as ammunition-related.  "tan" is handled
# separately as a whole word because it is a substring of "kelantan".
_AMMUNITION_TERMS = ("kelongsong", "peluru", "senjata")

_AMMUNITION_QUERIES = (
    "50 tan kelongsong peluru ditemui",
    "kilang haram proses kelongsong peluru",
    "penemuan kelongsong peluru di kilang",
    "kelongsong peluru musuh negara",
)

_SEXUAL_CRIME_QUERIES = (
    "statistik jenayah seksual di kelantan",
    "kes rogol dan sumbang mahram di kelantan meningkat",
    "pdrm kelantan lapor kes rogol",
)

# Columns the CSV export reads from every search result.
_RESULT_COLUMNS = ("title", "snippet", "link", "source")


def _targeted_queries(full_claim):
    """Return the hard-coded follow-up queries for a Kelantan crime claim, else []."""
    claim = full_claim.lower()
    if "kelantan" not in claim:
        return []
    if not any(term in claim for term in _CRIME_TERMS):
        return []

    # A plain substring test for "tan" always succeeds here (the claim contains
    # "kelantan"), which made the sexual-crime queries unreachable.  Match it
    # as a standalone word instead.
    ammunition_related = (
        any(term in claim for term in _AMMUNITION_TERMS)
        or re.search(r"\btan\b", claim) is not None
    )
    return list(_AMMUNITION_QUERIES if ammunition_related else _SEXUAL_CRIME_QUERIES)


def _collect_results(search_fn, engine, query, limit, sink, label, detail=None):
    """Run one engine query, report the outcome, and append any hits to ``sink``."""
    detail = label if detail is None else detail
    print(f"Searching with {engine} ({detail})...")
    hits = search_fn(query, num_results=limit)
    if hits:
        print(f"Found {len(hits)} results from {engine} ({label})")
        sink.extend(hits)
    else:
        print(f"No results from {engine} ({detail})")


def _build_results_frame(results):
    """Build the export DataFrame, tolerating results that lack some keys."""
    frame = pd.DataFrame(results)
    for column in _RESULT_COLUMNS:
        # Not every result carries every key; default to an empty string so
        # the derived columns below never raise KeyError or turn into NaN.
        if column not in frame.columns:
            frame[column] = ""
        frame[column] = frame[column].fillna("")

    frame['platform'] = 'web'
    frame['username'] = frame['source']
    frame['post_text'] = frame['snippet']
    frame['post_url'] = frame['link']
    frame['likes'] = 0
    frame['shares'] = 0
    frame['comments_count'] = 0
    frame['comment_text'] = ''
    frame['combined_text'] = frame['title'] + ' ' + frame['snippet']
    frame['date'] = datetime.now().strftime('%Y-%m-%d')
    return frame


def run(keywords, output_path, num_results=5, use_serpapi=True, use_serper=True, use_duckduckgo=True, full_claim=None):
    """
    Run web search for keywords and save results to CSV

    When ``full_claim`` is given it is searched verbatim; claims that mention
    Kelantan together with a crime term additionally trigger a fixed set of
    targeted queries.  Without ``full_claim`` the keywords are OR-ed into a
    single query (multi-word keywords are quoted).  Results from all engines
    are de-duplicated by their ``link`` value (results without a link are
    dropped) and written to ``output_path``.

    Args:
        keywords (list): List of keywords to search for
        output_path (str): Path to save results
        num_results (int): Number of results per keyword
        use_serpapi (bool): Whether to use SerpApi
        use_serper (bool): Whether to use Serper.dev
        use_duckduckgo (bool): Whether to use DuckDuckGo
        full_claim (str): The full claim text to use as a search query

    Returns:
        int: Number of results saved (0 if the ``web_search`` module cannot
        be imported or nothing usable was found)
    """
    try:
        from web_search import search_serpapi, search_serper, search_duckduckgo, get_google_trends
    except ImportError:
        print("Error importing web_search module. Make sure it exists and is accessible.")
        return 0

    terms = list(keywords) if keywords else []

    # Enabled API engines, in the order they are queried.
    engines = []
    if use_serpapi:
        engines.append(("SerpApi", search_serpapi))
    if use_serper:
        engines.append(("Serper.dev", search_serper))

    all_results = []

    if full_claim:
        print(f"Using full claim as direct search query: '{full_claim}'")
        for engine, search_fn in engines:
            _collect_results(search_fn, engine, full_claim, num_results, all_results, "exact claim")

        # Targeted queries get half the budget each, but never zero results.
        targeted_limit = max(1, num_results // 2)
        for query in _targeted_queries(full_claim):
            print(f"Using targeted query: '{query}'")
            for engine, search_fn in engines:
                _collect_results(
                    search_fn, engine, query, targeted_limit, all_results,
                    "targeted query", f"targeted query: {query}",
                )
        fallback_query = full_claim
    else:
        # Quote multi-word keywords so they are searched as phrases.
        keyword_query = " OR ".join(f'"{kw}"' if " " in kw else kw for kw in terms)
        if keyword_query:
            print(f"Searching with keyword query: {keyword_query}")
            for engine, search_fn in engines:
                _collect_results(search_fn, engine, keyword_query, num_results, all_results, "keywords")
        fallback_query = keyword_query

    # Skip DuckDuckGo when there is nothing to search for (no claim, no keywords).
    if use_duckduckgo and fallback_query:
        print(f"Searching with DuckDuckGo using: {fallback_query}")
        duckduckgo_results = search_duckduckgo(fallback_query, num_results=num_results)
        if duckduckgo_results:
            print(f"Found {len(duckduckgo_results)} results from DuckDuckGo")
            all_results.extend(duckduckgo_results)
        else:
            print("No results from DuckDuckGo")

    # The return value is not used here; the call is kept in case the helper
    # has side effects of its own (NOTE(review): confirm against web_search).
    get_google_trends(terms)

    if not all_results:
        print("No web search results found")
        return 0

    # Keep the first occurrence of each URL; results without a link are dropped.
    unique_results = []
    seen_urls = set()
    for result in all_results:
        url = result.get('link', '')
        if url and url not in seen_urls:
            seen_urls.add(url)
            unique_results.append(result)
    print(f"Removed {len(all_results) - len(unique_results)} duplicate results")

    if not unique_results:
        print("No web search results found")
        return 0

    df = _build_results_frame(unique_results)

    # os.makedirs('') raises, so only create a directory when the path has one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output_path, index=False)
    print(f"Saved {len(df)} web search results to {output_path}")
    return len(df)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Command-line words are used both as individual keywords and, joined
    # with spaces, as the full claim; with no arguments a sample claim is used.
    if len(sys.argv) > 1:
        keywords = sys.argv[1:]
        full_claim = " ".join(sys.argv[1:])
    else:
        keywords = ["polis", "kelantan", "sumbang mahram", "rogol"]
        full_claim = "Polis Kelantan bimbang kes sumbang mahram dan rogol di Kelantan"

    output_path = "output/web_search_results.csv"

    # The search entry point defined in this module is ``run``; the previous
    # call to ``run_web_search`` raised NameError because no such name exists.
    run(keywords, output_path, num_results=10, full_claim=full_claim)
|
|
|