reachify-ai-service / scripts /export_thunderbird_training_data.py
amitbhatt6075's picture
feat: Train and deploy Thunderbird model with real Google Trends data
fd09d9b
import os
import pandas as pd
from datetime import datetime, timedelta
from pytrends.request import TrendReq
import time
import random
# --- CONFIGURATION ---
# Search terms queried on Google Trends, one request per term.
NICHES_TO_TRACK = [
    "fashion",
    "gaming",
    "fitness",
    "skincare",
    "finance",
    "travel",
]

# Length of the history window; converted to days as MONTHS_TO_FETCH * 30.
MONTHS_TO_FETCH = 24

# CSV destination: <directory of this script>/../data/thunderbird_market_trends.csv
OUTPUT_FILE = os.path.join(
    os.path.dirname(__file__),
    os.pardir,
    'data',
    'thunderbird_market_trends.csv',
)
def _fetch_niche_trend(pytrends, niche: str, timeframe: str) -> pd.DataFrame:
    """Request interest-over-time for one niche and normalise its columns.

    Args:
        pytrends: The ``TrendReq`` client used to issue the request.
        niche: Search term to query.
        timeframe: Date range string in the ``"YYYY-MM-DD YYYY-MM-DD"`` form
            built by the caller.

    Returns:
        A frame with the columns ``trend_score`` and ``niche`` (index as
        returned by pytrends), or an empty frame when the response is empty
        or has no column for ``niche``.
    """
    pytrends.build_payload([niche], cat=0, timeframe=timeframe, geo='', gprop='')
    raw = pytrends.interest_over_time()
    if raw.empty or niche not in raw:
        return pd.DataFrame()
    # rename() returns a new frame, so the pytrends result is not mutated.
    scored = raw.rename(columns={niche: 'trend_score'})
    scored['niche'] = niche
    return scored[['trend_score', 'niche']]


def get_google_trends_data() -> pd.DataFrame:
    """Fetch Google Trends interest for every tracked niche, averaged per month.

    One request is issued per entry in ``NICHES_TO_TRACK`` for the window
    covering the last ``MONTHS_TO_FETCH * 30`` days, with a random 5-12 second
    pause after each successful request. A request that fails with an error
    mentioning "429" triggers a 2-minute cooldown and is then retried once;
    any other error skips the niche.

    Returns:
        A frame with one row per (month, niche) and the columns ``month``
        (monthly period), ``niche``, ``trend_score`` (mean score within the
        month) and ``successful_campaigns``. An empty frame is returned when
        no niche produced any data.

    Note:
        ``successful_campaigns`` is NOT fetched from anywhere: it is
        ``trend_score`` multiplied by a random factor drawn from
        ``[0.5, 1.5]``, so it is synthetic and differs between runs.
    """
    print("\nπŸš€ Fetching REAL historical market interest from Google Trends (Slow & Safe Mode)...")

    # Extra retries/backoff inside the client for better resilience.
    pytrends = TrendReq(hl='en-US', tz=360, retries=5, backoff_factor=1)

    end_date = datetime.now()
    start_date = end_date - timedelta(days=MONTHS_TO_FETCH * 30)
    timeframe = f"{start_date.strftime('%Y-%m-%d')} {end_date.strftime('%Y-%m-%d')}"

    # One initial attempt plus one retry after a rate-limit cooldown.
    max_attempts = 2

    # Collect per-niche frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    niche_frames = []
    for niche in NICHES_TO_TRACK:
        print(f" - Fetching trend data for '{niche}'...")
        for _ in range(max_attempts):
            try:
                frame = _fetch_niche_trend(pytrends, niche, timeframe)
            except Exception as e:  # best-effort export: report and move on
                if "429" not in str(e):
                    print(f" - ⚠️ An error occurred for '{niche}': {e}")
                    break
                print(f" - πŸ›‘ Hit rate limit hard for '{niche}'. Taking a long 2-minute break...")
                # Always cool down after a 429, even on the last attempt, so
                # the next niche does not immediately hit the same limit.
                time.sleep(120)
                continue

            if not frame.empty:
                niche_frames.append(frame)

            # Long, randomised pause between requests to stay under the
            # rate limit.
            sleep_time = random.uniform(5, 12)
            print(f" - 😴 Sleeping for {sleep_time:.2f} seconds...")
            time.sleep(sleep_time)
            break

    if not niche_frames:
        print("❌ CRITICAL: Could not fetch any data from Google Trends.")
        return pd.DataFrame()

    all_trends_df = pd.concat(niche_frames)
    # Assumes pytrends returns a datetime index (to_period requires one).
    all_trends_df['month'] = all_trends_df.index.to_period('M')
    monthly_trends = all_trends_df.groupby(['month', 'niche'])['trend_score'].mean().reset_index()
    # Synthetic target: trend score scaled by random noise (see docstring).
    monthly_trends['successful_campaigns'] = monthly_trends['trend_score'].apply(
        lambda score: score * random.uniform(0.5, 1.5)
    )
    print("βœ… Successfully fetched and processed Google Trends data.")
    return monthly_trends
def main() -> None:
    """Fetch the monthly trends dataset and write it to ``OUTPUT_FILE`` as CSV.

    Returns without writing anything when ``get_google_trends_data`` yields
    an empty frame.
    """
    print("--- Starting Project Thunderbird REAL Data Export ---")
    trends_df = get_google_trends_data()
    if trends_df.empty:
        return

    # Store the month as plain text in the CSV rather than as a Period object.
    trends_df['month'] = trends_df['month'].astype(str)

    # to_csv does not create missing parent directories, so make sure the
    # target directory exists before writing.
    output_dir = os.path.dirname(OUTPUT_FILE)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    trends_df.to_csv(OUTPUT_FILE, index=False)
    print(f"\nβœ… Success! Real training data saved to: {OUTPUT_FILE}")


# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    main()