Spaces:

amitbhatt6075
/

reachify-ai-service

Running

File size: 3,481 Bytes

0914e96

# File: ai-service/scripts/export_performance_data.py

import pandas as pd
from sqlalchemy import create_engine, text
import os
from dotenv import load_dotenv
import sys

# Root directory ko path mein add karein taaki .env file mil sake
ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(ROOT_DIR)
load_dotenv(dotenv_path=os.path.join(ROOT_DIR, '.env'))

def export_performance_data():
    """
    Connects to the Supabase database, fetches data from approved submissions,
    and saves it to a CSV file for training the performance prediction model.
    """
    print("--- Starting Performance Data Export Process ---")
    db_url = os.getenv("DATABASE_URL")
    if not db_url:
        print("🔴 ERROR: DATABASE_URL not found. Please check your .env file in the ai-service root.")
        return

    # Yeh SQL query hamare AI model ke liye 'khana' (training data) nikalegi.
    sql_query = """
    SELECT
        cs.likes,
        cs.comments,
        cs.caption,
        ip.follower_count,
        CASE
            WHEN c.title ILIKE '%tech%' OR c.description ILIKE '%tech%' THEN 'Tech'
            WHEN c.title ILIKE '%fashion%' OR c.description ILIKE '%fashion%' THEN 'Fashion'
            WHEN c.title ILIKE '%food%' OR c.description ILIKE '%food%' THEN 'Food'
            WHEN c.title ILIKE '%gaming%' OR c.description ILIKE '%gaming%' THEN 'Gaming'
            ELSE 'General'
        END AS campaign_niche,
        CASE
            WHEN c.content_guidelines ILIKE '%reel%' THEN 'Reel'
            WHEN c.content_guidelines ILIKE '%story%' THEN 'Story'
            ELSE 'Post'
        END AS content_format
    FROM
        public.campaign_submissions cs
    JOIN
        public.campaigns c ON cs.campaign_id = c.id
    JOIN
        public.influencer_profiles ip ON cs.influencer_id = ip.profile_id
    WHERE
        cs.status = 'approved'      -- Sirf approved submissions se seekhein
        AND cs.likes IS NOT NULL    -- Jin par likes ka data ho
        AND cs.comments IS NOT NULL -- Jin par comments ka data ho
        AND cs.caption IS NOT NULL  -- Jin par caption ho
        AND ip.follower_count > 0;  -- Jin influencers ke followers pata ho
    """
    try:
        print("Connecting to Supabase to fetch performance data...")
        engine = create_engine(db_url)
        
        with engine.connect() as connection:
            df = pd.DataFrame(connection.execute(text(sql_query)))

        print(f"✅ Fetched {len(df)} approved submission records from the database.")
    
    except Exception as e:
        print(f"🔴 ERROR fetching data: {e}")
        return

    if df.empty:
        print("⚠️ No valid training data found. A blank CSV will be created.")
    else:
        # Feature Engineering: Caption ki lambai (length) ko ek feature banayein
        df['caption_length'] = df['caption'].str.len()
    
    # Sirf zaroori columns ko CSV me save karein
    columns_to_save = ['likes', 'comments', 'follower_count', 'caption_length', 'campaign_niche', 'content_format']
    # Agar koi column na ho (khaali df ke case mein), toh use ignore karein
    df_to_save = df.reindex(columns=columns_to_save).fillna(0)

    # Data ko /data folder mein save karein
    output_path = os.path.join(ROOT_DIR, 'data', 'performance_training_data.csv')
    df_to_save.to_csv(output_path, index=False)
    print(f"🎉 Success! Performance data saved to {output_path}")

if __name__ == '__main__':
    export_performance_data()