# youtube_trends/data/dataset.py
# Commit eec3758 ("created project") by molehh
import os
from supabase import create_client, Client
import pandas as pd
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
# Upload the rows of merged_yt_data.csv to the Supabase "youtube" table in
# batches, then print the row count reported by Supabase.

# Load variables from a local .env file (if present) into the environment.
# Without this call the imported load_dotenv is never used, and os.getenv()
# only sees variables that were already exported in the shell.
load_dotenv()

# Supabase project details, read from the environment
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_URL or not SUPABASE_KEY:
    # Fail early with a clear message instead of passing None to the client.
    raise RuntimeError(
        "SUPABASE_URL and SUPABASE_KEY must be set in the environment or in a .env file"
    )

# Initialize client
supabase: Client = create_client(SUPABASE_URL, SUPABASE_KEY)

# Load the merged CSV; malformed lines are skipped rather than raising
df = pd.read_csv("merged_yt_data.csv", on_bad_lines='skip')

# Data cleaning
df = df.replace([np.inf, -np.inf], np.nan)  # Replace infinite values with NaN
# NOTE(review): this also puts "" into numeric columns; if the "youtube" table
# has numeric/timestamp columns, NULL may be the intended value — confirm
# against the table schema.
df = df.fillna("")  # Replace NaN with an empty string (JSON-safe)
df = df.drop_duplicates(subset=["video_id"])  # Keep one row per video_id

# Convert DataFrame to a list of dictionaries (one dict per row)
data = df.to_dict(orient="records")
print(f"Total rows to insert: {len(data)}")

# Insert in batches with a progress bar
BATCH_SIZE = 1000
for i in tqdm(range(0, len(data), BATCH_SIZE), desc="Uploading to Supabase", unit="batch"):
    batch = data[i : i + BATCH_SIZE]
    try:
        supabase.table("youtube").insert(batch).execute()
    except Exception as e:
        # Best-effort upload: report the failed batch and keep going.
        # tqdm.write prints without corrupting the active progress bar.
        tqdm.write(f"Error inserting batch {i}-{i+len(batch)}: {e}")

# Get row count from Supabase. count="exact" reports the total number of
# matching rows regardless of the limit, so limit(1) avoids downloading a
# full page of rows just to read the count.
responses = supabase.table("youtube").select("*", count="exact").limit(1).execute()
print(f"Total rows in Supabase: {responses.count}")