# pif/scripts/daily_refresh.py
# (uploaded via huggingface_hub, commit 996fcf9)
"""
Daily Data Refresh β€” BigQuery β†’ SQLite β†’ HF Space
Schedule this notebook in Google Colab:
Edit β†’ Triggers β†’ Add trigger β†’ Daily
What it does:
1. Authenticates with Google Cloud
2. Pulls Team Mapping and Client data from BigQuery
3. Parses and writes to a SQLite database
4. Uploads the DB to Hugging Face Space persistent storage
Environment variables needed:
HF_TOKEN β€” Hugging Face write token (set in Colab secrets)
"""
# ── Config ───────────────────────────────────────────────────
HF_REPO_ID = "your-username/producer-intake"  # ← Change this
BQ_PROJECT = "your-gcp-project"  # ← Change this
TEAM_MAPPING_QUERY = "SELECT * FROM `dataset.team_mapping`"
CLIENT_EXTRACT_QUERY = "SELECT * FROM `dataset.client_extract`"
DB_PATH = "/content/app_data.db"
# ── 1. Install dependencies ─────────────────────────────────
import subprocess
import sys

# Use the running interpreter's pip so packages land in the active
# environment, and check=True so a failed install aborts the refresh
# instead of being silently swallowed by capture_output.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "google-cloud-bigquery", "huggingface_hub", "pandas"],
    capture_output=True,
    check=True,
)
# ── 2. Authenticate ─────────────────────────────────────────
from google.colab import auth as colab_auth

# Interactive Google sign-in; must happen before any BigQuery call.
colab_auth.authenticate_user()

import os
import sqlite3

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project=BQ_PROJECT)
# ── 3. Pull data from BigQuery ───────────────────────────────
print("📥 Pulling Team Mapping...")
team_job = client.query(TEAM_MAPPING_QUERY)
df_team = team_job.to_dataframe()
print(f" → {len(df_team)} rows")
print("📥 Pulling Client Extract...")
client_job = client.query(CLIENT_EXTRACT_QUERY)
df_clients = client_job.to_dataframe()
print(f" → {len(df_clients)} rows")
# ── 4. Parse Team Mapping into producers table ───────────────
def parse_producers(df):
    """Convert raw team mapping into the producers table format.

    The first two columns of *df* are read as producer code and
    producer name. A row is skipped when either field is blank, when
    it is a repeated header row ("PRODUCER CODE"/"PRODUCER NAME"),
    when the name is "House Account", or when the code is shorter than
    two characters or does not end in a digit. For kept rows the
    trailing digit becomes ``suffix`` and the rest of the code becomes
    ``prefix``; every row is marked active.
    """
    rows = []
    for _, raw in df.iterrows():
        code = str(raw.iloc[0]).strip()
        name = str(raw.iloc[1]).strip()
        # Guard clauses mirror the notebook's skip rules.
        if not (code and name):
            continue
        if code.upper() == "PRODUCER CODE" or name.upper() == "PRODUCER NAME":
            continue
        if name == "House Account" or len(code) < 2 or not code[-1].isdigit():
            continue
        rows.append({
            "code": code,
            "prefix": code[:-1],
            "suffix": int(code[-1]),
            "name": name,
            "is_active": 1,
        })
    return pd.DataFrame(rows)
def parse_clients(df):
    """Convert raw client extract into the clients table format.

    The first row of *df* is treated as a header/metadata row and
    skipped. For remaining rows, column 0 becomes ``lookup_code`` and
    column 1 becomes ``name``; rows with a blank code or name are
    dropped, and every kept row is marked active.
    """
    pairs = (
        (str(row.iloc[0]).strip(), str(row.iloc[1]).strip())
        for _, row in df.iloc[1:].iterrows()  # skip header/metadata row
    )
    rows = [
        {"lookup_code": code, "name": name, "is_active": 1}
        for code, name in pairs
        if code and name
    ]
    return pd.DataFrame(rows)
df_producers = parse_producers(df_team)
df_clients_parsed = parse_clients(df_clients)
print(f"\n✅ Parsed {len(df_producers)} producer entries")
print(f"✅ Parsed {len(df_clients_parsed)} client entries")
# ── 5. Write to SQLite ──────────────────────────────────────
# Start from a fresh file so stale tables never survive a refresh.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)
conn = sqlite3.connect(DB_PATH)
try:
    # We only replace the DATA tables (producers, clients).
    # The CONFIG tables (agreements, constants, etc.) live in app.db
    # and are managed through the admin panel.
    df_producers.to_sql("producers", conn, if_exists="replace", index=False)
    df_clients_parsed.to_sql("clients", conn, if_exists="replace", index=False)
finally:
    # Close even if to_sql raises, so the DB handle is never leaked.
    conn.close()
size_kb = os.path.getsize(DB_PATH) / 1024
print(f"\n💾 SQLite DB: {size_kb:.0f} KB")
# ── 6. Upload to HF Space ───────────────────────────────────
from google.colab import userdata
from huggingface_hub import HfApi

# HF_TOKEN is stored in Colab secrets; it must be a write token.
api = HfApi(token=userdata.get("HF_TOKEN"))
print("📤 Uploading to HF Space...")
api.upload_file(
    path_or_fileobj=DB_PATH,
    path_in_repo="data/app_data.db",
    repo_id=HF_REPO_ID,
    repo_type="space",
)
print("✅ Daily refresh complete!")
print(f" Producers: {len(df_producers)}")
print(f" Clients: {len(df_clients_parsed)}")