Spaces:
Running
Running
File size: 4,796 Bytes
"""
Daily Data Refresh — BigQuery → SQLite → HF Space

Schedule this notebook in Google Colab:
Edit → Triggers → Add trigger → Daily

What it does:
1. Authenticates with Google Cloud
2. Pulls Team Mapping and Client data from BigQuery
3. Parses and writes to a SQLite database
4. Uploads the DB to Hugging Face Space persistent storage

Environment variables needed:
HF_TOKEN — Hugging Face write token (set in Colab secrets)
"""
# ── Config ──────────────────────────────────────────────────
HF_REPO_ID = "your-username/producer-intake"  # ← Change this: target HF Space repo
BQ_PROJECT = "your-gcp-project"  # ← Change this: GCP project used for billing/queries
# Raw-extract queries; both tables are reshaped by the parse_* helpers below.
TEAM_MAPPING_QUERY = "SELECT * FROM `dataset.team_mapping`"
CLIENT_EXTRACT_QUERY = "SELECT * FROM `dataset.client_extract`"
# Local scratch path for the rebuilt SQLite file before upload.
DB_PATH = "/content/app_data.db"
# ── 1. Install dependencies ─────────────────────────────────
import subprocess
import sys

# Invoke pip via the current interpreter (`python -m pip`) so packages are
# installed into the environment this script actually runs in, and use
# check=True so a failed install raises instead of being silently ignored
# (capture_output alone would swallow the error text).
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "google-cloud-bigquery", "huggingface_hub", "pandas"],
    check=True,
    capture_output=True,
)
# ── 2. Authenticate ─────────────────────────────────────────
from google.colab import auth as colab_auth
# Interactive OAuth flow; grants this Colab runtime access to Google Cloud.
colab_auth.authenticate_user()
from google.cloud import bigquery
import pandas as pd
import sqlite3
import os
# BigQuery client bound to the project configured in the Config section.
client = bigquery.Client(project=BQ_PROJECT)
# ── 3. Pull data from BigQuery ───────────────────────────────
# Both extracts are materialized fully into pandas DataFrames; they are
# small reference tables, so an in-memory pull is fine here.
print("📥 Pulling Team Mapping...")
df_team = client.query(TEAM_MAPPING_QUERY).to_dataframe()
print(f"   ✓ {len(df_team)} rows")
print("📥 Pulling Client Extract...")
df_clients = client.query(CLIENT_EXTRACT_QUERY).to_dataframe()
print(f"   ✓ {len(df_clients)} rows")
# ββ 4. Parse Team Mapping into producers table βββββββββββββββ
def parse_producers(df):
"""Convert raw team mapping into the producers table format."""
records = []
for _, row in df.iterrows():
code = str(row.iloc[0]).strip()
name = str(row.iloc[1]).strip()
# Skip rules (same as notebook)
if not code or not name:
continue
if code.upper() == "PRODUCER CODE" or name.upper() == "PRODUCER NAME":
continue
if name == "House Account":
continue
if len(code) < 2 or not code[-1].isdigit():
continue
prefix = code[:-1]
suffix = int(code[-1])
records.append({
"code": code,
"prefix": prefix,
"suffix": suffix,
"name": name,
"is_active": 1,
})
return pd.DataFrame(records)
def parse_clients(df):
    """Convert raw client extract into the clients table format.

    The first row is treated as header/metadata and skipped.  Each
    remaining row with a non-blank code and name (first two columns)
    becomes a record with columns: lookup_code, name, is_active.
    """
    # Comprehension over the body rows (iloc[1:] drops the header row).
    records = [
        {"lookup_code": code, "name": name, "is_active": 1}
        for code, name in (
            (str(r.iloc[0]).strip(), str(r.iloc[1]).strip())
            for _, r in df.iloc[1:].iterrows()
        )
        if code and name
    ]
    return pd.DataFrame(records)
# Run both parsers over the raw BigQuery extracts.
df_producers = parse_producers(df_team)
df_clients_parsed = parse_clients(df_clients)
# NOTE(review): the two f-strings below were line-broken mid-literal in the
# scraped source (a syntax error); rejoined into single-line prints.
print(f"\n✅ Parsed {len(df_producers)} producer entries")
print(f"✅ Parsed {len(df_clients_parsed)} client entries")
# ── 5. Write to SQLite ──────────────────────────────────────
# Start from a fresh file so stale tables never survive a refresh.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)

conn = sqlite3.connect(DB_PATH)
try:
    # We only replace the DATA tables (producers, clients).
    # The CONFIG tables (agreements, constants, etc.) live in app.db
    # and are managed through the admin panel.
    df_producers.to_sql("producers", conn, if_exists="replace", index=False)
    df_clients_parsed.to_sql("clients", conn, if_exists="replace", index=False)
finally:
    # Release the handle even if a write fails mid-way (the original
    # leaked the connection on error).
    conn.close()

size_kb = os.path.getsize(DB_PATH) / 1024
print(f"\n💾 SQLite DB: {size_kb:.0f} KB")
# ── 6. Upload to HF Space ───────────────────────────────────
from huggingface_hub import HfApi
from google.colab import userdata

# Token comes from Colab's secret store — never hard-coded in the notebook.
hf_token = userdata.get("HF_TOKEN")  # Stored in Colab secrets
api = HfApi(token=hf_token)

print("📤 Uploading to HF Space...")
# Pushing a file into a Space repo triggers a rebuild/restart, so the app
# picks up the refreshed database.
api.upload_file(
    path_or_fileobj=DB_PATH,
    path_in_repo="data/app_data.db",
    repo_id=HF_REPO_ID,
    repo_type="space",
)

# NOTE(review): the final status print was line-broken mid-literal in the
# scraped source (a syntax error); rejoined into one line.
print("✅ Daily refresh complete!")
print(f"   Producers: {len(df_producers)}")
print(f"   Clients: {len(df_clients_parsed)}")
|