# pif/scripts/daily_refresh.py
# (uploaded via huggingface_hub, commit 996fcf9)
"""
Daily Data Refresh β€” BigQuery β†’ SQLite β†’ HF Space
Schedule this notebook in Google Colab:
Edit β†’ Triggers β†’ Add trigger β†’ Daily
What it does:
1. Authenticates with Google Cloud
2. Pulls Team Mapping and Client data from BigQuery
3. Parses and writes to a SQLite database
4. Uploads the DB to Hugging Face Space persistent storage
Environment variables needed:
HF_TOKEN β€” Hugging Face write token (set in Colab secrets)
"""
# ── Config ───────────────────────────────────────────────────
HF_REPO_ID = "your-username/producer-intake"  # ← Change this
BQ_PROJECT = "your-gcp-project"  # ← Change this
TEAM_MAPPING_QUERY = "SELECT * FROM `dataset.team_mapping`"
CLIENT_EXTRACT_QUERY = "SELECT * FROM `dataset.client_extract`"
DB_PATH = "/content/app_data.db"
# ── 1. Install dependencies ─────────────────────────────────
import subprocess
import sys

# Use the running interpreter's pip so packages land in the active
# environment, and check=True so a failed install aborts the refresh
# instead of being silently swallowed by capture_output.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "google-cloud-bigquery", "huggingface_hub", "pandas"],
    capture_output=True,
    check=True,
)
# ── 2. Authenticate ─────────────────────────────────────────
from google.colab import auth as colab_auth

# Interactive Google sign-in; must happen before any BigQuery call.
colab_auth.authenticate_user()

import os
import sqlite3

import pandas as pd
from google.cloud import bigquery

client = bigquery.Client(project=BQ_PROJECT)
# ── 3. Pull data from BigQuery ───────────────────────────────
print("📥 Pulling Team Mapping...")
team_job = client.query(TEAM_MAPPING_QUERY)
df_team = team_job.to_dataframe()
print(f" → {len(df_team)} rows")
print("📥 Pulling Client Extract...")
client_job = client.query(CLIENT_EXTRACT_QUERY)
df_clients = client_job.to_dataframe()
print(f" → {len(df_clients)} rows")
# ── 4. Parse Team Mapping into producers table ───────────────
def parse_producers(df):
    """Convert raw team mapping into the producers table format.

    The first two columns of *df* are read as producer code and
    producer name. A row is skipped when either field is blank, when
    it is a repeated header row ("PRODUCER CODE"/"PRODUCER NAME"),
    when the name is "House Account", or when the code is shorter than
    two characters or does not end in a digit. For kept rows the
    trailing digit becomes ``suffix`` and the rest of the code becomes
    ``prefix``; every row is marked active.
    """
    rows = []
    for _, raw in df.iterrows():
        code = str(raw.iloc[0]).strip()
        name = str(raw.iloc[1]).strip()
        # Guard clauses mirror the notebook's skip rules.
        if not (code and name):
            continue
        if code.upper() == "PRODUCER CODE" or name.upper() == "PRODUCER NAME":
            continue
        if name == "House Account" or len(code) < 2 or not code[-1].isdigit():
            continue
        rows.append({
            "code": code,
            "prefix": code[:-1],
            "suffix": int(code[-1]),
            "name": name,
            "is_active": 1,
        })
    return pd.DataFrame(rows)
def parse_clients(df):
    """Convert raw client extract into the clients table format.

    The first row of *df* is treated as a header/metadata row and
    skipped. For remaining rows, column 0 becomes ``lookup_code`` and
    column 1 becomes ``name``; rows with a blank code or name are
    dropped, and every kept row is marked active.
    """
    pairs = (
        (str(row.iloc[0]).strip(), str(row.iloc[1]).strip())
        for _, row in df.iloc[1:].iterrows()  # skip header/metadata row
    )
    rows = [
        {"lookup_code": code, "name": name, "is_active": 1}
        for code, name in pairs
        if code and name
    ]
    return pd.DataFrame(rows)
df_producers = parse_producers(df_team)
df_clients_parsed = parse_clients(df_clients)
print(f"\n✅ Parsed {len(df_producers)} producer entries")
print(f"✅ Parsed {len(df_clients_parsed)} client entries")
# ── 5. Write to SQLite ──────────────────────────────────────
# Start from a fresh file so stale tables never survive a refresh.
if os.path.exists(DB_PATH):
    os.remove(DB_PATH)
conn = sqlite3.connect(DB_PATH)
try:
    # We only replace the DATA tables (producers, clients).
    # The CONFIG tables (agreements, constants, etc.) live in app.db
    # and are managed through the admin panel.
    df_producers.to_sql("producers", conn, if_exists="replace", index=False)
    df_clients_parsed.to_sql("clients", conn, if_exists="replace", index=False)
finally:
    # Close even if to_sql raises, so the DB handle is never leaked.
    conn.close()
size_kb = os.path.getsize(DB_PATH) / 1024
print(f"\n💾 SQLite DB: {size_kb:.0f} KB")
# ── 6. Upload to HF Space ───────────────────────────────────
from google.colab import userdata
from huggingface_hub import HfApi

# HF_TOKEN is stored in Colab secrets; it must be a write token.
api = HfApi(token=userdata.get("HF_TOKEN"))
print("📤 Uploading to HF Space...")
api.upload_file(
    path_or_fileobj=DB_PATH,
    path_in_repo="data/app_data.db",
    repo_id=HF_REPO_ID,
    repo_type="space",
)
print("✅ Daily refresh complete!")
print(f" Producers: {len(df_producers)}")
print(f" Clients: {len(df_clients_parsed)}")