# Sales_Toolkit / comscore_site_list.py
# Auto-synced from GitHub (github-actions[bot], commit bc7640d).
import csv
import pandas as pd
import streamlit as st
# Fully-qualified name of the Snowflake view that maps Codes -> SITEIDs.
_VOLUME_VIEW = (
    "ANALYTICS.SIGMA_SCRATCH."
    "VIEW_GAMLOG_AGG_SITE_CODE_VOLUME_188E5F89A6FD42C7994D0F012A2EECD5"
)


def _fetch_all_codes(get_conn):
    """Return every distinct Codes value from the volume view, sorted.

    Opens and closes its own connection; the cursor and connection are
    released even if the query raises.
    """
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            cur.execute(
                f"SELECT DISTINCT Codes FROM {_VOLUME_VIEW} ORDER BY Codes"
            )
            return [row[0] for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        conn.close()


def _fetch_excluded_site_ids(get_conn, codes):
    """Return the set of SITEIDs whose Codes appear in *codes*.

    Returns an empty set without touching the database when *codes* is
    empty. Values are bound as query parameters instead of being quoted
    into the SQL text (the previous f-string interpolation was an
    injection-shaped pattern and broke on codes containing quotes).
    """
    if not codes:
        return set()
    conn = get_conn()
    try:
        cur = conn.cursor()
        try:
            # NOTE(review): assumes the connector accepts %s-style
            # placeholders (snowflake-connector-python default paramstyle)
            # — confirm against the connection factory.
            placeholders = ", ".join(["%s"] * len(codes))
            cur.execute(
                f"SELECT DISTINCT SITEID FROM {_VOLUME_VIEW} "
                f"WHERE Codes IN ({placeholders})",
                tuple(codes),
            )
            return {row[0] for row in cur.fetchall()}
        finally:
            cur.close()
    finally:
        conn.close()


def _parse_comscore_csv(raw_text):
    """Parse a raw Comscore export into a 3-column DataFrame.

    Comscore exports carry preamble rows above the real table, so the
    three required headers ("Media", "Target Audience (000)",
    "Composition Index UV") are located anywhere within the first 30
    rows; data is read starting after the bottom-most header row.

    Returns a DataFrame with the two numeric columns coerced via
    ``pd.to_numeric(errors="coerce")`` (unparseable cells become NaN).

    Raises:
        ValueError: if any required header cannot be found.
    """
    required = ["Media", "Target Audience (000)", "Composition Index UV"]
    rows = list(csv.reader(raw_text.splitlines(), skipinitialspace=True))

    # Record the first row index where each required header appears.
    header_pos = {}
    for i, row in enumerate(rows[:30]):
        cleaned = [cell.strip() for cell in row]
        for col in required:
            if col not in header_pos and col in cleaned:
                header_pos[col] = i
        if len(header_pos) == len(required):
            break

    missing = [c for c in required if c not in header_pos]
    if missing:
        raise ValueError(f"Missing required column(s): {', '.join(missing)}")

    # Map each header name to its column index within its own header row
    # (the three headers are not guaranteed to share one row).
    idx_map = {
        col: [cell.strip() for cell in rows[prow]].index(col)
        for col, prow in header_pos.items()
    }

    # Data starts right after the bottom-most header row; short rows pad
    # the missing trailing cells with None.
    data = []
    for row in rows[max(header_pos.values()) + 1 :]:
        if not row:
            continue
        data.append(
            [
                row[idx_map[col]].strip() if idx_map[col] < len(row) else None
                for col in required
            ]
        )

    df = pd.DataFrame(data, columns=required)
    for col in ("Target Audience (000)", "Composition Index UV"):
        df[col] = pd.to_numeric(df[col], errors="coerce")
    return df


def _render_exports(df_display):
    """Render the three download buttons for the filtered table."""
    # Export - GAM: a single cell holding every site_id, comma-joined.
    site_ids = df_display["site_id"].dropna().astype(str)
    gam_csv = pd.DataFrame({"site_ids": [",".join(site_ids)]}).to_csv(
        index=False, header=False
    )
    st.download_button(
        label="Export - GAM",
        data=gam_csv,
        file_name="gam_export.csv",
        mime="text/csv",
    )

    # Export - Client Facing: site_name plus the Media column renamed Domain.
    client_csv = (
        df_display[["site_name", "Media"]]
        .rename(columns={"Media": "Domain"})
        .to_csv(index=False)
    )
    st.download_button(
        label="Export - Client Facing",
        data=client_csv,
        file_name="client_facing_export.csv",
        mime="text/csv",
    )

    # Export - Site Names: a single cell holding every name, comma-joined.
    site_names = df_display["site_name"].dropna().astype(str)
    names_csv = pd.DataFrame({"site_names": [",".join(site_names)]}).to_csv(
        index=False, header=False
    )
    st.download_button(
        label="Export - Site Names",
        data=names_csv,
        file_name="site_names_export.csv",
        mime="text/csv",
    )


def render(get_conn, get_lookup_table, extract_pmp):
    """Render the Comscore Site List tool.

    Args:
        get_conn: zero-arg factory returning a DB-API connection
            (Snowflake) used for the Codes dropdown and the exclusion
            lookup.
        get_lookup_table: zero-arg factory returning the site lookup
            DataFrame; must contain the columns entity, site_id,
            site_name, status, ad_options.
        extract_pmp: predicate applied to each ``ad_options`` string;
            truthy means the site is treated as PMP-enabled.

    Flow: threshold + exclude-codes inputs, CSV upload, parse, merge
    against the lookup table, filter, display, and three CSV exports.
    """
    st.header("📊 Comscore Site List")

    # 1) Composition Index Threshold input
    threshold = st.number_input(
        "Composition Index Threshold",
        min_value=0.0,
        step=1.0,
        value=150.0,
        help="Show only rows where Composition Index UV ≥ this value",
    )

    # 2) Exclude Codes selector, populated from the volume view.
    all_codes = _fetch_all_codes(get_conn)
    exclude_codes = st.multiselect(
        "Exclude Codes",
        options=all_codes,
        default=["bsopt_23"] if "bsopt_23" in all_codes else [],
        help="Select Codes whose SITEIDs you want to filter out of the main table",
    )

    # 3) File uploader
    upload = st.file_uploader(
        "Upload your Comscore CSV", type="csv", key="comscore_uploader"
    )
    if not upload:
        st.info("Please upload your file from Comscore.")
        return

    # Parse. Missing-header errors get their own message and STOP the
    # flow with a plain return — the old code called st.stop() inside the
    # broad try, whose except caught Streamlit's StopException and showed
    # a second, spurious "Could not parse..." error.
    try:
        raw_text = upload.read().decode("latin-1")
        df_clean = _parse_comscore_csv(raw_text)
    except ValueError as e:
        st.error(str(e))
        return
    except Exception as e:
        st.error(f"Could not parse, merge, or export file: {e}")
        return

    try:
        # Snowflake lookup & merge (left join keeps every Comscore row;
        # unmatched rows carry NaN lookup columns and are filtered below).
        df_lookup = get_lookup_table()
        lookup_small = df_lookup[
            ["entity", "site_id", "site_name", "status", "ad_options"]
        ]
        df_merged = pd.merge(
            df_clean,
            lookup_small,
            how="left",
            left_on="Media",
            right_on="entity",
        ).drop(columns=["entity"])

        # Yes/No PMP flag; anything extract_pmp doesn't classify as
        # True/False (e.g. NaN ad_options) falls back to "No".
        df_merged["pmp"] = (
            df_merged["ad_options"]
            .astype(str)
            .apply(extract_pmp)
            .map({True: "Yes", False: "No"})
            .fillna("No")
        )

        excluded_site_ids = _fetch_excluded_site_ids(get_conn, exclude_codes)

        # Combined filter: above threshold, known site, not Dropped,
        # PMP-enabled, and not excluded by the selected Codes.
        df_filtered = df_merged[
            (df_merged["Composition Index UV"] >= threshold)
            & (df_merged["site_name"].notna())
            & (df_merged["status"] != "Dropped")
            & (df_merged["pmp"] == "Yes")
            & (~df_merged["site_id"].isin(excluded_site_ids))
        ]

        df_display = (
            df_filtered.drop(columns=["ad_options"])
            .sort_values("Target Audience (000)", ascending=False)
            .reset_index(drop=True)
        )
        if df_display.empty:
            st.warning("No rows meet the criteria.")
        else:
            st.success(f"Showing {len(df_display)} rows after filtering")
            st.dataframe(df_display)
            _render_exports(df_display)
    except Exception as e:
        st.error(f"Could not parse, merge, or export file: {e}")