import csv
from contextlib import closing

import pandas as pd
import streamlit as st

# Fully-qualified Snowflake view backing both the "Exclude Codes" dropdown
# and the SITEID exclusion lookup.
_CODE_VOLUME_VIEW = (
    "ANALYTICS.SIGMA_SCRATCH."
    "VIEW_GAMLOG_AGG_SITE_CODE_VOLUME_188E5F89A6FD42C7994D0F012A2EECD5"
)

# The three Comscore columns we extract; their header cells may appear
# anywhere in the first _HEADER_SEARCH_ROWS rows of the export.
_REQUIRED_COLUMNS = ["Media", "Target Audience (000)", "Composition Index UV"]
_HEADER_SEARCH_ROWS = 30


def _fetch_all_codes(get_conn):
    """Return every distinct ``Codes`` value from the Snowflake view, sorted.

    Connection and cursor are closed even if the query raises (the original
    code leaked both on error).
    """
    with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            f"""
            SELECT DISTINCT Codes
            FROM {_CODE_VOLUME_VIEW}
            ORDER BY Codes
            """
        )
        return [row[0] for row in cur.fetchall()]


def _fetch_excluded_site_ids(get_conn, exclude_codes):
    """Return the set of SITEIDs tagged with any code in *exclude_codes*.

    Values are passed as bound parameters (one placeholder per code) rather
    than interpolated into the SQL text, so a code containing a quote cannot
    break or inject into the query.

    NOTE(review): ``%s`` placeholders assume a pyformat/format paramstyle
    driver (the snowflake-connector-python default) — confirm against the
    actual ``get_conn`` implementation.
    """
    if not exclude_codes:
        return set()
    placeholders = ", ".join(["%s"] * len(exclude_codes))
    with closing(get_conn()) as conn, closing(conn.cursor()) as cur:
        cur.execute(
            f"""
            SELECT DISTINCT SITEID
            FROM {_CODE_VOLUME_VIEW}
            WHERE Codes IN ({placeholders})
            """,
            tuple(exclude_codes),
        )
        return {row[0] for row in cur.fetchall()}


def _parse_comscore_csv(raw_text):
    """Parse a raw Comscore CSV export into a DataFrame of the three
    required columns, with the two numeric columns coerced via
    ``pd.to_numeric(..., errors="coerce")``.

    Raises:
        ValueError: if any required header cannot be located in the first
            ``_HEADER_SEARCH_ROWS`` rows (message lists the missing names).
    """
    rows = list(csv.reader(raw_text.splitlines(), skipinitialspace=True))

    # Comscore exports carry preamble above the data, and each required
    # header may sit on its own row; record where each one first appears.
    header_pos = {}
    for i, row in enumerate(rows[:_HEADER_SEARCH_ROWS]):
        cleaned = [cell.strip() for cell in row]
        for col in _REQUIRED_COLUMNS:
            if col not in header_pos and col in cleaned:
                header_pos[col] = i
        if len(header_pos) == len(_REQUIRED_COLUMNS):
            break

    missing = [c for c in _REQUIRED_COLUMNS if c not in header_pos]
    if missing:
        raise ValueError(f"Missing required column(s): {', '.join(missing)}")

    # Map each header name to its column index within its own header row.
    idx_map = {}
    for col, row_idx in header_pos.items():
        cleaned = [cell.strip() for cell in rows[row_idx]]
        idx_map[col] = cleaned.index(col)

    # Data starts right after the bottom-most header row.
    data_start = max(header_pos.values()) + 1
    data = []
    for row in rows[data_start:]:
        if not row:
            continue
        data.append(
            [
                row[idx_map[col]].strip() if idx_map[col] < len(row) else None
                for col in _REQUIRED_COLUMNS
            ]
        )

    df_clean = pd.DataFrame(data, columns=_REQUIRED_COLUMNS)
    for col in ("Target Audience (000)", "Composition Index UV"):
        df_clean[col] = pd.to_numeric(df_clean[col], errors="coerce")
    return df_clean


def _merge_lookup(df_clean, df_lookup, extract_pmp):
    """Left-join the parsed Comscore rows onto the site lookup table and
    derive a Yes/No ``pmp`` column from ``ad_options``.

    DISCUSSION: ``df_lookup`` still comes from the SIGMA_SCRATCH view.
    """
    lookup_small = df_lookup[
        ["entity", "site_id", "site_name", "status", "ad_options"]
    ]
    df_merged = pd.merge(
        df_clean,
        lookup_small,
        how="left",
        left_on="Media",
        right_on="entity",
    ).drop(columns=["entity"])
    # extract_pmp yields True/False; anything else falls through the map as
    # NaN and is treated as "No".
    df_merged["pmp"] = (
        df_merged["ad_options"]
        .astype(str)
        .apply(extract_pmp)
        .map({True: "Yes", False: "No"})
        .fillna("No")
    )
    return df_merged


def _render_exports(df_display):
    """Emit the three download buttons for the filtered table."""
    # Export – GAM: a single cell containing the comma-joined site_ids.
    site_ids = df_display["site_id"].dropna().astype(str)
    gam_csv = pd.DataFrame({"site_ids": [",".join(site_ids)]}).to_csv(
        index=False, header=False
    )
    st.download_button(
        label="Export - GAM",
        data=gam_csv,
        file_name="gam_export.csv",
        mime="text/csv",
    )

    # Export – Client Facing: site_name plus Media renamed to Domain.
    client_csv = (
        df_display[["site_name", "Media"]]
        .rename(columns={"Media": "Domain"})
        .to_csv(index=False)
    )
    st.download_button(
        label="Export - Client Facing",
        data=client_csv,
        file_name="client_facing_export.csv",
        mime="text/csv",
    )

    # Export – Site Names: a single cell of comma-joined site names.
    site_names = df_display["site_name"].dropna().astype(str)
    names_csv = pd.DataFrame({"site_names": [",".join(site_names)]}).to_csv(
        index=False, header=False
    )
    st.download_button(
        label="Export - Site Names",
        data=names_csv,
        file_name="site_names_export.csv",
        mime="text/csv",
    )


def render(get_conn, get_lookup_table, extract_pmp):
    """Render the Comscore Site List page.

    Args:
        get_conn: Zero-arg callable returning a DB-API connection to
            Snowflake (used for the Codes dropdown and SITEID exclusions).
        get_lookup_table: Zero-arg callable returning the site lookup
            DataFrame with at least entity / site_id / site_name / status /
            ad_options columns.
        extract_pmp: Callable mapping an ``ad_options`` string to True/False.
    """
    st.header("📊 Comscore Site List")

    # 1) Composition Index Threshold input
    threshold = st.number_input(
        "Composition Index Threshold",
        min_value=0.0,
        step=1.0,
        value=150.0,
        help="Show only rows where Composition Index UV ≥ this value",
    )

    # 2) Exclude Codes selector, pre-seeded with bsopt_23 when present.
    all_codes = _fetch_all_codes(get_conn)
    exclude_codes = st.multiselect(
        "Exclude Codes",
        options=all_codes,
        default=["bsopt_23"] if "bsopt_23" in all_codes else [],
        help="Select Codes whose SITEIDs you want to filter out of the main table",
    )

    # 3) File uploader
    upload = st.file_uploader(
        "Upload your Comscore CSV", type="csv", key="comscore_uploader"
    )
    if not upload:
        st.info("Please upload your file from Comscore.")
        return

    try:
        raw_text = upload.read().decode("latin-1")
        try:
            df_clean = _parse_comscore_csv(raw_text)
        except ValueError as err:
            # Show the missing-columns message and bail out with a plain
            # return. Do NOT use st.stop() here: StopException subclasses
            # Exception, so the outer handler below would swallow it and
            # display a misleading "Could not parse..." error instead.
            st.error(str(err))
            return

        df_merged = _merge_lookup(df_clean, get_lookup_table(), extract_pmp)
        excluded_site_ids = _fetch_excluded_site_ids(get_conn, exclude_codes)

        # Combined filter: index threshold, known site, not dropped,
        # PMP-enabled, and not tagged with any excluded code.
        df_filtered = df_merged[
            (df_merged["Composition Index UV"] >= threshold)
            & (df_merged["site_name"].notna())
            & (df_merged["status"] != "Dropped")
            & (df_merged["pmp"] == "Yes")
            & (~df_merged["site_id"].isin(excluded_site_ids))
        ]

        # Drop extras, sort by audience size, display & export.
        df_display = (
            df_filtered.drop(columns=["ad_options"])
            .sort_values("Target Audience (000)", ascending=False)
            .reset_index(drop=True)
        )

        if df_display.empty:
            st.warning("No rows meet the criteria.")
            return

        st.success(f"Showing {len(df_display)} rows after filtering")
        st.dataframe(df_display)
        _render_exports(df_display)
    except Exception as e:
        # Top-level boundary for this page: surface the failure in the UI.
        st.error(f"Could not parse, merge, or export file: {e}")