farmentano12 commited on
Commit
b34d524
·
verified ·
1 Parent(s): 6d779ed

Fix de features y slider de App Id

Browse files
Files changed (1) hide show
  1. app.py +224 -134
app.py CHANGED
@@ -1,17 +1,18 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from io import BytesIO
4
  import os
 
 
5
  import tempfile
 
 
6
 
7
- # BigQuery (opcional)
8
  try:
9
  from google.cloud import bigquery
10
  _HAS_BQ = True
11
  except Exception:
12
  _HAS_BQ = False
13
 
14
- # Para dtypes de BQ -> pandas (opcional)
15
  try:
16
  import db_dtypes # noqa: F401
17
  _HAS_DB_DTYPES = True
@@ -24,10 +25,11 @@ APP_DESC = """
24
  **BigQuery (tabla única)**: `leadgenios-tech.afiliacion_datalake.daily_afiliate_datalake`
25
 
26
  Pasos BQ:
27
- 1) Ingresá **App ID** y **rango de fechas** (YYYY-MM-DD).
28
- 2) **Obtener columnas (schema)** sugiere **columna temporal (event_time)**, **evento (event_name)**, **ID en MMP (appsflyer_id)** y **App ID columna** (app_id).
29
- 3) **Listar eventos por rango** (usa App ID + fechas + columna de evento).
30
- 4) **Consultar y cargar MMP** genera CSV temporal, preview y descarga.
 
31
 
32
  **Archivo**: subir archivo, detectar columnas y (opcional) **listar eventos** para filtrar. No hace falta App ID ni fechas.
33
 
@@ -43,7 +45,7 @@ Pasos BQ:
43
  - Excel: **Hoja 1** tablas por evento; **Hoja 2** `raw_merge`.
44
  """
45
 
46
- # -------------------------- Helpers --------------------------
47
  def _read_excel(pathlike):
48
  return pd.read_excel(pathlike, engine="openpyxl")
49
 
@@ -82,7 +84,6 @@ def _guess(cols, candidates):
82
  return cols[0] if cols else None
83
 
84
  def _guess_optional(cols, candidates):
85
- """Como _guess, pero devuelve None si no encuentra coincidencia."""
86
  lower_map = {c.lower(): c for c in cols}
87
  for cand in candidates:
88
  if cand.lower() in lower_map:
@@ -94,7 +95,35 @@ def _safe_file_output(path):
94
  return path
95
  return None
96
 
97
- # -------------------------- BQ helpers (tabla fija) --------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  BQ_PROJECT = "leadgenios-tech"
99
  BQ_TABLE_FQN = "leadgenios-tech.afiliacion_datalake.daily_afiliate_datalake"
100
 
@@ -108,7 +137,6 @@ def _need_bq_client():
108
 
109
  sa_json = os.getenv("GCP_SA_JSON")
110
  if sa_json:
111
- import json
112
  try:
113
  from google.oauth2 import service_account
114
  except Exception as e:
@@ -121,7 +149,6 @@ def _need_bq_client():
121
  except Exception as e:
122
  raise RuntimeError(f"GCP_SA_JSON inválido o no utilizable: {e}")
123
 
124
- # Fallback local
125
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
126
  try:
127
  return bigquery.Client(project=BQ_PROJECT)
@@ -134,12 +161,28 @@ def bq_get_columns_fixed():
134
  client = _need_bq_client()
135
  table = client.get_table(BQ_TABLE_FQN)
136
  cols = [sch.name for sch in table.schema]
137
- time_guess = _guess(cols, ["event_time","event_date","event_datetime","timestamp","date"])
138
- event_guess = _guess(cols, ["event_name","Event Name","evento","event"])
139
- id_guess = _guess(cols, ["appsflyer_id","advertising_id","adid","idfa","ID","Id"])
140
- appid_guess = _guess(cols, ["app_id","bundle_id","app","appId"])
 
 
141
  return cols, time_guess, event_guess, id_guess, appid_guess
142
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  def bq_list_events_fixed(event_col, time_col, app_id_col, app_id_value, start_date, end_date, limit=500):
144
  client = _need_bq_client()
145
  cols, t_guess, e_guess, _, a_guess = bq_get_columns_fixed()
@@ -196,21 +239,19 @@ def bq_query_to_temp_fixed(event_col, time_col, app_id_col, app_id_value, start_
196
  df.to_csv(tmp.name, index=False)
197
  return tmp.name, df.head(20).to_dict(orient="records")
198
 
199
- # -------------------------- MMP por archivo --------------------------
200
  def file_mmp_schema(file):
201
  try:
202
  df = _safe_read(file)
203
  except Exception as e:
204
  return (gr.update(), gr.update(), gr.update(), gr.update(), f"Error al leer MMP: {e}")
205
  cols = list(df.columns)
206
-
207
- # Requeridas (para el flujo de archivo)
208
- event_guess = _guess(cols, ["event_name","Event Name","evento","EVENTO","Event"])
209
- id_guess = _guess(cols, ["appsflyer_id","Advertising ID","advertising_id","adid","idfa","ID","Id"])
210
-
211
- # Opcionales (NO preseleccionar si no existen)
212
- time_guess = _guess_optional(cols, ["event_time","event_date","event_time_millis","timestamp","date","Date","Event Time"])
213
- appid_guess = _guess_optional(cols, ["app_id","bundle_id","app","appId","App ID"])
214
 
215
  return (gr.update(choices=cols, value=time_guess),
216
  gr.update(choices=cols, value=event_guess),
@@ -239,29 +280,26 @@ def file_prepare(src_file, ev_col, selected_events):
239
  except Exception as e:
240
  raise RuntimeError(f"Error al preparar MMP (archivo): {e}")
241
 
242
- # -------------------------- CLIENTE helpers --------------------------
243
  def cliente_map_columns(cliente_file):
244
  try:
245
  df = _safe_read(cliente_file)
246
  except Exception as e:
247
  return (gr.update(), gr.update(), gr.update(), gr.update(), "Error al leer CLIENTE: "+str(e))
248
  cols = list(df.columns)
249
-
250
- # Requerida
251
  id_guess = _guess(cols, [
252
- "appsflyer_id","Advertising ID","advertising_id","user_id","User Id",
 
253
  "transaction_id","Transaction Id","ID","Id","rut"
254
  ])
255
-
256
- # Opcionales: NO preseleccionar si no existen
257
  valid_guess = None
258
  metric_guess = _guess_optional(cols, ["revenue","amount","value","ticket","Event Revenue","importe","monto"])
259
  event_guess = _guess_optional(cols, ["event_name","Event Name","evento","EVENTO","Event"])
260
 
261
  return (gr.update(choices=cols, value=id_guess),
262
- gr.update(choices=cols, value=valid_guess), # opcional
263
- gr.update(choices=cols, value=metric_guess), # opcional
264
- gr.update(choices=cols, value=event_guess), # opcional
265
  "Columnas de CLIENTE listas.")
266
 
267
  def load_validation_values(cliente_file, validation_col):
@@ -274,13 +312,55 @@ def load_validation_values(cliente_file, validation_col):
274
  vals = sorted(pd.Series(df_c[validation_col].astype(str).unique()).dropna().tolist())
275
  return gr.update(choices=vals, value=[]), f"{len(vals)} valores posibles de validación."
276
 
277
- # -------------------------- Compute --------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  def compute(cliente_file, mmp_final_path,
279
  id_cliente_col, id_mmp_col,
280
- validation_col_client, validation_values,
281
- metric_col_client,
282
- client_event_col, # opcional
283
- mmp_event_col, # requerido
284
  selected_events_mmp):
285
 
286
  if not mmp_final_path:
@@ -288,124 +368,121 @@ def compute(cliente_file, mmp_final_path,
288
  if not cliente_file:
289
  return None, None, "Subí CLIENTE y mapeá las columnas."
290
 
 
291
  try:
292
  df_c = _safe_read(cliente_file)
293
  df_m = _safe_read(mmp_final_path)
294
  except Exception as e:
295
  return None, None, f"Error al leer fuentes: {e}"
296
 
297
- # Requeridos
298
  for name, col, df in [
299
- ("ID CLIENTE", id_cliente_col, df_c),
300
- ("ID MMP", id_mmp_col, df_m),
301
- ("EVENTO (MMP)", mmp_event_col, df_m),
302
  ]:
303
  if not col or col not in df.columns:
304
  return None, None, f"Columna inválida: {name} = {col}"
305
 
306
- # Merge 1: raw (CLIENTE ← MMP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  try:
308
- merged_left = df_c.merge(df_m, left_on=id_cliente_col, right_on=id_mmp_col, how="left",
309
- suffixes=("_CLIENTE", "_MMP"))
 
 
 
 
 
310
  except Exception as e:
311
- return None, None, f"Error durante el merge por IDs: {e}"
312
-
313
- # Merge 2: contar sobre MMP (MMP CLIENTE)
314
- merged_by_mmp = df_m.merge(df_c, left_on=id_mmp_col, right_on=id_cliente_col, how="left",
315
- suffixes=("_MMP", "_CLIENTE"))
316
-
317
- # Resolver nombres tras el merge (manejo de sufijos)
318
- def _resolve(df, col, prefer_suffix):
319
- if not col:
320
- return None
321
- if col in df.columns:
322
- return col
323
- for c in (f"{col}{prefer_suffix}", f"{col}_x", f"{col}_y"):
324
- if c in df.columns:
325
- return c
326
- lower_map = {c.lower(): c for c in df.columns}
327
- return lower_map.get(col.lower(), col)
328
-
329
- client_event_in_left = _resolve(merged_left, client_event_col, "_CLIENTE") if client_event_col else None
330
- mmp_event_in_left = _resolve(merged_left, mmp_event_col, "_MMP")
331
- validation_in_left = _resolve(merged_left, validation_col_client, "_CLIENTE") if validation_col_client else None
332
- metric_in_left = _resolve(merged_left, metric_col_client, "_CLIENTE") if metric_col_client else None
333
-
334
- client_event_in_mmp = _resolve(merged_by_mmp, client_event_col, "_CLIENTE") if client_event_col else None
335
- validation_in_mmp = _resolve(merged_by_mmp, validation_col_client, "_CLIENTE") if validation_col_client else None
336
- metric_in_mmp = _resolve(merged_by_mmp, metric_col_client, "_CLIENTE") if metric_col_client else None
337
- mmp_event_in_mmp = _resolve(merged_by_mmp, mmp_event_col, "_MMP")
338
-
339
- # Eventos objetivo
340
  if not selected_events_mmp:
341
- selected_events_mmp = sorted(df_m[mmp_event_col].astype(str).dropna().unique().tolist())
 
 
 
 
 
 
342
 
343
- # Denominador: conteo MMP por evento
344
- mmp_counts_map = df_m[mmp_event_col].astype(str).value_counts(dropna=False).to_dict()
345
 
346
- tables_by_event = {}
 
 
 
 
 
347
 
 
 
348
  for ev in selected_events_mmp:
349
  ev_str = str(ev)
350
- mmp_total = int(mmp_counts_map.get(ev_str, 0))
351
-
352
- # Numerador: filas MMP con match por ID en CLIENTE (y validación si aplica).
353
- sub_mmp = merged_by_mmp[merged_by_mmp[mmp_event_in_mmp].astype(str) == ev_str]
354
-
355
- if client_event_in_mmp and client_event_in_mmp in merged_by_mmp.columns:
356
- # Si hay evento en CLIENTE, además debe coincidir con el ev del MMP
357
- sub_mmp = sub_mmp[sub_mmp[client_event_in_mmp].astype(str) == ev_str]
358
-
359
- has_client = sub_mmp[id_cliente_col].notna()
360
- valid_mask = has_client
361
- if validation_in_mmp and validation_values:
362
- valid_mask = valid_mask & sub_mmp[validation_in_mmp].astype(str).isin([str(v) for v in validation_values])
363
-
364
- cliente_count = int(valid_mask.sum())
365
-
366
- metric_sum = 0.0
367
- if metric_in_mmp and metric_in_mmp in sub_mmp.columns:
368
- vals = pd.to_numeric(sub_mmp.loc[valid_mask, metric_in_mmp], errors="coerce")
369
- metric_sum = float(vals.sum()) if cliente_count else 0.0
370
-
371
- pct = round((cliente_count / mmp_total * 100), 1) if mmp_total else 0.0
372
- row = {"Cliente": cliente_count, "MMP": mmp_total, "%": pct}
373
- if metric_col_client and metric_in_mmp and metric_in_mmp in merged_by_mmp.columns:
374
- row[f"CLIENTE_{metric_col_client}_suma_validado"] = metric_sum
375
 
376
- tables_by_event[ev] = pd.DataFrame([row])
377
-
378
- # ===== Excel =====
379
  xls_bytes = BytesIO()
380
  with pd.ExcelWriter(xls_bytes, engine="xlsxwriter") as writer:
 
381
  sheet_name = "tablas_por_EVENTO"
382
  start_row = 0
383
  for ev, table_df in tables_by_event.items():
384
- pd.DataFrame([[ev]]).to_excel(writer, sheet_name=sheet_name,
385
- startrow=start_row, index=False, header=False)
 
 
386
  start_row += 1
387
- table_df.to_excel(writer, sheet_name=sheet_name,
388
- startrow=start_row, index=False)
 
389
  start_row += len(table_df) + 2
390
 
391
- # Hoja 2: raw_merge (cliente mmp)
392
- cols_keep = []
393
- for col in [id_cliente_col, id_mmp_col, client_event_in_left, mmp_event_in_left]:
394
- if col and col in merged_left.columns and col not in cols_keep:
395
- cols_keep.append(col)
396
- if validation_in_left and validation_in_left in merged_left.columns and validation_in_left not in cols_keep:
397
- cols_keep.append(validation_in_left)
398
- if metric_in_left and metric_in_left in merged_left.columns and metric_in_left not in cols_keep:
399
- cols_keep.append(metric_in_left)
400
- cols_rest = [c for c in merged_left.columns if c not in cols_keep]
401
- merged_left[cols_keep + cols_rest].to_excel(writer, sheet_name="raw_merge", index=False)
402
 
403
  xls_bytes.seek(0)
404
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
405
  tmp.write(xls_bytes.getvalue()); tmp.flush(); tmp.close()
406
  download_path = tmp.name
407
 
408
- # Preview
409
  preview = None
410
  if tables_by_event:
411
  first_ev = list(tables_by_event.keys())[0]
@@ -413,7 +490,7 @@ def compute(cliente_file, mmp_final_path,
413
 
414
  return preview, download_path, "Listo ✅"
415
 
416
- # -------------------------- UI --------------------------
417
  with gr.Blocks(title=APP_TITLE) as demo:
418
  gr.Markdown(f"# {APP_TITLE}\n\n{APP_DESC}")
419
 
@@ -421,11 +498,15 @@ with gr.Blocks(title=APP_TITLE) as demo:
421
  gr.Markdown("## Fuente 1: MMP")
422
  mmp_source = gr.Radio(choices=["Subir archivo", "BigQuery"], value="Subir archivo", label="Fuente de MMP")
423
 
424
- # --- BigQuery Panel (tabla fija) ---
425
  with gr.Column(visible=False) as bq_panel:
426
  gr.Markdown("**Paso MMP-BQ 1**: App ID y Fechas")
427
  with gr.Row():
428
- bq_app_id_value = gr.Textbox(label="App ID (valor exacto)", placeholder="com.tu.app")
 
 
 
 
429
  bq_start = gr.Textbox(label="Fecha desde (YYYY-MM-DD)", placeholder="YYYY-MM-DD")
430
  bq_end = gr.Textbox(label="Fecha hasta (YYYY-MM-DD)", placeholder="YYYY-MM-DD")
431
 
@@ -433,7 +514,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
433
  with gr.Row():
434
  bq_time_col = gr.Dropdown(choices=[], value=None, label="Columna temporal (ej: event_time)")
435
  mmp_event_col_bq = gr.Dropdown(choices=[], value=None, label="Columna de EVENTO en MMP (ej: event_name)")
436
- id_mmp_col_bq = gr.Dropdown(choices=[], value=None, label="ID en MMP (para cruce) (ej: appsflyer_id)")
437
  bq_app_id_col = gr.Dropdown(choices=[], value=None, label="Columna App ID (ej: app_id)")
438
  bq_schema_btn = gr.Button("Obtener columnas (schema)")
439
  bq_schema_msg = gr.Markdown()
@@ -450,7 +531,7 @@ with gr.Blocks(title=APP_TITLE) as demo:
450
  bq_query_btn = gr.Button("Consultar y cargar MMP (BigQuery)")
451
  bq_query_msg = gr.Markdown()
452
 
453
- # --- File Panel (simplificado) ---
454
  with gr.Column(visible=True) as file_panel:
455
  gr.Markdown("**Paso MMP-Archivo 1**: Subir y detectar columnas")
456
  mmp_file = gr.File(label="Subí MMP.xlsx/csv", file_types=[".xlsx", ".csv"])
@@ -479,6 +560,15 @@ with gr.Blocks(title=APP_TITLE) as demo:
479
  return (gr.update(visible=(src=="Subir archivo")), gr.update(visible=(src=="BigQuery")))
480
  mmp_source.change(_toggle_source, inputs=[mmp_source], outputs=[file_panel, bq_panel])
481
 
 
 
 
 
 
 
 
 
 
482
  # BQ: schema
483
  def _bq_schema_fixed():
484
  try:
@@ -582,13 +672,13 @@ with gr.Blocks(title=APP_TITLE) as demo:
582
 
583
  if source == "Subir archivo":
584
  mmp_path = mmp_final_file_panel
585
- id_mmp = id_mmp_file # valor seleccionado
586
- mmp_evt_col = mmp_evt_file # valor seleccionado
587
  selected_events = events_file
588
  else:
589
  mmp_path = mmp_final_bq_panel
590
- id_mmp = id_mmp_bq # valor seleccionado
591
- mmp_evt_col = mmp_evt_bq # valor seleccionado
592
  selected_events = events_bq
593
 
594
  if not id_cli:
 
1
+ # app.py
 
 
2
  import os
3
+ import json
4
+ from io import BytesIO
5
  import tempfile
6
+ import pandas as pd
7
+ import gradio as gr
8
 
9
+ # ================== BigQuery deps (opcionales) ==================
10
  try:
11
  from google.cloud import bigquery
12
  _HAS_BQ = True
13
  except Exception:
14
  _HAS_BQ = False
15
 
 
16
  try:
17
  import db_dtypes # noqa: F401
18
  _HAS_DB_DTYPES = True
 
25
  **BigQuery (tabla única)**: `leadgenios-tech.afiliacion_datalake.daily_afiliate_datalake`
26
 
27
  Pasos BQ:
28
+ 1) **Listar App IDs (BigQuery)** y seleccionar uno.
29
+ 2) Ingresá **rango de fechas** (YYYY-MM-DD).
30
+ 3) **Obtener columnas (schema)** → sugiere **columna temporal (event_time)**, **evento (event_name)**, **ID en MMP (appsflyer_id/customer_user_id/advertising_id)** y **App ID** (app_id).
31
+ 4) **Listar eventos por rango** (usa App ID + fechas + columna de evento).
32
+ 5) **Consultar y cargar MMP** → genera CSV temporal, preview y descarga.
33
 
34
  **Archivo**: subir archivo, detectar columnas y (opcional) **listar eventos** para filtrar. No hace falta App ID ni fechas.
35
 
 
45
  - Excel: **Hoja 1** tablas por evento; **Hoja 2** `raw_merge`.
46
  """
47
 
48
+ # ================== Helpers de lectura ==================
49
  def _read_excel(pathlike):
50
  return pd.read_excel(pathlike, engine="openpyxl")
51
 
 
84
  return cols[0] if cols else None
85
 
86
  def _guess_optional(cols, candidates):
 
87
  lower_map = {c.lower(): c for c in cols}
88
  for cand in candidates:
89
  if cand.lower() in lower_map:
 
95
  return path
96
  return None
97
 
98
+ # ================== Normalización de IDs ==================
99
def normalize_id_series(s: pd.Series) -> pd.Series:
    """
    Normalize identifier values ahead of a merge.

    - Missing values stay missing (pd.NA).
    - Integer-valued floats (e.g. 123.0) become "123".
    - Non-integer floats are stringified as-is.
    - Everything else is stringified and stripped; literal "nan"/"none"
      and empty strings are treated as missing.

    Returns a pandas "string"-dtype Series (NA-capable).
    """
    def _clean(value):
        if pd.isna(value):
            return pd.NA
        if isinstance(value, float):
            # drop the trailing ".0" on whole-number floats
            return str(int(value)) if value.is_integer() else str(value)
        text = str(value).strip()
        # literal "nan"/"none"/empty after stripping count as missing
        return pd.NA if text.lower() in ("nan", "none", "") else text

    return s.map(_clean).astype("string")
125
+
126
+ # ================== BigQuery helpers ==================
127
  BQ_PROJECT = "leadgenios-tech"
128
  BQ_TABLE_FQN = "leadgenios-tech.afiliacion_datalake.daily_afiliate_datalake"
129
 
 
137
 
138
  sa_json = os.getenv("GCP_SA_JSON")
139
  if sa_json:
 
140
  try:
141
  from google.oauth2 import service_account
142
  except Exception as e:
 
149
  except Exception as e:
150
  raise RuntimeError(f"GCP_SA_JSON inválido o no utilizable: {e}")
151
 
 
152
  if os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
153
  try:
154
  return bigquery.Client(project=BQ_PROJECT)
 
161
  client = _need_bq_client()
162
  table = client.get_table(BQ_TABLE_FQN)
163
  cols = [sch.name for sch in table.schema]
164
+
165
+ time_guess = _guess(cols, ["event_time", "install_time", "attributed_touch_time"])
166
+ event_guess = _guess(cols, ["event_name"])
167
+ # IDs típicos
168
+ id_guess = _guess(cols, ["appsflyer_id", "customer_user_id", "advertising_id"])
169
+ appid_guess = _guess(cols, ["app_id"])
170
  return cols, time_guess, event_guess, id_guess, appid_guess
171
 
172
def bq_list_app_ids(limit=500):
    """
    Return the distinct App IDs present in the fixed BQ table.

    Parameters:
        limit: maximum number of App IDs to fetch (int-cast into the SQL).

    Returns:
        (sorted list of App ID strings, human-readable status message)
    """
    client = _need_bq_client()
    sql = f"""
    SELECT DISTINCT CAST(app_id AS STRING) AS app_id
    FROM `{BQ_TABLE_FQN}`
    WHERE app_id IS NOT NULL AND app_id <> ''
    ORDER BY app_id
    LIMIT {int(limit)}
    """
    result = client.query(sql).result()
    frame = result.to_dataframe(create_bqstorage_client=False)
    app_ids = sorted(frame["app_id"].dropna().astype(str).tolist())
    return app_ids, f"{len(app_ids)} App IDs encontrados."
185
+
186
  def bq_list_events_fixed(event_col, time_col, app_id_col, app_id_value, start_date, end_date, limit=500):
187
  client = _need_bq_client()
188
  cols, t_guess, e_guess, _, a_guess = bq_get_columns_fixed()
 
239
  df.to_csv(tmp.name, index=False)
240
  return tmp.name, df.head(20).to_dict(orient="records")
241
 
242
+ # ================== MMP por archivo ==================
243
  def file_mmp_schema(file):
244
  try:
245
  df = _safe_read(file)
246
  except Exception as e:
247
  return (gr.update(), gr.update(), gr.update(), gr.update(), f"Error al leer MMP: {e}")
248
  cols = list(df.columns)
249
+ event_guess = _guess(cols, ["event_name", "Event Name", "evento", "EVENTO", "Event"])
250
+ id_guess = _guess(cols, ["appsflyer_id", "customer_user_id", "advertising_id",
251
+ "Advertising ID", "adid", "idfa", "ID", "Id"])
252
+ time_guess = _guess_optional(cols, ["event_time", "install_time", "attributed_touch_time",
253
+ "event_date", "timestamp", "date", "Date", "Event Time"])
254
+ appid_guess = _guess_optional(cols, ["app_id", "bundle_id", "app", "appId", "App ID"])
 
 
255
 
256
  return (gr.update(choices=cols, value=time_guess),
257
  gr.update(choices=cols, value=event_guess),
 
280
  except Exception as e:
281
  raise RuntimeError(f"Error al preparar MMP (archivo): {e}")
282
 
283
+ # ================== CLIENTE helpers ==================
284
def cliente_map_columns(cliente_file):
    """
    Inspect the uploaded CLIENTE file and suggest column mappings.

    Returns five values: gr.update objects for the (id, validation, metric,
    event) dropdowns plus a status message. The validation column is never
    preselected; metric/event are only suggested when a matching column
    actually exists in the file.
    """
    try:
        frame = _safe_read(cliente_file)
    except Exception as exc:
        return (gr.update(), gr.update(), gr.update(), gr.update(),
                "Error al leer CLIENTE: " + str(exc))

    columns = list(frame.columns)

    # Required: the join key against the MMP side.
    suggested_id = _guess(columns, [
        "appsflyer_id", "customer_user_id", "advertising_id",
        "Advertising ID", "user_id", "User Id",
        "transaction_id", "Transaction Id", "ID", "Id", "rut",
    ])
    # Optional: only suggested when present in the file.
    suggested_metric = _guess_optional(
        columns,
        ["revenue", "amount", "value", "ticket", "Event Revenue", "importe", "monto"],
    )
    suggested_event = _guess_optional(
        columns,
        ["event_name", "Event Name", "evento", "EVENTO", "Event"],
    )

    return (gr.update(choices=columns, value=suggested_id),
            gr.update(choices=columns, value=None),   # validation: never preselect
            gr.update(choices=columns, value=suggested_metric),
            gr.update(choices=columns, value=suggested_event),
            "Columnas de CLIENTE listas.")
304
 
305
  def load_validation_values(cliente_file, validation_col):
 
312
  vals = sorted(pd.Series(df_c[validation_col].astype(str).unique()).dropna().tolist())
313
  return gr.update(choices=vals, value=[]), f"{len(vals)} valores posibles de validación."
314
 
315
+
316
+ # ================== Compute ==================
317
+ from io import BytesIO
318
+ import tempfile
319
+ import re
320
+ import pandas as pd
321
+
322
+ # --- helpers ---------------------------------------------------------
323
+
324
+ def normalize_id_series(s: pd.Series) -> pd.Series:
325
+ """
326
+ Normalize IDs for robust equality:
327
+ - cast to string
328
+ - strip whitespace
329
+ - lowercase
330
+ - convert 'nan'/'none' to ''
331
+ """
332
+ x = s.astype(str).str.strip().str.lower()
333
+ x = x.replace({"nan": "", "none": ""})
334
+ return x.fillna("")
335
+
336
+ def _autodetect_validation_col(cols):
337
+ """Try to find a likely validation column if user didn't pick one."""
338
+ candidates = [
339
+ "valid", "valido", "válido", "is_valid", "usable", "status",
340
+ "approved", "aprobado", "ok", "flag", "validated", "validation"
341
+ ]
342
+ lower = {c.lower(): c for c in cols}
343
+ for cand in candidates:
344
+ if cand in lower:
345
+ return lower[cand]
346
+ return None
347
+
348
+ def _default_truthy_set():
349
+ # NOTE: all lowercased string checks
350
+ return {
351
+ "true", "1", "yes", "y", "ok", "si", "sí",
352
+ "valid", "valido", "válido", "usable", "approved", "aprobado",
353
+ "x", "t"
354
+ }
355
+
356
+ # --- main ------------------------------------------------------------
357
+
358
  def compute(cliente_file, mmp_final_path,
359
  id_cliente_col, id_mmp_col,
360
+ validation_col_client, validation_values, # optional
361
+ metric_col_client, # ignored in this logic
362
+ client_event_col, # ignored (denominator is MMP)
363
+ mmp_event_col, # required
364
  selected_events_mmp):
365
 
366
  if not mmp_final_path:
 
368
  if not cliente_file:
369
  return None, None, "Subí CLIENTE y mapeá las columnas."
370
 
371
+ # Read sources
372
  try:
373
  df_c = _safe_read(cliente_file)
374
  df_m = _safe_read(mmp_final_path)
375
  except Exception as e:
376
  return None, None, f"Error al leer fuentes: {e}"
377
 
378
+ # Required columns present?
379
  for name, col, df in [
380
+ ("ID en CLIENTE", id_cliente_col, df_c),
381
+ ("ID en MMP", id_mmp_col, df_m),
382
+ ("EVENTO en MMP", mmp_event_col, df_m),
383
  ]:
384
  if not col or col not in df.columns:
385
  return None, None, f"Columna inválida: {name} = {col}"
386
 
387
+ # Normalize IDs
388
+ try:
389
+ ids_cli_norm = normalize_id_series(df_c[id_cliente_col])
390
+ ids_mmp_norm = normalize_id_series(df_m[id_mmp_col])
391
+ except Exception as e:
392
+ return None, None, f"Error normalizando IDs: {e}"
393
+
394
+ # If user didn't select a validation col, try to autodetect one
395
+ if not validation_col_client or validation_col_client not in df_c.columns:
396
+ auto_val_col = _autodetect_validation_col(df_c.columns)
397
+ validation_col_client = auto_val_col if auto_val_col else None
398
+
399
+ # If a validation column exists but user didn't pick values, use default “truthy” set
400
+ truthy = _default_truthy_set()
401
+ use_validation = validation_col_client is not None
402
+ if use_validation:
403
+ cand_vals = validation_values or []
404
+ if cand_vals:
405
+ truthy = {str(v).strip().lower() for v in cand_vals}
406
+
407
+ # Build set of CLIENTE IDs that are considered valid
408
  try:
409
+ if use_validation:
410
+ val_series = df_c[validation_col_client].astype(str).str.strip().str.lower()
411
+ mask_valid = val_series.isin(truthy)
412
+ valid_client_ids = set(ids_cli_norm[mask_valid][ids_cli_norm[mask_valid] != ""])
413
+ else:
414
+ # No validation column → any presence in CLIENTE counts as valid
415
+ valid_client_ids = set(ids_cli_norm[ids_cli_norm != ""])
416
  except Exception as e:
417
+ return None, None, f"Error aplicando validación en CLIENTE: {e}"
418
+
419
+ # Create VALIDO flag in MMP: True if MMP id ∈ valid_client_ids
420
+ df_m = df_m.copy()
421
+ df_m["VALIDO"] = ids_mmp_norm.isin(valid_client_ids)
422
+
423
+ # Events to process (if none selected, use all present in MMP)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
424
  if not selected_events_mmp:
425
+ try:
426
+ selected_events_mmp = (
427
+ df_m[mmp_event_col].astype(str).dropna().unique().tolist()
428
+ )
429
+ selected_events_mmp = sorted(map(str, selected_events_mmp))
430
+ except Exception as e:
431
+ return None, None, f"Error obteniendo lista de eventos MMP: {e}"
432
 
433
+ # B: total rows in MMP per event
434
+ mmp_counts = df_m.groupby(df_m[mmp_event_col].astype(str), dropna=False).size()
435
 
436
+ # A: total rows in MMP per event with VALIDO=True
437
+ cliente_counts = (
438
+ df_m[df_m["VALIDO"]]
439
+ .groupby(df_m.loc[df_m["VALIDO"], mmp_event_col].astype(str), dropna=False)
440
+ .size()
441
+ )
442
 
443
+ # Build event tables
444
+ tables_by_event = {}
445
  for ev in selected_events_mmp:
446
  ev_str = str(ev)
447
+ B = int(mmp_counts.get(ev_str, 0))
448
+ A = int(cliente_counts.get(ev_str, 0))
449
+ pct = round((A / B * 100), 1) if B else 0.0
450
+ tables_by_event[ev] = pd.DataFrame([{"Cliente": A, "MMP": B, "%": pct}])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
451
 
452
+ # ===== Excel output =====
 
 
453
  xls_bytes = BytesIO()
454
  with pd.ExcelWriter(xls_bytes, engine="xlsxwriter") as writer:
455
+ # Sheet 1: tables by EVENT
456
  sheet_name = "tablas_por_EVENTO"
457
  start_row = 0
458
  for ev, table_df in tables_by_event.items():
459
+ pd.DataFrame([[ev]]).to_excel(
460
+ writer, sheet_name=sheet_name, startrow=start_row,
461
+ index=False, header=False
462
+ )
463
  start_row += 1
464
+ table_df.to_excel(
465
+ writer, sheet_name=sheet_name, startrow=start_row, index=False
466
+ )
467
  start_row += len(table_df) + 2
468
 
469
+ # Sheet 2: raw MMP + only VALIDO (explicitly drop the ID columns)
470
+ cols_front = ["VALIDO"] # first column
471
+ # Keep event column visible & useful
472
+ if mmp_event_col in df_m.columns:
473
+ cols_front.insert(0, mmp_event_col)
474
+
475
+ # Exclude ID & any helper columns from raw output
476
+ drop_cols = {id_mmp_col, "_id_norm_mmp"} # (we never created _id_norm_mmp here)
477
+ cols_rest = [c for c in df_m.columns if c not in set(cols_front) | drop_cols]
478
+ df_m[cols_front + cols_rest].to_excel(writer, sheet_name="raw_mmp", index=False)
 
479
 
480
  xls_bytes.seek(0)
481
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx")
482
  tmp.write(xls_bytes.getvalue()); tmp.flush(); tmp.close()
483
  download_path = tmp.name
484
 
485
+ # Preview: first event table
486
  preview = None
487
  if tables_by_event:
488
  first_ev = list(tables_by_event.keys())[0]
 
490
 
491
  return preview, download_path, "Listo ✅"
492
 
493
+ # ================== UI ==================
494
  with gr.Blocks(title=APP_TITLE) as demo:
495
  gr.Markdown(f"# {APP_TITLE}\n\n{APP_DESC}")
496
 
 
498
  gr.Markdown("## Fuente 1: MMP")
499
  mmp_source = gr.Radio(choices=["Subir archivo", "BigQuery"], value="Subir archivo", label="Fuente de MMP")
500
 
501
+ # --- BigQuery Panel ---
502
  with gr.Column(visible=False) as bq_panel:
503
  gr.Markdown("**Paso MMP-BQ 1**: App ID y Fechas")
504
  with gr.Row():
505
+ bq_app_id_value = gr.Dropdown(choices=[], value=None, label="App ID (BigQuery)")
506
+ list_app_ids_btn = gr.Button("Listar App IDs (BigQuery)")
507
+ list_app_ids_msg = gr.Markdown()
508
+
509
+ with gr.Row():
510
  bq_start = gr.Textbox(label="Fecha desde (YYYY-MM-DD)", placeholder="YYYY-MM-DD")
511
  bq_end = gr.Textbox(label="Fecha hasta (YYYY-MM-DD)", placeholder="YYYY-MM-DD")
512
 
 
514
  with gr.Row():
515
  bq_time_col = gr.Dropdown(choices=[], value=None, label="Columna temporal (ej: event_time)")
516
  mmp_event_col_bq = gr.Dropdown(choices=[], value=None, label="Columna de EVENTO en MMP (ej: event_name)")
517
+ id_mmp_col_bq = gr.Dropdown(choices=[], value=None, label="ID en MMP (para cruce)")
518
  bq_app_id_col = gr.Dropdown(choices=[], value=None, label="Columna App ID (ej: app_id)")
519
  bq_schema_btn = gr.Button("Obtener columnas (schema)")
520
  bq_schema_msg = gr.Markdown()
 
531
  bq_query_btn = gr.Button("Consultar y cargar MMP (BigQuery)")
532
  bq_query_msg = gr.Markdown()
533
 
534
+ # --- File Panel ---
535
  with gr.Column(visible=True) as file_panel:
536
  gr.Markdown("**Paso MMP-Archivo 1**: Subir y detectar columnas")
537
  mmp_file = gr.File(label="Subí MMP.xlsx/csv", file_types=[".xlsx", ".csv"])
 
560
  return (gr.update(visible=(src=="Subir archivo")), gr.update(visible=(src=="BigQuery")))
561
  mmp_source.change(_toggle_source, inputs=[mmp_source], outputs=[file_panel, bq_panel])
562
 
563
+ # BQ: listar App IDs
564
+ def _bq_list_app_ids():
565
+ try:
566
+ vals, msg = bq_list_app_ids()
567
+ return gr.update(choices=vals, value=(vals[0] if vals else None)), msg
568
+ except Exception as e:
569
+ return gr.update(choices=[], value=None), f"Error listando App IDs: {e}"
570
+ list_app_ids_btn.click(_bq_list_app_ids, inputs=[], outputs=[bq_app_id_value, list_app_ids_msg])
571
+
572
  # BQ: schema
573
  def _bq_schema_fixed():
574
  try:
 
672
 
673
  if source == "Subir archivo":
674
  mmp_path = mmp_final_file_panel
675
+ id_mmp = id_mmp_file
676
+ mmp_evt_col = mmp_evt_file
677
  selected_events = events_file
678
  else:
679
  mmp_path = mmp_final_bq_panel
680
+ id_mmp = id_mmp_bq
681
+ mmp_evt_col = mmp_evt_bq
682
  selected_events = events_bq
683
 
684
  if not id_cli: