rogergs94 committed on
Commit
5700668
·
verified ·
1 Parent(s): 29b8dcc

Update app.py

Browse files

Updated with grouped option and map tab (to be built)

Files changed (1) hide show
  1. app.py +556 -55
app.py CHANGED
@@ -1,3 +1,7 @@
 
 
 
 
1
  import pandas as pd
2
  import requests
3
  import xml.etree.ElementTree as ET
@@ -7,6 +11,17 @@ import gzip
7
  import datetime
8
  import gradio as gr
9
  import os
 
 
 
 
 
 
 
 
 
 
 
10
 
11
  class FeedReader:
12
  def __init__(self):
@@ -106,7 +121,7 @@ class FeedReader:
106
  def process_feed(self, url, job_tag="job"):
107
  """Main function to process feed and return results"""
108
  if not url.strip():
109
- return "Please enter a valid URL", None, "", ""
110
 
111
  # Load the feed
112
  result = self.load_feed_to_dataframe(url.strip(), job_tag.strip())
@@ -114,7 +129,7 @@ class FeedReader:
114
  if isinstance(result, tuple):
115
  df, message = result
116
  if df.empty:
117
- return f"Error: {message}", None, "", ""
118
  else:
119
  df = result
120
  message = "Success"
@@ -133,17 +148,39 @@ class FeedReader:
133
  📊 **Feed Processing Results**
134
 
135
  ✅ **Status:** {message}
136
- 📋 **Rows:** {df_processed.shape[0]:,}
137
- 📝 **Columns:** {df_processed.shape[1]}
138
 
139
- 🔍 **Column Names:**
140
- {', '.join(df_processed.columns.tolist())}
141
 
142
- 📈 **Data Types:**
143
- {df_processed.dtypes.to_string()}
144
  """
145
 
146
- return summary, df_processed, self.generate_csv(df_processed), self.get_preview(df_processed)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
 
148
  def filter_by_column(self, column_name, filter_value):
149
  """Filter dataframe by column value"""
@@ -163,32 +200,73 @@ class FeedReader:
163
 
164
  actual_column = matching_columns[0]
165
 
166
- # Filter the dataframe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  if self.df[actual_column].dtype == 'object': # String column
168
- filtered_df = self.df[self.df[actual_column].str.contains(filter_value, na=False, case=False)]
 
169
  else: # Numeric column
170
  try:
171
  filter_val_numeric = float(filter_value)
172
  filtered_df = self.df[self.df[actual_column] == filter_val_numeric]
173
  except ValueError:
174
- filtered_df = self.df[self.df[actual_column].astype(str).str.contains(filter_value, na=False, case=False)]
175
 
176
  if filtered_df.empty:
177
- return f"No records found matching '{filter_value}' in column '{actual_column}'", None, ""
178
 
179
  filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
180
 
 
 
 
181
  summary = f"""
182
  🔍 **Filtered Results**
183
 
184
  📋 **Matching Rows:** {filtered_df.shape[0]:,}
185
- 🎯 **Filter:** {actual_column} contains '{filter_value}'
186
  """
187
 
188
- return summary, filtered_df, self.generate_csv(filtered_df, f"filtered_{filter_value}")
189
 
190
  except Exception as e:
191
- return f"Error filtering data: {str(e)}", None, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  def get_column_stats(self):
194
  """Get statistics for each column"""
@@ -223,46 +301,194 @@ class FeedReader:
223
  except Exception as e:
224
  return f"Error generating statistics: {str(e)}"
225
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  def generate_csv(self, df, filename_prefix="feed"):
227
- """Generate CSV file for download"""
228
  if df is None or df.empty:
229
  return None
230
 
231
- # Create a temporary file
232
- import tempfile
233
- temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, prefix=f'{filename_prefix}_')
234
- df.to_csv(temp_file.name, index=False)
235
- temp_file.close()
236
- return temp_file.name
 
 
 
 
 
 
237
 
238
  def get_preview(self, df, max_rows=10):
239
- """Get a preview of the dataframe"""
240
  if df is None or df.empty:
241
- return "No data to preview"
242
 
243
  # Limit the preview to avoid overwhelming display
244
- preview_df = df.head(max_rows)
245
 
246
  # Truncate long string values for better display
247
- preview_df = preview_df.copy()
248
  for col in preview_df.select_dtypes(include=['object']).columns:
249
- preview_df[col] = preview_df[col].astype(str).apply(lambda x: x[:50] + '...' if len(str(x)) > 50 else x)
 
 
250
 
251
- preview = preview_df.to_string(max_cols=8, max_rows=max_rows, show_dimensions=True)
252
- return f"**Data Preview (First {min(max_rows, len(df))} rows):**\n```\n{preview}\n```"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
253
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
254
  # Initialize the feed reader
255
  feed_reader = FeedReader()
256
 
257
  # Create Gradio interface
258
  def create_gradio_app():
259
  with gr.Blocks(title="Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
260
- gr.Markdown("""
261
- # 📡 Feed Reader & Analyzer
 
 
 
 
 
 
262
 
263
- Load and analyze XML or JSON feeds from URLs. Supports compressed files (.gz) and various data formats.
264
- """)
265
 
 
 
 
266
  with gr.Tab("📥 Load Feed"):
267
  with gr.Row():
268
  with gr.Column():
@@ -278,36 +504,58 @@ def create_gradio_app():
278
  )
279
  load_btn = gr.Button("🔄 Load Feed", variant="primary")
280
 
 
281
  with gr.Column():
282
  summary_output = gr.Markdown(label="Summary")
 
 
 
 
 
 
 
283
 
284
  with gr.Row():
285
- preview_output = gr.Markdown(label="Data Preview")
 
 
 
 
 
 
286
 
287
  with gr.Row():
288
  csv_download = gr.File(label="📥 Download Full Dataset (CSV)", visible=True)
289
 
 
 
 
290
  # Load feed functionality
291
  def process_and_download(url, job_tag):
292
- summary, df_processed, csv_file, preview = feed_reader.process_feed(url, job_tag)
293
- return summary, preview, csv_file
294
 
295
  load_btn.click(
296
  process_and_download,
297
  inputs=[url_input, job_tag_input],
298
- outputs=[summary_output, preview_output, csv_download]
299
  )
300
 
301
  with gr.Tab("🔍 Filter Data"):
302
  with gr.Row():
303
  with gr.Column():
304
- filter_column = gr.Textbox(
305
- label="Column Name",
306
- placeholder="e.g., clientname, title, category"
 
 
307
  )
308
- filter_value = gr.Textbox(
 
309
  label="Filter Value",
310
- placeholder="Value to search for"
 
 
311
  )
312
  filter_btn = gr.Button("🔍 Filter", variant="primary")
313
 
@@ -315,36 +563,288 @@ def create_gradio_app():
315
  filter_summary = gr.Markdown(label="Filter Results")
316
 
317
  with gr.Row():
318
- filtered_csv = gr.File(label="📥 Download Filtered Data (CSV)", visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
319
 
320
  # Filter functionality
321
  def filter_and_download(column_name, filter_value):
322
  summary, df_filtered, csv_file = feed_reader.filter_by_column(column_name, filter_value)
323
- return summary, csv_file
 
 
 
 
 
324
 
325
  filter_btn.click(
326
  filter_and_download,
327
- inputs=[filter_column, filter_value],
328
- outputs=[filter_summary, filtered_csv]
329
  )
330
 
331
  with gr.Tab("📊 Statistics"):
332
- with gr.Column():
333
- stats_btn = gr.Button("📊 Generate Column Statistics", variant="primary")
334
- stats_output = gr.Dataframe(label="Column Statistics")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
 
336
- # Statistics functionality
337
- stats_btn.click(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
338
  feed_reader.get_column_stats,
339
- outputs=[stats_output]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
340
  )
341
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  gr.Markdown("""
343
  ---
344
  ### 📝 Instructions:
345
 
346
  1. **Load Feed**: Enter a URL pointing to an XML or JSON feed and click "Load Feed"
347
- 2. **Filter Data**: Use column names to filter the loaded data
348
  3. **Statistics**: View detailed statistics about each column in your dataset
349
  4. **Download**: CSV files are automatically generated for download
350
 
@@ -356,9 +856,10 @@ def create_gradio_app():
356
  **Features:**
357
  - Automatic format detection
358
  - Data cleaning and validation
359
- - Column-based filtering
360
  - Statistical analysis
361
  - CSV export functionality
 
362
  """)
363
 
364
  return app
 
1
+ import folium
2
+ from folium.plugins import MarkerCluster
3
+ from geopy.geocoders import Nominatim
4
+ from functools import lru_cache
5
  import pandas as pd
6
  import requests
7
  import xml.etree.ElementTree as ET
 
11
  import datetime
12
  import gradio as gr
13
  import os
14
+ import tempfile
15
+ import pytz
16
+
17
+ geolocator = Nominatim(user_agent="feed_reader_app")
18
+
19
+ @lru_cache(maxsize=10000)
20
+ def geocode_cached(query):
21
+ try:
22
+ return geolocator.geocode(query, timeout=10)
23
+ except Exception:
24
+ return None
25
 
26
  class FeedReader:
27
  def __init__(self):
 
121
  def process_feed(self, url, job_tag="job"):
122
  """Main function to process feed and return results"""
123
  if not url.strip():
124
+ return "Please enter a valid URL", None, "", "", []
125
 
126
  # Load the feed
127
  result = self.load_feed_to_dataframe(url.strip(), job_tag.strip())
 
129
  if isinstance(result, tuple):
130
  df, message = result
131
  if df.empty:
132
+ return f"Error: {message}", None, "", "", []
133
  else:
134
  df = result
135
  message = "Success"
 
148
  📊 **Feed Processing Results**
149
 
150
  ✅ **Status:** {message}
 
 
151
 
152
+ 📋 **Rows:** {df_processed.shape[0]:,}
 
153
 
154
+ 📝 **Columns:** {df_processed.shape[1]}
 
155
  """
156
 
157
+ # Create metadata dataframe
158
+ metadata_df = pd.DataFrame({
159
+ 'Column Name': df_processed.columns.tolist(),
160
+ 'Data Type': [str(df_processed[col].dtype) for col in df_processed.columns],
161
+ 'Unique Values': [df_processed[col].nunique() for col in df_processed.columns],
162
+ 'Null Values': [df_processed[col].isnull().sum() for col in df_processed.columns]
163
+ })
164
+
165
+ # Get column choices for filter tab
166
+ column_choices = df_processed.columns.tolist()
167
+
168
+ return summary, df_processed, self.generate_csv(df_processed, "feed"), self.get_preview(df_processed), column_choices, metadata_df
169
+
170
+ def get_column_unique_values(self, column_name):
171
+ """Get unique values for a specific column"""
172
+ if self.df is None:
173
+ return []
174
+
175
+ if column_name not in self.df.columns:
176
+ return []
177
+
178
+ # Get unique values and convert to string, sort them
179
+ unique_values = self.df[column_name].dropna().astype(str).unique()
180
+ unique_values = sorted([str(val) for val in unique_values if str(val) != 'nan'])
181
+
182
+ # Add "All" option at the beginning
183
+ return ["All"] + unique_values
184
 
185
  def filter_by_column(self, column_name, filter_value):
186
  """Filter dataframe by column value"""
 
200
 
201
  actual_column = matching_columns[0]
202
 
203
+ # If "All" is selected, return the entire dataframe
204
+ if filter_value == "All":
205
+ filtered_df = self.df.copy()
206
+ filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
207
+
208
+ # Truncate long columns for display only
209
+ display_df = self.truncate_display_columns(filtered_df.copy())
210
+
211
+ summary = f"""
212
+ 🔍 **Filtered Results**
213
+
214
+ 📋 **Total Rows:** {filtered_df.shape[0]:,}
215
+ 🎯 **Filter:** Showing all records from column '{actual_column}'
216
+ """
217
+
218
+ return summary, display_df, self.generate_csv(filtered_df, f"all_{actual_column}")
219
+
220
+ # Filter the dataframe for specific value
221
  if self.df[actual_column].dtype == 'object': # String column
222
+ # Exact match for dropdown selection
223
+ filtered_df = self.df[self.df[actual_column].astype(str) == str(filter_value)]
224
  else: # Numeric column
225
  try:
226
  filter_val_numeric = float(filter_value)
227
  filtered_df = self.df[self.df[actual_column] == filter_val_numeric]
228
  except ValueError:
229
+ filtered_df = self.df[self.df[actual_column].astype(str) == str(filter_value)]
230
 
231
  if filtered_df.empty:
232
+ return f"No records found matching '{filter_value}' in column '{actual_column}'", pd.DataFrame(), ""
233
 
234
  filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
235
 
236
+ # Truncate long columns for display only
237
+ display_df = self.truncate_display_columns(filtered_df.copy())
238
+
239
  summary = f"""
240
  🔍 **Filtered Results**
241
 
242
  📋 **Matching Rows:** {filtered_df.shape[0]:,}
243
+ 🎯 **Filter:** {actual_column} = '{filter_value}'
244
  """
245
 
246
+ return summary, display_df, self.generate_csv(filtered_df, f"filtered_{filter_value}")
247
 
248
  except Exception as e:
249
+ return f"Error filtering data: {str(e)}", pd.DataFrame(), ""
250
+
251
+ def truncate_display_columns(self, df):
252
+ """Truncate long columns for better display in DataFrames"""
253
+ display_df = df.copy()
254
+
255
+ # Define columns that typically have long content
256
+ long_content_columns = ['url', 'description', 'link', 'content', 'summary', 'text']
257
+
258
+ for col in display_df.select_dtypes(include=['object']).columns:
259
+ # Apply more aggressive truncation to known long columns
260
+ if any(long_col in col.lower() for long_col in long_content_columns):
261
+ display_df[col] = display_df[col].astype(str).apply(
262
+ lambda x: x[:30] + '...' if len(str(x)) > 30 else x
263
+ )
264
+ else:
265
+ # Standard truncation for other text columns
266
+ display_df[col] = display_df[col].astype(str).apply(
267
+ lambda x: x[:50] + '...' if len(str(x)) > 50 else x
268
+ )
269
+ return display_df
270
 
271
  def get_column_stats(self):
272
  """Get statistics for each column"""
 
301
  except Exception as e:
302
  return f"Error generating statistics: {str(e)}"
303
 
304
+ def calcular_ponderados(self, df):
305
+ """Función para calcular medias ponderadas"""
306
+ total_count = df["count"].sum()
307
+ mean_cpa = (df["cpa_goal"] * df["count"]).sum() / total_count if total_count > 0 else 0
308
+ mean_sponsored = (df["sponsored"] * df["count"]).sum() / total_count if total_count > 0 else 0
309
+ min_cpc = (df["sponsored"]).min()
310
+ max_cpc = (df["sponsored"]).max()
311
+ min_cpa = (df["cpa_goal"]).min()
312
+ max_cpa = (df["cpa_goal"]).max()
313
+
314
+ # Obtener la hora actual en PST
315
+ pacific_tz = pytz.timezone("America/Los_Angeles")
316
+ now_pst = datetime.datetime.now(pytz.utc).astimezone(pacific_tz)
317
+
318
+ return pd.Series({
319
+ "total_jobs": int(total_count),
320
+ "mean_cpa_goal": round(mean_cpa,2),
321
+ "mean_cpc": round(mean_sponsored,2),
322
+ "target_cvr": round((mean_sponsored/mean_cpa)*100,2) if mean_cpa > 0 else 0,
323
+ "min_cpc": round(min_cpc,2),
324
+ "max_cpc": round(max_cpc,2),
325
+ "min_cpa": round(min_cpa,2),
326
+ "max_cpa": round(max_cpa,2),
327
+ "last_update": now_pst.strftime("%Y-%m-%d %H:%M:%S %Z")
328
+ })
329
+
330
+ def get_weighted_stats_by_group(self, group_column, reference_col=None, cpa_col=None, cpc_col=None):
331
+ """Get weighted statistics grouped by specified column with flexible column selection"""
332
+ if self.df is None:
333
+ return pd.DataFrame(), "Please load a feed first"
334
+
335
+ # Check if group column exists
336
+ if group_column not in self.df.columns:
337
+ available_columns = [col for col in self.df.columns if col != 'last_update']
338
+ return pd.DataFrame(), f"Column '{group_column}' not found. Available columns: {', '.join(available_columns)}"
339
+
340
+ # Check if selected columns exist
341
+ selected_columns = [col for col in [reference_col, cpa_col, cpc_col] if col is not None]
342
+ missing_columns = [col for col in selected_columns if col not in self.df.columns]
343
+
344
+ if missing_columns:
345
+ available_columns = list(self.df.columns)
346
+ return pd.DataFrame(), f"Missing selected columns: {', '.join(missing_columns)}. Available columns: {', '.join(available_columns)}"
347
+
348
+ try:
349
+ def calculate_group_stats(group_df):
350
+ results = {}
351
+
352
+ # Always calculate total postings
353
+ results["total_postings"] = int(len(group_df))
354
+
355
+ # Calculate unique references if reference column is provided
356
+ if reference_col:
357
+ results["unique_references"] = int(group_df[reference_col].nunique())
358
+
359
+ # Calculate CPA statistics if CPA column is provided
360
+ if cpa_col:
361
+ cpa_series = pd.to_numeric(group_df[cpa_col], errors='coerce')
362
+ results["mean_cpa_goal"] = round(cpa_series.mean(), 2) if not cpa_series.isna().all() else 0
363
+ results["min_cpa"] = round(cpa_series.min(), 2) if not cpa_series.isna().all() else 0
364
+ results["max_cpa"] = round(cpa_series.max(), 2) if not cpa_series.isna().all() else 0
365
+
366
+ # Calculate CPC/Payout statistics if CPC column is provided
367
+ if cpc_col:
368
+ cpc_series = pd.to_numeric(group_df[cpc_col], errors='coerce')
369
+ results["mean_payouts"] = round(cpc_series.mean(), 2) if not cpc_series.isna().all() else 0
370
+ results["min_payouts"] = round(cpc_series.min(), 2) if not cpc_series.isna().all() else 0
371
+ results["max_payouts"] = round(cpc_series.max(), 2) if not cpc_series.isna().all() else 0
372
+
373
+ # Calculate Target CVR if both CPA and CPC columns are provided
374
+ if cpa_col and cpc_col:
375
+ mean_cpa = results.get("mean_cpa_goal", 0)
376
+ mean_payouts = results.get("mean_payouts", 0)
377
+ if mean_cpa > 0 and mean_payouts > 0:
378
+ results["target_cvr"] = round((mean_payouts/mean_cpa)*100, 2)
379
+ else:
380
+ results["target_cvr"] = 0
381
+
382
+ # Get current time in PST
383
+ pacific_tz = pytz.timezone("America/Los_Angeles")
384
+ now_pst = datetime.datetime.now(pytz.utc).astimezone(pacific_tz)
385
+ results["last_update"] = now_pst.strftime("%Y-%m-%d %H:%M:%S %Z")
386
+
387
+ return pd.Series(results)
388
+
389
+ # Group by selected column and apply calculations
390
+ grouped_stats = self.df.groupby(group_column).apply(calculate_group_stats).reset_index()
391
+
392
+ # Sort by most relevant metric
393
+ if "unique_references" in grouped_stats.columns:
394
+ grouped_stats = grouped_stats.sort_values('unique_references', ascending=False)
395
+ else:
396
+ grouped_stats = grouped_stats.sort_values('total_postings', ascending=False)
397
+
398
+ return grouped_stats, "Success"
399
+
400
+ except Exception as e:
401
+ return pd.DataFrame(), f"Error calculating weighted statistics: {str(e)}"
402
+
403
  def generate_csv(self, df, filename_prefix="feed"):
404
+ """Generate CSV file for download with fixed filename"""
405
  if df is None or df.empty:
406
  return None
407
 
408
+ # Create a temporary file with the exact name we want
409
+ temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, prefix='')
410
+ temp_file.close() # Close to get the filename
411
+
412
+ # Rename the file to what we want
413
+ import shutil
414
+ final_filename = temp_file.name.replace(os.path.basename(temp_file.name), f"{filename_prefix}.csv")
415
+
416
+ # Save CSV with the desired name
417
+ df.to_csv(final_filename, index=False)
418
+
419
+ return final_filename
420
 
421
  def get_preview(self, df, max_rows=10):
422
+ """Get a preview of the dataframe as a Gradio DataFrame component with truncated long columns"""
423
  if df is None or df.empty:
424
+ return None
425
 
426
  # Limit the preview to avoid overwhelming display
427
+ preview_df = df.head(max_rows).copy()
428
 
429
  # Truncate long string values for better display
 
430
  for col in preview_df.select_dtypes(include=['object']).columns:
431
+ preview_df[col] = preview_df[col].astype(str).apply(
432
+ lambda x: x[:50] + '...' if len(str(x)) > 50 else x
433
+ )
434
 
435
+ return preview_df
436
+
437
+ def generate_map(self, city_col, state_col=None, country_col=None, max_points=500):
438
+ if self.df is None or self.df.empty:
439
+ return None, "⚠️ Please load a feed first"
440
+
441
+ if city_col not in self.df.columns:
442
+ return None, f"⚠️ Column '{city_col}' not found in dataset"
443
+
444
+ m = folium.Map(location=[20, 0], zoom_start=2)
445
+ marker_cluster = MarkerCluster().add_to(m)
446
+
447
+ count = 0
448
+ for _, row in self.df.iterrows():
449
+ if count >= max_points:
450
+ break
451
 
452
+ city = str(row[city_col]) if city_col else ""
453
+ state = str(row[state_col]) if state_col and state_col in self.df.columns else ""
454
+ country = str(row[country_col]) if country_col and country_col in self.df.columns else ""
455
+
456
+ query = ", ".join([p for p in [city, state, country] if p])
457
+ if not query.strip():
458
+ continue
459
+
460
+ location = geocode_cached(query)
461
+ if location:
462
+ folium.Marker(
463
+ location=[location.latitude, location.longitude],
464
+ popup=query
465
+ ).add_to(marker_cluster)
466
+ count += 1
467
+
468
+ return m._repr_html_(), f"✅ Mapped {count} locations"
469
+
470
+
471
  # Initialize the feed reader
472
  feed_reader = FeedReader()
473
 
474
  # Create Gradio interface
475
  def create_gradio_app():
476
  with gr.Blocks(title="Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
477
+ # Header with theme toggle
478
+ with gr.Row():
479
+ with gr.Column(scale=4):
480
+ gr.Markdown("""
481
+ # 📡 Feed Reader & Analyzer
482
+
483
+ Load and analyze XML or JSON feeds from URLs. Supports compressed files (.gz) and various data formats.
484
+ """)
485
 
486
+ # Theme state
487
+ is_dark_theme = gr.State(False)
488
 
489
+ # CSS output for theme switching
490
+ theme_css = gr.HTML()
491
+
492
  with gr.Tab("📥 Load Feed"):
493
  with gr.Row():
494
  with gr.Column():
 
504
  )
505
  load_btn = gr.Button("🔄 Load Feed", variant="primary")
506
 
507
+ with gr.Row():
508
  with gr.Column():
509
  summary_output = gr.Markdown(label="Summary")
510
+ with gr.Column():
511
+ metadata_output = gr.Dataframe(
512
+ label="📊 Columns Metadata",
513
+ visible=True,
514
+ interactive=False,
515
+ wrap=False
516
+ )
517
 
518
  with gr.Row():
519
+ preview_dataframe = gr.Dataframe(
520
+ label="Data Preview",
521
+ visible=True,
522
+ interactive=False,
523
+ wrap=False, # Keep rows small
524
+ row_count=(1, "dynamic") # Dynamic row configuration
525
+ )
526
 
527
  with gr.Row():
528
  csv_download = gr.File(label="📥 Download Full Dataset (CSV)", visible=True)
529
 
530
+ # Variable para almacenar las opciones de columnas
531
+ column_choices_state = gr.State([])
532
+
533
  # Load feed functionality
534
  def process_and_download(url, job_tag):
535
+ summary, df_processed, csv_file, preview_df, column_choices, metadata_df = feed_reader.process_feed(url, job_tag)
536
+ return summary, metadata_df, preview_df, csv_file, column_choices
537
 
538
  load_btn.click(
539
  process_and_download,
540
  inputs=[url_input, job_tag_input],
541
+ outputs=[summary_output, metadata_output, preview_dataframe, csv_download, column_choices_state]
542
  )
543
 
544
  with gr.Tab("🔍 Filter Data"):
545
  with gr.Row():
546
  with gr.Column():
547
+ # Botones de columnas (inicialmente vacío)
548
+ columns_radio = gr.Radio(
549
+ label="Select Column",
550
+ choices=[],
551
+ value=None
552
  )
553
+ # Dropdown para los valores de filtro
554
+ filter_value_dropdown = gr.Dropdown(
555
  label="Filter Value",
556
+ choices=[],
557
+ value=None,
558
+ interactive=True
559
  )
560
  filter_btn = gr.Button("🔍 Filter", variant="primary")
561
 
 
563
  filter_summary = gr.Markdown(label="Filter Results")
564
 
565
  with gr.Row():
566
+ filtered_dataframe = gr.Dataframe(
567
+ label="Filtered Data",
568
+ visible=True,
569
+ interactive=False,
570
+ wrap=False, # Disable text wrapping to keep rows small
571
+ row_count=(1, "dynamic") # Allow dynamic rows
572
+ )
573
+
574
+ with gr.Row():
575
+ filtered_csv = gr.File(label="📥 Download Filtered Data (CSV)", visible=True)
576
+
577
+ # Función para actualizar las opciones de columnas
578
+ def update_column_choices(column_choices):
579
+ return gr.Radio(choices=column_choices, value=None if not column_choices else column_choices[0])
580
+
581
+ # Función para actualizar los valores del dropdown cuando se selecciona una columna
582
+ def update_filter_values(selected_column):
583
+ if not selected_column or feed_reader.df is None:
584
+ return gr.Dropdown(choices=[], value=None)
585
+
586
+ unique_values = feed_reader.get_column_unique_values(selected_column)
587
+ return gr.Dropdown(
588
+ choices=unique_values,
589
+ value="All" if unique_values else None
590
+ )
591
+
592
+ # Actualizar las opciones cuando se carga un feed
593
+ column_choices_state.change(
594
+ update_column_choices,
595
+ inputs=[column_choices_state],
596
+ outputs=[columns_radio]
597
+ )
598
+
599
+ # Actualizar los valores del dropdown cuando se selecciona una columna
600
+ columns_radio.change(
601
+ update_filter_values,
602
+ inputs=[columns_radio],
603
+ outputs=[filter_value_dropdown]
604
+ )
605
 
606
  # Filter functionality
607
  def filter_and_download(column_name, filter_value):
608
  summary, df_filtered, csv_file = feed_reader.filter_by_column(column_name, filter_value)
609
+ if df_filtered is not None:
610
+ # Show both summary and dataframe
611
+ return summary, df_filtered, csv_file
612
+ else:
613
+ # Show error and empty dataframe
614
+ return summary, pd.DataFrame(), None
615
 
616
  filter_btn.click(
617
  filter_and_download,
618
+ inputs=[columns_radio, filter_value_dropdown],
619
+ outputs=[filter_summary, filtered_dataframe, filtered_csv]
620
  )
621
 
622
  with gr.Tab("📊 Statistics"):
623
+ with gr.Row():
624
+ with gr.Column():
625
+ gr.Markdown("### 📋 Basic Column Statistics")
626
+ basic_stats_btn = gr.Button("📊 Generate Column Statistics", variant="primary")
627
+ basic_stats_output = gr.Dataframe(label="Column Statistics")
628
+
629
+ with gr.Column():
630
+ gr.Markdown("### 🎯 Weighted Statistics by Group")
631
+
632
+ # Group selection for weighted stats
633
+ stats_group_column = gr.Radio(
634
+ label="Group By Column (company, client, etc.)",
635
+ choices=[],
636
+ value=None
637
+ )
638
+
639
+ # Column mapping for weighted calculations
640
+ with gr.Row():
641
+ reference_column = gr.Dropdown(
642
+ label="Reference ID Column",
643
+ choices=[],
644
+ value=None
645
+ )
646
+ cpa_column = gr.Dropdown(
647
+ label="CPA Goal Column",
648
+ choices=[],
649
+ value=None
650
+ )
651
+
652
+ with gr.Row():
653
+ cpc_column = gr.Dropdown(
654
+ label="Payouts: CPC/CPA Columns",
655
+ choices=[],
656
+ value=None
657
+ )
658
+
659
+ weighted_stats_btn = gr.Button("🧮 Calculate Weighted Statistics", variant="secondary")
660
+ weighted_stats_summary = gr.Markdown(label="Weighted Stats Summary")
661
+
662
+ with gr.Row():
663
+ weighted_stats_output = gr.Dataframe(
664
+ label="📈 Weighted Statistics by Group",
665
+ visible=True,
666
+ interactive=False,
667
+ wrap=False
668
+ )
669
+
670
+ with gr.Row():
671
+ weighted_stats_csv = gr.File(label="📥 Download Weighted Statistics (CSV)", visible=True)
672
 
673
+ # Update all column choices when data is loaded
674
+ def update_all_stats_choices(column_choices):
675
+ # Filter out timestamp columns for grouping
676
+ exclude_columns = ['last_update']
677
+ grouping_choices = [col for col in column_choices if col not in exclude_columns]
678
+
679
+ # All columns available for metric selection with "None" option
680
+ metric_choices = ["None"] + column_choices
681
+
682
+ # Try to auto-detect common column names
683
+ reference_default = "None"
684
+ cpa_default = "None"
685
+ cpc_default = "None"
686
+
687
+ for col in column_choices:
688
+ col_lower = col.lower()
689
+ if 'reference' in col_lower or 'req' in col_lower or col_lower == 'referencenumber':
690
+ reference_default = col
691
+ elif 'cpa' in col_lower or 'goal' in col_lower:
692
+ cpa_default = col
693
+ elif 'cpc' in col_lower or 'sponsored' in col_lower or 'cost' in col_lower or 'payout' in col_lower:
694
+ cpc_default = col
695
+
696
+ return (
697
+ gr.Radio(choices=grouping_choices, value=grouping_choices[0] if grouping_choices else None),
698
+ gr.Dropdown(choices=metric_choices, value=reference_default),
699
+ gr.Dropdown(choices=metric_choices, value=cpa_default),
700
+ gr.Dropdown(choices=metric_choices, value=cpc_default)
701
+ )
702
+
703
+ # Update all dropdown options when feed is loaded
704
+ column_choices_state.change(
705
+ update_all_stats_choices,
706
+ inputs=[column_choices_state],
707
+ outputs=[stats_group_column, reference_column, cpa_column, cpc_column]
708
+ )
709
+
710
+ # Basic statistics functionality
711
+ basic_stats_btn.click(
712
  feed_reader.get_column_stats,
713
+ outputs=[basic_stats_output]
714
+ )
715
+
716
+ # Weighted statistics functionality
717
+ def calculate_weighted_stats(group_column, reference_col, cpa_col, cpc_col):
718
+ if not group_column:
719
+ return "Please select a grouping column", None, None
720
+
721
+ # Handle "None" selections
722
+ reference_col = None if reference_col == "None" else reference_col
723
+ cpa_col = None if cpa_col == "None" else cpa_col
724
+ cpc_col = None if cpc_col == "None" else cpc_col
725
+
726
+ # At least one of the metric columns should be selected
727
+ if not reference_col and not cpa_col and not cpc_col:
728
+ return "Please select at least one metric column (Reference ID, CPA Goal, or Payouts)", None, None
729
+
730
+ weighted_df, message = feed_reader.get_weighted_stats_by_group(group_column, reference_col, cpa_col, cpc_col)
731
+
732
+ if not weighted_df.empty:
733
+ metrics_used = []
734
+ if reference_col: metrics_used.append(f"Reference: {reference_col}")
735
+ if cpa_col: metrics_used.append(f"CPA: {cpa_col}")
736
+ if cpc_col: metrics_used.append(f"Payouts: {cpc_col}")
737
+
738
+ summary = f"""
739
+ 🎯 **Weighted Statistics Results**
740
+
741
+ ✅ **Status:** {message}
742
+ 📊 **Groups:** {len(weighted_df)}
743
+ 🔢 **Grouped by:** {group_column}
744
+ 📈 **Metrics Used:** {' | '.join(metrics_used)}
745
+
746
+ 📊 **Available Metrics:**
747
+ • **Unique References**: Count of unique IDs per group (if Reference ID selected)
748
+ • **Total Postings**: Total rows/postings per group
749
+ • **Mean CPA/Payouts**: Average values across all postings (if columns selected)
750
+ • **Target CVR**: (Mean Payouts / Mean CPA) × 100 (if both selected)
751
+ • **Min/Max Ranges**: Minimum and maximum values per group
752
+
753
+ 💡 **Note:** Only metrics with selected columns will be calculated and displayed.
754
+ """
755
+ csv_file = feed_reader.generate_csv(weighted_df, f"weighted_stats_{group_column}")
756
+ return summary, weighted_df, csv_file
757
+ else:
758
+ return f"❌ **Error:** {message}", None, None
759
+
760
+ weighted_stats_btn.click(
761
+ calculate_weighted_stats,
762
+ inputs=[stats_group_column, reference_column, cpa_column, cpc_column],
763
+ outputs=[weighted_stats_summary, weighted_stats_output, weighted_stats_csv]
764
  )
765
 
766
+ with gr.Tab("🌍 Map"):
767
+ with gr.Row():
768
+ with gr.Column():
769
+ gr.Markdown("### Select Columns for Mapping")
770
+
771
+ city_col = gr.Dropdown(label="City Column", choices=[], value=None)
772
+ state_col = gr.Dropdown(label="State Column (optional)", choices=[], value=None)
773
+ country_col = gr.Dropdown(label="Country Column (optional)", choices=[], value=None)
774
+
775
+ map_btn = gr.Button("🗺️ Generate Map", variant="primary")
776
+
777
+ with gr.Column():
778
+ map_status = gr.Markdown()
779
+ map_output = gr.HTML()
780
+
781
# Refresh the three column dropdowns whenever a feed is loaded.
# This wiring is registered exactly once; registering the same listeners a
# second time makes every load and every button click run its handler twice.
def update_map_choices(column_choices):
    """Return refreshed city/state/country dropdowns for the given columns."""
    column_choices = list(column_choices or [])
    # State and country are optional, so they offer an explicit "None" entry
    optional_choices = ["None"] + column_choices
    return (
        gr.Dropdown(choices=column_choices, value=None),
        gr.Dropdown(choices=optional_choices, value="None"),
        gr.Dropdown(choices=optional_choices, value="None")
    )

column_choices_state.change(
    update_map_choices,
    inputs=[column_choices_state],
    outputs=[city_col, state_col, country_col]
)

# Generate the map through feed_reader
def generate_map_handler(city_col, state_col, country_col):
    """Translate the dropdown selections and return (status message, map HTML)."""
    if not city_col:
        return "⚠️ Please select a city column", None
    state_col = None if state_col == "None" else state_col
    country_col = None if country_col == "None" else country_col
    map_html, msg = feed_reader.generate_map(city_col, state_col, country_col)
    return msg, map_html

map_btn.click(
    generate_map_handler,
    inputs=[city_col, state_col, country_col],
    outputs=[map_status, map_output]
)
841
+
842
  gr.Markdown("""
843
  ---
844
  ### 📝 Instructions:
845
 
846
  1. **Load Feed**: Enter a URL pointing to an XML or JSON feed and click "Load Feed"
847
+ 2. **Filter Data**: Select a column from the radio buttons and enter a filter value
848
  3. **Statistics**: View detailed statistics about each column in your dataset
849
  4. **Download**: CSV files are automatically generated for download
850
 
 
856
  **Features:**
857
  - Automatic format detection
858
  - Data cleaning and validation
859
+ - Dynamic column-based filtering with dropdown values
860
  - Statistical analysis
861
  - CSV export functionality
862
+ - Resizable dataframe columns (drag column borders to resize)
863
  """)
864
 
865
  return app