rogergs94 committed on
Commit
7dabfd5
·
verified ·
1 Parent(s): d3a234f

feed_map updated

Browse files

Added the map tool plus the different filter options in the Filter Data tab

Files changed (1) hide show
  1. app.py +598 -394
app.py CHANGED
@@ -13,6 +13,7 @@ import gradio as gr
13
  import os
14
  import tempfile
15
  import pytz
 
16
 
17
  geolocator = Nominatim(user_agent="feed_reader_app")
18
 
@@ -44,13 +45,6 @@ class FeedReader:
44
  def load_feed_to_dataframe(self, url, job_tag="job"):
45
  """
46
  Load an XML feed (.xml or .xml.gz) or JSON from a URL and convert to DataFrame.
47
-
48
- Args:
49
- url (str): URL of the feed
50
- job_tag (str): Name of the XML tag representing each job (only for XML feeds)
51
-
52
- Returns:
53
- pd.DataFrame: DataFrame containing the feed data
54
  """
55
  try:
56
  response = requests.get(url, timeout=30)
@@ -71,10 +65,8 @@ class FeedReader:
71
  elif isinstance(data, dict) and "jobs" in data:
72
  df = pd.DataFrame(data["jobs"])
73
  else:
74
- # Try to convert any other dict structure to DataFrame
75
  df = pd.DataFrame([data] if not isinstance(data, list) else data)
76
 
77
- # Truncate and clean
78
  df = df.applymap(lambda x: self.truncate(x) if isinstance(x, str) else x)
79
  df = self.clean_invalid_numbers(df)
80
  return df
@@ -90,7 +82,6 @@ class FeedReader:
90
  items = root.findall(f".//{job_tag}")
91
 
92
  if not items:
93
- # Try common alternative tag names
94
  common_tags = ["item", "entry", "record", "row"]
95
  for tag in common_tags:
96
  items = root.findall(f".//{tag}")
@@ -98,7 +89,7 @@ class FeedReader:
98
  break
99
 
100
  if not items:
101
- return pd.DataFrame(), f"No <{job_tag}> elements found in the XML. Tried common alternatives too."
102
 
103
  jobs_data = []
104
  for job in items:
@@ -109,21 +100,14 @@ class FeedReader:
109
  df = self.clean_invalid_numbers(df)
110
  return df, "Success"
111
 
112
- except requests.exceptions.RequestException as e:
113
- return pd.DataFrame(), f"Request error: {str(e)}"
114
- except ET.ParseError as e:
115
- return pd.DataFrame(), f"XML parsing error: {str(e)}"
116
- except ValueError as e:
117
- return pd.DataFrame(), f"JSON parsing error: {str(e)}"
118
  except Exception as e:
119
- return pd.DataFrame(), f"Unexpected error: {str(e)}"
120
 
121
  def process_feed(self, url, job_tag="job"):
122
  """Main function to process feed and return results"""
123
  if not url.strip():
124
  return "Please enter a valid URL", None, "", "", []
125
 
126
- # Load the feed
127
  result = self.load_feed_to_dataframe(url.strip(), job_tag.strip())
128
 
129
  if isinstance(result, tuple):
@@ -134,27 +118,19 @@ class FeedReader:
134
  df = result
135
  message = "Success"
136
 
137
- # Store the dataframe
138
  self.df = df
139
-
140
- # Add timestamp
141
  df['last_update'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 
 
142
 
143
- # Fill NaN values with 0 (with future-proof pandas handling)
144
- df_processed = df.fillna(0).infer_objects(copy=False)
145
-
146
- # Generate summary
147
  summary = f"""
148
  πŸ“Š **Feed Processing Results**
149
 
150
  βœ… **Status:** {message}
151
-
152
  πŸ“‹ **Rows:** {df_processed.shape[0]:,}
153
-
154
  πŸ“ **Columns:** {df_processed.shape[1]}
155
  """
156
 
157
- # Create metadata dataframe
158
  metadata_df = pd.DataFrame({
159
  'Column Name': df_processed.columns.tolist(),
160
  'Data Type': [str(df_processed[col].dtype) for col in df_processed.columns],
@@ -162,271 +138,315 @@ class FeedReader:
162
  'Null Values': [df_processed[col].isnull().sum() for col in df_processed.columns]
163
  })
164
 
165
- # Get column choices for filter tab
166
  column_choices = df_processed.columns.tolist()
167
 
168
  return summary, df_processed, self.generate_csv(df_processed, "feed"), self.get_preview(df_processed), column_choices, metadata_df
169
 
170
  def get_column_unique_values(self, column_name):
171
  """Get unique values for a specific column"""
172
- if self.df is None:
173
  return []
174
 
175
- if column_name not in self.df.columns:
176
- return []
177
-
178
- # Get unique values and convert to string, sort them
179
  unique_values = self.df[column_name].dropna().astype(str).unique()
180
  unique_values = sorted([str(val) for val in unique_values if str(val) != 'nan'])
181
-
182
- # Add "All" option at the beginning
183
  return ["All"] + unique_values
184
 
185
- def filter_by_column(self, column_name, filter_value):
186
- """Filter dataframe by column value"""
187
  if self.df is None:
188
- return "Please load a feed first", None, ""
189
 
190
- if not column_name or not filter_value:
191
- return "Please specify both column name and filter value", None, ""
192
 
193
- try:
194
- # Check if column exists (case insensitive)
195
- available_columns = self.df.columns.tolist()
196
- matching_columns = [col for col in available_columns if col.lower() == column_name.lower()]
197
-
198
- if not matching_columns:
199
- return f"Column '{column_name}' not found. Available columns: {', '.join(available_columns)}", None, ""
200
-
201
- actual_column = matching_columns[0]
202
-
203
- # If "All" is selected, return the entire dataframe
204
- if filter_value == "All":
205
- filtered_df = self.df.copy()
206
- filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
207
-
208
- # Truncate long columns for display only
209
- display_df = self.truncate_display_columns(filtered_df.copy())
210
-
211
- summary = f"""
212
- πŸ” **Filtered Results**
213
-
214
  πŸ“‹ **Total Rows:** {filtered_df.shape[0]:,}
215
- 🎯 **Filter:** Showing all records from column '{actual_column}'
216
- """
 
 
 
 
 
 
 
217
 
218
- return summary, display_df, self.generate_csv(filtered_df, f"all_{actual_column}")
 
219
 
220
- # Filter the dataframe for specific value
221
- if self.df[actual_column].dtype == 'object': # String column
222
- # Exact match for dropdown selection
223
- filtered_df = self.df[self.df[actual_column].astype(str) == str(filter_value)]
224
- else: # Numeric column
225
  try:
226
- filter_val_numeric = float(filter_value)
227
- filtered_df = self.df[self.df[actual_column] == filter_val_numeric]
228
  except ValueError:
229
- filtered_df = self.df[self.df[actual_column].astype(str) == str(filter_value)]
230
-
231
- if filtered_df.empty:
232
- return f"No records found matching '{filter_value}' in column '{actual_column}'", pd.DataFrame(), ""
233
-
234
- filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
235
-
236
- # Truncate long columns for display only
237
- display_df = self.truncate_display_columns(filtered_df.copy())
238
 
239
- summary = f"""
240
- πŸ” **Filtered Results**
 
 
 
 
 
 
 
 
 
 
 
 
 
241
 
242
  πŸ“‹ **Matching Rows:** {filtered_df.shape[0]:,}
243
- 🎯 **Filter:** {actual_column} = '{filter_value}'
244
- """
245
-
246
- return summary, display_df, self.generate_csv(filtered_df, f"filtered_{filter_value}")
247
 
248
- except Exception as e:
249
- return f"Error filtering data: {str(e)}", pd.DataFrame(), ""
 
250
 
251
  def truncate_display_columns(self, df):
252
- """Truncate long columns for better display in DataFrames"""
253
  display_df = df.copy()
254
-
255
- # Define columns that typically have long content
256
  long_content_columns = ['url', 'description', 'link', 'content', 'summary', 'text']
257
 
258
  for col in display_df.select_dtypes(include=['object']).columns:
259
- # Apply more aggressive truncation to known long columns
260
  if any(long_col in col.lower() for long_col in long_content_columns):
261
  display_df[col] = display_df[col].astype(str).apply(
262
  lambda x: x[:30] + '...' if len(str(x)) > 30 else x
263
  )
264
  else:
265
- # Standard truncation for other text columns
266
  display_df[col] = display_df[col].astype(str).apply(
267
  lambda x: x[:50] + '...' if len(str(x)) > 50 else x
268
  )
269
  return display_df
270
 
271
- def get_column_stats(self):
272
- """Get statistics for each column"""
273
- if self.df is None:
274
- return "Please load a feed first"
 
275
 
276
- try:
277
- stats = []
278
- for column in self.df.columns:
279
- unique_values = self.df[column].nunique()
280
- null_count = self.df[column].isnull().sum()
281
- total_count = len(self.df)
282
-
283
- # Get top 5 most common values
284
- if self.df[column].dtype == 'object':
285
- top_values = self.df[column].value_counts().head(5)
286
- top_values_str = ", ".join([f"{val} ({count})" for val, count in top_values.items()])
287
- else:
288
- top_values_str = f"Min: {self.df[column].min()}, Max: {self.df[column].max()}"
 
 
 
 
 
 
 
 
 
 
 
 
 
289
 
290
- stats.append({
291
- 'Column': column,
292
- 'Unique Values': unique_values,
293
- 'Null Values': null_count,
294
- 'Data Type': str(self.df[column].dtype),
295
- 'Top Values/Range': top_values_str
296
- })
297
 
298
- stats_df = pd.DataFrame(stats)
299
- return stats_df
 
 
 
 
 
300
 
301
- except Exception as e:
302
- return f"Error generating statistics: {str(e)}"
303
-
304
- def calcular_ponderados(self, df):
305
- """FunciΓ³n para calcular medias ponderadas"""
306
- total_count = df["count"].sum()
307
- mean_cpa = (df["cpa_goal"] * df["count"]).sum() / total_count if total_count > 0 else 0
308
- mean_sponsored = (df["sponsored"] * df["count"]).sum() / total_count if total_count > 0 else 0
309
- min_cpc = (df["sponsored"]).min()
310
- max_cpc = (df["sponsored"]).max()
311
- min_cpa = (df["cpa_goal"]).min()
312
- max_cpa = (df["cpa_goal"]).max()
313
-
314
- # Obtener la hora actual en PST
315
- pacific_tz = pytz.timezone("America/Los_Angeles")
316
- now_pst = datetime.datetime.now(pytz.utc).astimezone(pacific_tz)
317
-
318
- return pd.Series({
319
- "total_jobs": int(total_count),
320
- "mean_cpa_goal": round(mean_cpa,2),
321
- "mean_cpc": round(mean_sponsored,2),
322
- "target_cvr": round((mean_sponsored/mean_cpa)*100,2) if mean_cpa > 0 else 0,
323
- "min_cpc": round(min_cpc,2),
324
- "max_cpc": round(max_cpc,2),
325
- "min_cpa": round(min_cpa,2),
326
- "max_cpa": round(max_cpa,2),
327
- "last_update": now_pst.strftime("%Y-%m-%d %H:%M:%S %Z")
328
- })
329
-
330
- def get_weighted_stats_by_group(self, group_column, reference_col=None, cpa_col=None, cpc_col=None):
331
- """Get weighted statistics grouped by specified column with flexible column selection"""
332
- if self.df is None:
333
- return pd.DataFrame(), "Please load a feed first"
334
 
335
- # Check if group column exists
336
- if group_column not in self.df.columns:
337
- available_columns = [col for col in self.df.columns if col != 'last_update']
338
- return pd.DataFrame(), f"Column '{group_column}' not found. Available columns: {', '.join(available_columns)}"
339
 
340
- # Check if selected columns exist
341
- selected_columns = [col for col in [reference_col, cpa_col, cpc_col] if col is not None]
342
- missing_columns = [col for col in selected_columns if col not in self.df.columns]
343
 
344
- if missing_columns:
345
- available_columns = list(self.df.columns)
346
- return pd.DataFrame(), f"Missing selected columns: {', '.join(missing_columns)}. Available columns: {', '.join(available_columns)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
- try:
349
- def calculate_group_stats(group_df):
350
- results = {}
351
-
352
- # Always calculate total postings
353
- results["total_postings"] = int(len(group_df))
354
-
355
- # Calculate unique references if reference column is provided
356
- if reference_col:
357
- results["unique_references"] = int(group_df[reference_col].nunique())
358
-
359
- # Calculate CPA statistics if CPA column is provided
360
- if cpa_col:
361
- cpa_series = pd.to_numeric(group_df[cpa_col], errors='coerce')
362
- results["mean_cpa_goal"] = round(cpa_series.mean(), 2) if not cpa_series.isna().all() else 0
363
- results["min_cpa"] = round(cpa_series.min(), 2) if not cpa_series.isna().all() else 0
364
- results["max_cpa"] = round(cpa_series.max(), 2) if not cpa_series.isna().all() else 0
365
-
366
- # Calculate CPC/Payout statistics if CPC column is provided
367
- if cpc_col:
368
- cpc_series = pd.to_numeric(group_df[cpc_col], errors='coerce')
369
- results["mean_payouts"] = round(cpc_series.mean(), 2) if not cpc_series.isna().all() else 0
370
- results["min_payouts"] = round(cpc_series.min(), 2) if not cpc_series.isna().all() else 0
371
- results["max_payouts"] = round(cpc_series.max(), 2) if not cpc_series.isna().all() else 0
372
-
373
- # Calculate Target CVR if both CPA and CPC columns are provided
374
- if cpa_col and cpc_col:
375
- mean_cpa = results.get("mean_cpa_goal", 0)
376
- mean_payouts = results.get("mean_payouts", 0)
377
- if mean_cpa > 0 and mean_payouts > 0:
378
- results["target_cvr"] = round((mean_payouts/mean_cpa)*100, 2)
379
- else:
380
- results["target_cvr"] = 0
381
-
382
- # Get current time in PST
383
- pacific_tz = pytz.timezone("America/Los_Angeles")
384
- now_pst = datetime.datetime.now(pytz.utc).astimezone(pacific_tz)
385
- results["last_update"] = now_pst.strftime("%Y-%m-%d %H:%M:%S %Z")
386
-
387
- return pd.Series(results)
388
 
389
- # Group by selected column and apply calculations
390
- grouped_stats = self.df.groupby(group_column).apply(calculate_group_stats).reset_index()
 
391
 
392
- # Sort by most relevant metric
393
- if "unique_references" in grouped_stats.columns:
394
- grouped_stats = grouped_stats.sort_values('unique_references', ascending=False)
395
- else:
396
- grouped_stats = grouped_stats.sort_values('total_postings', ascending=False)
397
 
398
- return grouped_stats, "Success"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
 
400
- except Exception as e:
401
- return pd.DataFrame(), f"Error calculating weighted statistics: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
  def generate_csv(self, df, filename_prefix="feed"):
404
- """Generate CSV file for download with fixed filename"""
405
  if df is None or df.empty:
406
  return None
407
 
408
- # Create a temporary file with the exact name we want
409
  temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, prefix='')
410
- temp_file.close() # Close to get the filename
411
 
412
- # Rename the file to what we want
413
- import shutil
414
  final_filename = temp_file.name.replace(os.path.basename(temp_file.name), f"{filename_prefix}.csv")
415
-
416
- # Save CSV with the desired name
417
  df.to_csv(final_filename, index=False)
418
 
419
  return final_filename
420
 
421
  def get_preview(self, df, max_rows=10):
422
- """Get a preview of the dataframe as a Gradio DataFrame component with truncated long columns"""
423
  if df is None or df.empty:
424
  return None
425
 
426
- # Limit the preview to avoid overwhelming display
427
  preview_df = df.head(max_rows).copy()
428
 
429
- # Truncate long string values for better display
430
  for col in preview_df.select_dtypes(include=['object']).columns:
431
  preview_df[col] = preview_df[col].astype(str).apply(
432
  lambda x: x[:50] + '...' if len(str(x)) > 50 else x
@@ -434,61 +454,19 @@ class FeedReader:
434
 
435
  return preview_df
436
 
437
- def generate_map(self, city_col, state_col=None, country_col=None, max_points=500):
438
- if self.df is None or self.df.empty:
439
- return None, "⚠️ Please load a feed first"
440
-
441
- if city_col not in self.df.columns:
442
- return None, f"⚠️ Column '{city_col}' not found in dataset"
443
-
444
- m = folium.Map(location=[20, 0], zoom_start=2)
445
- marker_cluster = MarkerCluster().add_to(m)
446
-
447
- count = 0
448
- for _, row in self.df.iterrows():
449
- if count >= max_points:
450
- break
451
-
452
- city = str(row[city_col]) if city_col else ""
453
- state = str(row[state_col]) if state_col and state_col in self.df.columns else ""
454
- country = str(row[country_col]) if country_col and country_col in self.df.columns else ""
455
-
456
- query = ", ".join([p for p in [city, state, country] if p])
457
- if not query.strip():
458
- continue
459
-
460
- location = geocode_cached(query)
461
- if location:
462
- folium.Marker(
463
- location=[location.latitude, location.longitude],
464
- popup=query
465
- ).add_to(marker_cluster)
466
- count += 1
467
-
468
- return m._repr_html_(), f"βœ… Mapped {count} locations"
469
-
470
-
471
  # Initialize the feed reader
472
  feed_reader = FeedReader()
473
 
474
- # Create Gradio interface
475
- def create_gradio_app():
476
- with gr.Blocks(title="Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
477
- # Header with theme toggle
478
  with gr.Row():
479
  with gr.Column(scale=4):
480
  gr.Markdown("""
481
- # πŸ“‘ Feed Reader & Analyzer
482
 
483
- Load and analyze XML or JSON feeds from URLs. Supports compressed files (.gz) and various data formats.
484
  """)
485
 
486
- # Theme state
487
- is_dark_theme = gr.State(False)
488
-
489
- # CSS output for theme switching
490
- theme_css = gr.HTML()
491
-
492
  with gr.Tab("πŸ“₯ Load Feed"):
493
  with gr.Row():
494
  with gr.Column():
@@ -520,17 +498,15 @@ def create_gradio_app():
520
  label="Data Preview",
521
  visible=True,
522
  interactive=False,
523
- wrap=False, # Keep rows small
524
- row_count=(1, "dynamic") # Dynamic row configuration
525
  )
526
 
527
  with gr.Row():
528
  csv_download = gr.File(label="πŸ“₯ Download Full Dataset (CSV)", visible=True)
529
 
530
- # Variable para almacenar las opciones de columnas
531
  column_choices_state = gr.State([])
532
 
533
- # Load feed functionality
534
  def process_and_download(url, job_tag):
535
  summary, df_processed, csv_file, preview_df, column_choices, metadata_df = feed_reader.process_feed(url, job_tag)
536
  return summary, metadata_df, preview_df, csv_file, column_choices
@@ -541,82 +517,152 @@ def create_gradio_app():
541
  outputs=[summary_output, metadata_output, preview_dataframe, csv_download, column_choices_state]
542
  )
543
 
544
- with gr.Tab("πŸ” Filter Data"):
 
 
 
545
  with gr.Row():
546
  with gr.Column():
547
- # Botones de columnas (inicialmente vacΓ­o)
548
- columns_radio = gr.Dropdown(
549
- label="Select Column",
550
- choices=[],
551
- value=None
552
- )
553
- # Dropdown para los valores de filtro
554
- filter_value_dropdown = gr.Dropdown(
555
- label="Filter Value",
556
- choices=[],
557
- value=None,
558
- interactive=True
559
- )
560
- filter_btn = gr.Button("πŸ” Filter", variant="primary")
 
 
 
 
 
 
 
 
 
561
 
562
  with gr.Column():
563
- filter_summary = gr.Markdown(label="Filter Results")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
  with gr.Row():
566
- filtered_dataframe = gr.Dataframe(
 
 
 
567
  label="Filtered Data",
568
  visible=True,
569
  interactive=False,
570
- wrap=False, # Disable text wrapping to keep rows small
571
- row_count=(1, "dynamic") # Allow dynamic rows
572
  )
573
 
574
  with gr.Row():
575
- filtered_csv = gr.File(label="πŸ“₯ Download Filtered Data (CSV)", visible=True)
576
 
577
- # FunciΓ³n para actualizar las opciones de columnas
578
- def update_column_choices(column_choices):
579
- return gr.Radio(choices=column_choices, value=None if not column_choices else column_choices[0])
 
 
 
 
 
 
580
 
581
- # FunciΓ³n para actualizar los valores del dropdown cuando se selecciona una columna
582
  def update_filter_values(selected_column):
583
- if not selected_column or feed_reader.df is None:
584
- return gr.Dropdown(choices=[], value=None)
585
 
586
  unique_values = feed_reader.get_column_unique_values(selected_column)
587
- return gr.Dropdown(
588
- choices=unique_values,
589
- value="All" if unique_values else None
590
- )
591
 
592
- # Actualizar las opciones cuando se carga un feed
593
  column_choices_state.change(
594
- update_column_choices,
595
  inputs=[column_choices_state],
596
- outputs=[columns_radio]
597
  )
598
 
599
- # Actualizar los valores del dropdown cuando se selecciona una columna
600
- columns_radio.change(
601
- update_filter_values,
602
- inputs=[columns_radio],
603
- outputs=[filter_value_dropdown]
604
- )
605
 
606
- # Filter functionality
607
- def filter_and_download(column_name, filter_value):
608
- summary, df_filtered, csv_file = feed_reader.filter_by_column(column_name, filter_value)
609
- if df_filtered is not None:
610
- # Show both summary and dataframe
611
- return summary, df_filtered, csv_file
612
- else:
613
- # Show error and empty dataframe
614
- return summary, pd.DataFrame(), None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
615
 
616
- filter_btn.click(
617
- filter_and_download,
618
- inputs=[columns_radio, filter_value_dropdown],
619
- outputs=[filter_summary, filtered_dataframe, filtered_csv]
 
620
  )
621
 
622
  with gr.Tab("πŸ“Š Statistics"):
@@ -708,11 +754,118 @@ def create_gradio_app():
708
  )
709
 
710
  # Basic statistics functionality
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
711
  basic_stats_btn.click(
712
- feed_reader.get_column_stats,
713
  outputs=[basic_stats_output]
714
  )
715
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
716
  # Weighted statistics functionality
717
  def calculate_weighted_stats(group_column, reference_col, cpa_col, cpc_col):
718
  if not group_column:
@@ -727,7 +880,7 @@ def create_gradio_app():
727
  if not reference_col and not cpa_col and not cpc_col:
728
  return "Please select at least one metric column (Reference ID, CPA Goal, or Payouts)", None, None
729
 
730
- weighted_df, message = feed_reader.get_weighted_stats_by_group(group_column, reference_col, cpa_col, cpc_col)
731
 
732
  if not weighted_df.empty:
733
  metrics_used = []
@@ -763,108 +916,159 @@ def create_gradio_app():
763
  outputs=[weighted_stats_summary, weighted_stats_output, weighted_stats_csv]
764
  )
765
 
766
- with gr.Tab("🌍 Map"):
767
  with gr.Row():
768
  with gr.Column():
769
- gr.Markdown("### Select Columns for Mapping")
 
770
 
771
- city_col = gr.Dropdown(label="City Column", choices=[], value=None)
772
- state_col = gr.Dropdown(label="State Column (optional)", choices=[], value=None)
773
- country_col = gr.Dropdown(label="Country Column (optional)", choices=[], value=None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
774
 
775
- map_btn = gr.Button("πŸ—ΊοΈ Generate Map", variant="primary")
 
 
776
 
777
  with gr.Column():
778
  map_status = gr.Markdown()
779
- map_output = gr.HTML()
 
 
780
 
781
- # Actualizar dropdowns cuando se cargue un feed
782
  def update_map_choices(column_choices):
783
  if not column_choices:
784
  return (
785
- gr.Dropdown.update(choices=[]),
786
- gr.Dropdown.update(choices=[]),
787
- gr.Dropdown.update(choices=[])
 
788
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
789
  return (
790
- gr.Dropdown.update(choices=column_choices, value=column_choices[0]),
791
- gr.Dropdown.update(choices=["None"] + column_choices, value="None"),
792
- gr.Dropdown.update(choices=["None"] + column_choices, value="None")
 
793
  )
794
 
795
  column_choices_state.change(
796
  update_map_choices,
797
  inputs=[column_choices_state],
798
- outputs=[city_col, state_col, country_col]
799
  )
800
 
801
- # Generar mapa desde feed_reader
802
- def generate_map_handler(city_col, state_col, country_col):
 
 
 
803
  state_col = None if state_col == "None" else state_col
804
  country_col = None if country_col == "None" else country_col
805
- map_html, msg = feed_reader.generate_map(city_col, state_col, country_col)
 
 
 
 
806
  return msg, map_html
807
 
 
 
 
808
  map_btn.click(
809
- generate_map_handler,
810
- inputs=[city_col, state_col, country_col],
811
  outputs=[map_status, map_output]
812
  )
813
-
814
-
815
- # Actualizar dropdowns cuando se cargue un feed
816
- def update_map_choices(column_choices):
817
- return (
818
- gr.Dropdown(choices=column_choices, value=None),
819
- gr.Dropdown(choices=["None"] + column_choices, value="None"),
820
- gr.Dropdown(choices=["None"] + column_choices, value="None")
821
- )
822
 
823
- column_choices_state.change(
824
- update_map_choices,
825
- inputs=[column_choices_state],
826
- outputs=[city_col, state_col, country_col]
827
- )
828
-
829
- # FunciΓ³n para generar mapa
830
- def generate_map(city_col, state_col, country_col):
831
- state_col = None if state_col == "None" else state_col
832
- country_col = None if country_col == "None" else country_col
833
- map_html, msg = feed_reader.generate_map(city_col, state_col, country_col)
834
- return msg, map_html
835
-
836
- map_btn.click(
837
- generate_map,
838
- inputs=[city_col, state_col, country_col],
839
  outputs=[map_status, map_output]
840
  )
841
 
842
  gr.Markdown("""
843
  ---
844
- ### πŸ“ Instructions:
845
-
846
- 1. **Load Feed**: Enter a URL pointing to an XML or JSON feed and click "Load Feed"
847
- 2. **Filter Data**: Select a column from the radio buttons and enter a filter value
848
- 3. **Statistics**: View detailed statistics about each column in your dataset
849
- 4. **Download**: CSV files are automatically generated for download
850
-
851
- **Supported Formats:**
852
- - XML files (.xml, .xml.gz)
853
- - JSON files (.json)
854
- - REST APIs returning JSON
855
-
856
- **Features:**
857
- - Automatic format detection
858
- - Data cleaning and validation
859
- - Dynamic column-based filtering with dropdown values
860
- - Statistical analysis
861
- - CSV export functionality
862
- - Resizable dataframe columns (drag column borders to resize)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
863
  """)
864
 
865
  return app
866
 
867
- # Launch the app
868
  if __name__ == "__main__":
869
- app = create_gradio_app()
870
  app.launch(share=True, debug=True)
 
13
  import os
14
  import tempfile
15
  import pytz
16
+ import time
17
 
18
  geolocator = Nominatim(user_agent="feed_reader_app")
19
 
 
45
  def load_feed_to_dataframe(self, url, job_tag="job"):
46
  """
47
  Load an XML feed (.xml or .xml.gz) or JSON from a URL and convert to DataFrame.
 
 
 
 
 
 
 
48
  """
49
  try:
50
  response = requests.get(url, timeout=30)
 
65
  elif isinstance(data, dict) and "jobs" in data:
66
  df = pd.DataFrame(data["jobs"])
67
  else:
 
68
  df = pd.DataFrame([data] if not isinstance(data, list) else data)
69
 
 
70
  df = df.applymap(lambda x: self.truncate(x) if isinstance(x, str) else x)
71
  df = self.clean_invalid_numbers(df)
72
  return df
 
82
  items = root.findall(f".//{job_tag}")
83
 
84
  if not items:
 
85
  common_tags = ["item", "entry", "record", "row"]
86
  for tag in common_tags:
87
  items = root.findall(f".//{tag}")
 
89
  break
90
 
91
  if not items:
92
+ return pd.DataFrame(), f"No <{job_tag}> elements found in the XML."
93
 
94
  jobs_data = []
95
  for job in items:
 
100
  df = self.clean_invalid_numbers(df)
101
  return df, "Success"
102
 
 
 
 
 
 
 
103
  except Exception as e:
104
+ return pd.DataFrame(), f"Error: {str(e)}"
105
 
106
  def process_feed(self, url, job_tag="job"):
107
  """Main function to process feed and return results"""
108
  if not url.strip():
109
  return "Please enter a valid URL", None, "", "", []
110
 
 
111
  result = self.load_feed_to_dataframe(url.strip(), job_tag.strip())
112
 
113
  if isinstance(result, tuple):
 
118
  df = result
119
  message = "Success"
120
 
 
121
  self.df = df
 
 
122
  df['last_update'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
123
+ df_processed = df
124
+ #df_processed = df.fillna(0).infer_objects(copy=False)
125
 
 
 
 
 
126
  summary = f"""
127
  πŸ“Š **Feed Processing Results**
128
 
129
  βœ… **Status:** {message}
 
130
  πŸ“‹ **Rows:** {df_processed.shape[0]:,}
 
131
  πŸ“ **Columns:** {df_processed.shape[1]}
132
  """
133
 
 
134
  metadata_df = pd.DataFrame({
135
  'Column Name': df_processed.columns.tolist(),
136
  'Data Type': [str(df_processed[col].dtype) for col in df_processed.columns],
 
138
  'Null Values': [df_processed[col].isnull().sum() for col in df_processed.columns]
139
  })
140
 
 
141
  column_choices = df_processed.columns.tolist()
142
 
143
  return summary, df_processed, self.generate_csv(df_processed, "feed"), self.get_preview(df_processed), column_choices, metadata_df
144
 
145
  def get_column_unique_values(self, column_name):
146
  """Get unique values for a specific column"""
147
+ if self.df is None or column_name not in self.df.columns:
148
  return []
149
 
 
 
 
 
150
  unique_values = self.df[column_name].dropna().astype(str).unique()
151
  unique_values = sorted([str(val) for val in unique_values if str(val) != 'nan'])
 
 
152
  return ["All"] + unique_values
153
 
154
def apply_multiple_filters(self, filters_dict, progress=gr.Progress()):
    """Filter the loaded feed with several ``column == value`` conditions.

    Args:
        filters_dict: Mapping of column name to the selected value. Entries
            whose value is empty, ``"All"`` or ``"None"`` are ignored, and so
            are columns that do not exist in the loaded dataframe.
        progress: Progress callback, invoked as ``progress(fraction, desc=...)``.

    Returns:
        A tuple ``(display_df, summary, csv_path)``. ``display_df`` is the
        result with long text truncated for display, ``summary`` is a
        Markdown string, and ``csv_path`` is the exported file. When no feed
        is loaded or nothing matches, the tuple is an empty dataframe, a
        plain message and ``""``.
    """
    import re  # local import: only needed to sanitise the export file name

    if self.df is None:
        return pd.DataFrame(), "Please load a feed first", ""

    progress(0, desc="Starting filter process...")

    filtered_df = self.df.copy()

    # Keep only filters that carry a real value and target an existing
    # column, so the summary and the export name describe exactly what
    # was applied.
    active_filters = {
        column: value
        for column, value in (filters_dict or {}).items()
        if value and value != "All" and value != "None"
        and column in self.df.columns
    }

    if not active_filters:
        progress(1, desc="No filters applied - showing all data")
        filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
        display_df = self.truncate_display_columns(filtered_df)
        summary = "\n".join([
            "🔍 **Filter Results**",
            "",
            f"📋 **Total Rows:** {filtered_df.shape[0]:,}",
            "🎯 **Filters Applied:** None (showing all data)",
        ])
        return display_df, summary, self.generate_csv(filtered_df, "all_data")

    progress(0.2, desc="Applying filters...")

    filter_descriptions = []
    total_filters = len(active_filters)
    for position, (column, value) in enumerate(active_filters.items()):
        progress(0.2 + (0.6 * position / total_filters),
                 desc=f"Filtering by {column}: {value}")

        if self.df[column].dtype == 'object':
            mask = filtered_df[column].astype(str) == str(value)
        else:
            try:
                mask = filtered_df[column] == float(value)
            except (TypeError, ValueError):
                # Value is not a number: fall back to a text comparison.
                mask = filtered_df[column].astype(str) == str(value)

        filtered_df = filtered_df[mask]
        filter_descriptions.append(f"{column} = '{value}'")

    progress(0.8, desc="Processing results...")

    if filtered_df.empty:
        progress(1, desc="Filter complete - no results found")
        return pd.DataFrame(), "No records found matching the specified filters", ""

    filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
    display_df = self.truncate_display_columns(filtered_df)

    progress(1, desc="Filter complete")

    summary = "\n".join([
        "🔍 **Multi-Filter Results**",
        "",
        f"📋 **Matching Rows:** {filtered_df.shape[0]:,}",
        f"🎯 **Filters Applied:** {len(filter_descriptions)}",
        "📝 **Filter Details:**",
        *[f"  • {desc}" for desc in filter_descriptions],
    ])

    # Filter values are free text from the feed; strip anything that could
    # act as a path separator or is invalid in a file name.
    raw_suffix = "_".join(f"{k}_{v}" for k, v in active_filters.items())
    filename_suffix = re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", raw_suffix)[:50]

    return display_df, summary, self.generate_csv(filtered_df, f"filtered_{filename_suffix}")
224
 
225
  def truncate_display_columns(self, df):
226
+ """Truncate long columns for better display"""
227
  display_df = df.copy()
 
 
228
  long_content_columns = ['url', 'description', 'link', 'content', 'summary', 'text']
229
 
230
  for col in display_df.select_dtypes(include=['object']).columns:
 
231
  if any(long_col in col.lower() for long_col in long_content_columns):
232
  display_df[col] = display_df[col].astype(str).apply(
233
  lambda x: x[:30] + '...' if len(str(x)) > 30 else x
234
  )
235
  else:
 
236
  display_df[col] = display_df[col].astype(str).apply(
237
  lambda x: x[:50] + '...' if len(str(x)) > 50 else x
238
  )
239
  return display_df
240
 
241
def generate_map_with_job_counts(self, city_col, state_col=None, country_col=None,
                                 title_col=None, max_points=500, progress=gr.Progress()):
    """Build a folium map with one circle marker per distinct location.

    Every row of ``self.df`` contributes to a location key of the form
    "city, state, country" built from the selected columns. Each distinct key
    is geocoded through ``geocode_cached`` and drawn as a circle whose radius
    and fill colour scale with the number of distinct ``title_col`` values at
    that location (or with the row count when no usable title column is given).

    Args:
        city_col: Name of the column holding city names; must exist in the data.
        state_col: Optional state/province column; ignored when falsy or absent.
        country_col: Optional country column; ignored when falsy or absent.
        title_col: Optional column whose distinct values are counted per location.
        max_points: Upper bound on the number of markers placed on the map.
        progress: Progress callback, called as ``progress(fraction, desc=...)``.

    Returns:
        A ``(map_html, status_message)`` tuple. ``map_html`` is ``None`` when no
        feed is loaded, the city column is missing, or no row yields a location.
    """
    # Function-scope import: only the popup/tooltip markup needs it.
    from html import escape

    if self.df is None or self.df.empty:
        return None, "⚠️ Please load a feed first"

    if city_col not in self.df.columns:
        return None, f"⚠️ Column '{city_col}' not found in dataset"

    progress(0, desc="Initializing map generation...")

    # Create map
    m = folium.Map(location=[20, 0], zoom_start=2)

    progress(0.1, desc="Processing location data...")

    # Column availability cannot change while iterating, so resolve it once
    # instead of re-checking membership for every row.
    columns = self.df.columns
    use_state = bool(state_col) and state_col in columns
    use_country = bool(country_col) and country_col in columns
    use_title = bool(title_col) and title_col in columns

    def _clean(value):
        """Return the cell as stripped text, or "" when it is missing."""
        text = "" if value is None else str(value).strip()
        # str() of NaN / None yields these placeholders; they are not place names.
        return "" if text in ("nan", "None") else text

    # Prepare location data
    location_data = []
    total_rows = len(self.df)

    for idx, (_, row) in enumerate(self.df.iterrows()):
        if idx % 100 == 0:  # Update progress every 100 rows
            progress(0.1 + (0.3 * idx / total_rows),
                     desc=f"Processing locations... {idx}/{total_rows}")

        city = _clean(row[city_col])
        state = _clean(row[state_col]) if use_state else ""
        country = _clean(row[country_col]) if use_country else ""

        location_parts = [part for part in (city, state, country) if part]
        if not location_parts:
            continue

        location_data.append({
            'location_key': ", ".join(location_parts),
            'title_id': str(row[title_col]) if use_title else None,
        })

    if not location_data:
        progress(1, desc="No valid location data found")
        return None, "⚠️ No valid location data found"

    progress(0.4, desc="Aggregating location statistics...")

    # Group by location: postings = rows per key, titles = distinct title values.
    per_location = pd.DataFrame(location_data).groupby('location_key')
    location_stats = per_location.size().rename('total_postings').to_frame()
    if use_title:
        location_stats['unique_titles'] = per_location['title_id'].nunique()
    else:
        location_stats['unique_titles'] = location_stats['total_postings']

    # Largest locations first, so that the max_points cap drops the least
    # significant places rather than whatever sorts last alphabetically.
    location_stats = (
        location_stats.reset_index()
        .sort_values(['unique_titles', 'total_postings'], ascending=False)
        .reset_index(drop=True)
    )

    progress(0.5, desc="Starting geocoding process...")

    # Geocoding with progress tracking
    successful_mappings = 0
    failed_geocoding = 0
    total_locations = len(location_stats)

    # Marker scaling constants are the same for every marker.
    max_titles = location_stats['unique_titles'].max()
    min_size = 10
    max_size = 50

    for idx, row in enumerate(location_stats.itertuples(index=False)):
        if successful_mappings >= max_points:
            break

        # Update progress during geocoding
        progress(0.5 + (0.4 * idx / total_locations),
                 desc=f"Geocoding locations... {successful_mappings} mapped, {failed_geocoding} failed")

        location_key = row.location_key
        total_postings = row.total_postings
        unique_titles = row.unique_titles

        location = geocode_cached(location_key)
        if location:
            # Calculate marker properties
            if max_titles > 0:
                marker_size = min_size + (max_size - min_size) * (unique_titles / max_titles)
            else:
                marker_size = min_size

            # Color coding
            if unique_titles >= max_titles * 0.8:
                color = 'red'
            elif unique_titles >= max_titles * 0.5:
                color = 'orange'
            elif unique_titles >= max_titles * 0.2:
                color = 'yellow'
            else:
                color = 'green'

            # The key comes straight from the feed, so escape it before it is
            # embedded in HTML that the browser will render.
            safe_key = escape(location_key)
            avg_postings = round(total_postings / unique_titles, 1) if unique_titles > 0 else 0

            # Create popup
            popup_text = f"""
            <div style='font-family: Arial, sans-serif; min-width: 200px;'>
            <h4 style='color: #2E86AB; margin-bottom: 10px;'>πŸ“ {safe_key}</h4>
            <hr style='margin: 5px 0;'>
            <p><strong>🎯 Unique Titles:</strong> {unique_titles}</p>
            <p><strong>πŸ“Š Total Postings:</strong> {total_postings}</p>
            <p><strong>πŸ“ˆ Avg Postings/Title:</strong> {avg_postings}</p>
            </div>
            """

            folium.CircleMarker(
                location=[location.latitude, location.longitude],
                radius=marker_size,
                popup=folium.Popup(popup_text, max_width=300),
                color='black',
                weight=2,
                fillColor=color,
                fillOpacity=0.7,
                tooltip=f"{safe_key}: {unique_titles} titles"
            ).add_to(m)

            successful_mappings += 1
        else:
            failed_geocoding += 1

        # Small delay to prevent overwhelming the geocoding service
        time.sleep(0.1)

    progress(0.9, desc="Finalizing map...")

    # Add legend
    legend_html = """
    <div style='position: fixed;
    bottom: 50px; left: 50px; width: 200px; height: 120px;
    background-color: white; border:2px solid grey; z-index:9999;
    font-size:14px; padding: 10px'>
    <h4 style='margin:0; color: #2E86AB;'>πŸ“Š Job Count Legend</h4>
    <p style='margin:5px 0;'><i style='color:red'>●</i> High (80%+ of max)</p>
    <p style='margin:5px 0;'><i style='color:orange'>●</i> Medium-High (50-80%)</p>
    <p style='margin:5px 0;'><i style='color:yellow'>●</i> Medium (20-50%)</p>
    <p style='margin:5px 0;'><i style='color:green'>●</i> Low (&lt;20%)</p>
    <small>Marker size = Job count</small>
    </div>
    """

    m.get_root().html.add_child(folium.Element(legend_html))

    progress(1, desc="Map generation complete!")

    # Generate status message
    status_msg = f"""
    βœ… **Map Generated Successfully**

    πŸ—ΊοΈ **Mapped Locations:** {successful_mappings}
    ❌ **Failed to Geocode:** {failed_geocoding}
    πŸ“Š **Total Unique Locations:** {len(location_stats)}
    🎯 **Columns Used:**
    β€’ City: {city_col}
    β€’ State: {state_col if state_col else 'Not selected'}
    β€’ Country: {country_col if country_col else 'Not selected'}
    β€’ Title/ID: {title_col if title_col else 'Not selected'}

    πŸ’‘ **Map Features:**
    β€’ Marker size represents job count
    β€’ Colors show relative job density
    β€’ Click markers for detailed info
    β€’ Hover for quick stats
    """

    if title_col:
        top_location_idx = location_stats['unique_titles'].idxmax()
        top_location = location_stats.loc[top_location_idx, 'location_key']
        top_count = location_stats['unique_titles'].max()
        status_msg += f"\nπŸ† **Top Location:** {top_location} ({top_count} titles)"

    return m._repr_html_(), status_msg
429
 
430
  def generate_csv(self, df, filename_prefix="feed"):
431
+ """Generate CSV file for download"""
432
  if df is None or df.empty:
433
  return None
434
 
 
435
  temp_file = tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False, prefix='')
436
+ temp_file.close()
437
 
 
 
438
  final_filename = temp_file.name.replace(os.path.basename(temp_file.name), f"{filename_prefix}.csv")
 
 
439
  df.to_csv(final_filename, index=False)
440
 
441
  return final_filename
442
 
443
  def get_preview(self, df, max_rows=10):
444
+ """Get a preview of the dataframe"""
445
  if df is None or df.empty:
446
  return None
447
 
 
448
  preview_df = df.head(max_rows).copy()
449
 
 
450
  for col in preview_df.select_dtypes(include=['object']).columns:
451
  preview_df[col] = preview_df[col].astype(str).apply(
452
  lambda x: x[:50] + '...' if len(str(x)) > 50 else x
 
454
 
455
  return preview_df
456
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
457
  # Initialize the feed reader
458
  feed_reader = FeedReader()
459
 
460
+ def create_enhanced_gradio_app():
461
+ with gr.Blocks(title="Enhanced Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
 
 
462
  with gr.Row():
463
  with gr.Column(scale=4):
464
  gr.Markdown("""
465
+ # πŸ“‘ Enhanced Feed Reader & Analyzer
466
 
467
+ Load and analyze XML or JSON feeds with advanced multi-filtering and interactive mapping.
468
  """)
469
 
 
 
 
 
 
 
470
  with gr.Tab("πŸ“₯ Load Feed"):
471
  with gr.Row():
472
  with gr.Column():
 
498
  label="Data Preview",
499
  visible=True,
500
  interactive=False,
501
+ wrap=False,
502
+ row_count=(1, "dynamic")
503
  )
504
 
505
  with gr.Row():
506
  csv_download = gr.File(label="πŸ“₯ Download Full Dataset (CSV)", visible=True)
507
 
 
508
  column_choices_state = gr.State([])
509
 
 
510
  def process_and_download(url, job_tag):
511
  summary, df_processed, csv_file, preview_df, column_choices, metadata_df = feed_reader.process_feed(url, job_tag)
512
  return summary, metadata_df, preview_df, csv_file, column_choices
 
517
  outputs=[summary_output, metadata_output, preview_dataframe, csv_download, column_choices_state]
518
  )
519
 
520
+ with gr.Tab("πŸ” Advanced Filter Data"):
521
+ gr.Markdown("### 🎯 Multi-Column Filtering")
522
+ gr.Markdown("Apply multiple filters simultaneously to narrow down your dataset:")
523
+
524
  with gr.Row():
525
  with gr.Column():
526
+ gr.Markdown("**Primary Filters:**")
527
+ with gr.Column():
528
+ filter1_col = gr.Dropdown(
529
+ label="Filter 1 - Column",
530
+ choices=[],
531
+ value=None
532
+ )
533
+ filter1_val = gr.Dropdown(
534
+ label="Filter 1 - Value",
535
+ choices=[],
536
+ value=None
537
+ )
538
+ with gr.Column():
539
+ filter2_col = gr.Dropdown(
540
+ label="Filter 2 - Column",
541
+ choices=[],
542
+ value=None
543
+ )
544
+ filter2_val = gr.Dropdown(
545
+ label="Filter 2 - Value",
546
+ choices=[],
547
+ value=None
548
+ )
549
 
550
  with gr.Column():
551
+ gr.Markdown("**Additional Filters:**")
552
+ with gr.Column():
553
+ filter3_col = gr.Dropdown(
554
+ label="Filter 3 - Column",
555
+ choices=[],
556
+ value=None
557
+ )
558
+ filter3_val = gr.Dropdown(
559
+ label="Filter 3 - Value",
560
+ choices=[],
561
+ value=None
562
+ )
563
+ with gr.Column():
564
+ filter4_col = gr.Dropdown(
565
+ label="Filter 4 - Column",
566
+ choices=[],
567
+ value=None
568
+ )
569
+ filter4_val = gr.Dropdown(
570
+ label="Filter 4 - Value",
571
+ choices=[],
572
+ value=None
573
+ )
574
+
575
+ with gr.Row():
576
+ multi_filter_btn = gr.Button("πŸ” Apply Multi-Filter", variant="primary", size="lg")
577
+ clear_filters_btn = gr.Button("🧹 Clear All Filters", variant="secondary")
578
 
579
  with gr.Row():
580
+ multi_filter_summary = gr.Markdown(label="Multi-Filter Results")
581
+
582
+ with gr.Row():
583
+ multi_filtered_dataframe = gr.Dataframe(
584
  label="Filtered Data",
585
  visible=True,
586
  interactive=False,
587
+ wrap=False,
588
+ row_count=(1, "dynamic")
589
  )
590
 
591
  with gr.Row():
592
+ multi_filtered_csv = gr.File(label="πŸ“₯ Download Filtered Data (CSV)", visible=True)
593
 
594
+ # Helper functions for updating dropdowns
595
def update_all_filter_columns(column_choices):
    """Return four column dropdowns offering "None" plus every loaded column.

    Each dropdown is reset to the "None" placeholder. When no columns are
    available, "None" is the only choice.
    """
    options = ["None"]
    if column_choices:
        options = options + column_choices
    return tuple(gr.Dropdown(choices=options, value="None") for _ in range(4))
603
 
 
604
  def update_filter_values(selected_column):
605
+ if not selected_column or selected_column == "None" or feed_reader.df is None:
606
+ return gr.Dropdown(choices=["None"], value="None")
607
 
608
  unique_values = feed_reader.get_column_unique_values(selected_column)
609
+ return gr.Dropdown(choices=unique_values, value="All" if unique_values else "None")
 
 
 
610
 
611
+ # Update column choices when data is loaded
612
  column_choices_state.change(
613
+ update_all_filter_columns,
614
  inputs=[column_choices_state],
615
+ outputs=[filter1_col, filter2_col, filter3_col, filter4_col]
616
  )
617
 
618
+ # Update value dropdowns when columns are selected
619
+ filter1_col.change(update_filter_values, inputs=[filter1_col], outputs=[filter1_val])
620
+ filter2_col.change(update_filter_values, inputs=[filter2_col], outputs=[filter2_val])
621
+ filter3_col.change(update_filter_values, inputs=[filter3_col], outputs=[filter3_val])
622
+ filter4_col.change(update_filter_values, inputs=[filter4_col], outputs=[filter4_val])
 
623
 
624
+ # Multi-filter functionality
625
def apply_multi_filters(col1, val1, col2, val2, col3, val3, col4, val4, progress=gr.Progress()):
    """Collect the active (column, value) pairs and run the multi-filter.

    A filter slot is active when both its column and its value are set, i.e.
    neither is ``None``, the empty string, nor the "None" placeholder. Active
    slots are passed to ``feed_reader.apply_multiple_filters`` as a
    ``{column: value}`` dict, so when the same column is chosen in several
    slots the last one wins.

    Returns:
        Whatever ``feed_reader.apply_multiple_filters`` returns.
    """
    def _is_set(choice):
        # Compare against the explicit "unset" markers instead of relying on
        # truthiness, so a legitimate falsy selection such as 0 is not dropped.
        return choice is not None and choice != "" and choice != "None"

    filters = {}
    slots = ((col1, val1), (col2, val2), (col3, val3), (col4, val4))
    for column, value in slots:
        if _is_set(column) and _is_set(value):
            filters[column] = value

    return feed_reader.apply_multiple_filters(filters, progress)
638
+
639
def clear_all_filters():
    """Return the values that reset the filter tab.

    The tuple holds a notice for the summary, an empty DataFrame, no CSV file,
    and eight dropdowns (four column/value pairs) set back to "None".
    """
    reset_dropdowns = [gr.Dropdown(value="None") for _ in range(8)]
    return (
        "Filters cleared - select columns and values to filter data",
        pd.DataFrame(),
        None,
        *reset_dropdowns,
    )
653
+
654
+ multi_filter_btn.click(
655
+ apply_multi_filters,
656
+ inputs=[filter1_col, filter1_val, filter2_col, filter2_val,
657
+ filter3_col, filter3_val, filter4_col, filter4_val],
658
+ outputs=[multi_filtered_dataframe, multi_filter_summary, multi_filtered_csv]
659
+ )
660
 
661
+ clear_filters_btn.click(
662
+ clear_all_filters,
663
+ outputs=[multi_filter_summary, multi_filtered_dataframe, multi_filtered_csv,
664
+ filter1_col, filter1_val, filter2_col, filter2_val,
665
+ filter3_col, filter3_val, filter4_col, filter4_val]
666
  )
667
 
668
  with gr.Tab("πŸ“Š Statistics"):
 
754
  )
755
 
756
  # Basic statistics functionality
757
def get_column_stats():
    """Build a per-column summary table of the loaded feed.

    For every column the table reports the number of distinct values, the
    number of nulls, the dtype, and either the five most frequent values
    (object columns) or the min/max (all other dtypes).

    Returns:
        A DataFrame with one row per column, or an empty DataFrame when no
        feed is loaded or the summary cannot be computed.
    """
    frame = feed_reader.df
    if frame is None:
        return pd.DataFrame()

    def _top_values(series):
        # Five most common values as "value (count)" pairs.
        top = series.value_counts().head(5)
        return ", ".join(f"{val} ({count})" for val, count in top.items())

    try:
        stats = []
        for column in frame.columns:
            series = frame[column]
            try:
                unique_values = series.nunique()
                if series.dtype == 'object':
                    top_values_str = _top_values(series)
                else:
                    top_values_str = f"Min: {series.min()}, Max: {series.max()}"
            except TypeError:
                # Unhashable or unorderable cells (e.g. lists/dicts) cannot be
                # counted or compared directly; summarise their text form so
                # one such column does not blank the whole table.
                as_text = series.dropna().astype(str)
                unique_values = as_text.nunique()
                top_values_str = _top_values(as_text)

            stats.append({
                'Column': column,
                'Unique Values': unique_values,
                'Null Values': series.isnull().sum(),
                'Data Type': str(series.dtype),
                'Top Values/Range': top_values_str
            })

        return pd.DataFrame(stats)

    except Exception:
        # UI boundary: keep the original best-effort contract of showing an
        # empty table instead of raising into the event handler.
        return pd.DataFrame()
789
+
790
  basic_stats_btn.click(
791
+ get_column_stats,
792
  outputs=[basic_stats_output]
793
  )
794
 
795
+ # Get weighted statistics functionality
796
def get_weighted_stats_by_group(group_column, reference_col=None, cpa_col=None, cpc_col=None):
    """Compute per-group posting statistics for the loaded feed.

    Args:
        group_column: Column whose values define the groups.
        reference_col: Optional column; adds ``unique_references`` (distinct count).
        cpa_col: Optional column; adds ``mean_cpa_goal``, ``min_cpa``, ``max_cpa``.
        cpc_col: Optional column; adds ``mean_payouts``, ``min_payouts``,
            ``max_payouts``.

    When both ``cpa_col`` and ``cpc_col`` are given, ``target_cvr`` is added as
    mean payout / mean CPA * 100 (0 unless both means are positive). Metric
    columns are coerced with ``pd.to_numeric``; a group without any numeric
    value reports 0 for that metric. Every row also carries ``total_postings``
    and a ``last_update`` timestamp in the America/Los_Angeles zone.

    Returns:
        ``(stats_df, message)``. ``message`` is "Success" on success; otherwise
        ``stats_df`` is empty and ``message`` explains why.
    """
    frame = feed_reader.df
    if frame is None:
        return pd.DataFrame(), "Please load a feed first"

    # Check if group column exists
    if group_column not in frame.columns:
        available_columns = [col for col in frame.columns if col != 'last_update']
        return pd.DataFrame(), f"Column '{group_column}' not found. Available columns: {', '.join(available_columns)}"

    # Check if selected columns exist. Truthiness (not just `is not None`) keeps
    # this consistent with the `if reference_col:` style checks used below.
    selected_columns = [col for col in (reference_col, cpa_col, cpc_col) if col]
    missing_columns = [col for col in selected_columns if col not in frame.columns]

    if missing_columns:
        available_columns = list(frame.columns)
        return pd.DataFrame(), f"Missing selected columns: {', '.join(missing_columns)}. Available columns: {', '.join(available_columns)}"

    def _numeric_summary(values, mean_key, min_key, max_key):
        # Mean/min/max of the numeric part of a column; zeros when nothing parses.
        numeric = pd.to_numeric(values, errors='coerce')
        if numeric.isna().all():
            return {mean_key: 0, min_key: 0, max_key: 0}
        return {
            mean_key: round(numeric.mean(), 2),
            min_key: round(numeric.min(), 2),
            max_key: round(numeric.max(), 2),
        }

    try:
        # One timestamp for the whole report, so every row agrees.
        pacific_tz = pytz.timezone("America/Los_Angeles")
        now_pst = datetime.datetime.now(pytz.utc).astimezone(pacific_tz)
        last_update = now_pst.strftime("%Y-%m-%d %H:%M:%S %Z")

        # Iterate the groups explicitly instead of groupby.apply: each metric
        # keeps its own dtype and the grouping column stays available inside
        # the group even when it doubles as a metric column.
        rows = []
        for group_key, group_df in frame.groupby(group_column):
            results = {group_column: group_key, "total_postings": len(group_df)}

            if reference_col:
                results["unique_references"] = int(group_df[reference_col].nunique())

            if cpa_col:
                results.update(_numeric_summary(
                    group_df[cpa_col], "mean_cpa_goal", "min_cpa", "max_cpa"))

            if cpc_col:
                results.update(_numeric_summary(
                    group_df[cpc_col], "mean_payouts", "min_payouts", "max_payouts"))

            if cpa_col and cpc_col:
                mean_cpa = results.get("mean_cpa_goal", 0)
                mean_payouts = results.get("mean_payouts", 0)
                if mean_cpa > 0 and mean_payouts > 0:
                    results["target_cvr"] = round((mean_payouts / mean_cpa) * 100, 2)
                else:
                    results["target_cvr"] = 0

            results["last_update"] = last_update
            rows.append(results)

        if not rows:
            return pd.DataFrame(), f"No groups found for column '{group_column}'"

        grouped_stats = pd.DataFrame(rows)

        # Sort by most relevant metric
        sort_key = 'unique_references' if "unique_references" in grouped_stats.columns else 'total_postings'
        grouped_stats = grouped_stats.sort_values(sort_key, ascending=False)

        return grouped_stats, "Success"

    except Exception as e:
        return pd.DataFrame(), f"Error calculating weighted statistics: {str(e)}"
868
+
869
  # Weighted statistics functionality
870
  def calculate_weighted_stats(group_column, reference_col, cpa_col, cpc_col):
871
  if not group_column:
 
880
  if not reference_col and not cpa_col and not cpc_col:
881
  return "Please select at least one metric column (Reference ID, CPA Goal, or Payouts)", None, None
882
 
883
+ weighted_df, message = get_weighted_stats_by_group(group_column, reference_col, cpa_col, cpc_col)
884
 
885
  if not weighted_df.empty:
886
  metrics_used = []
 
916
  outputs=[weighted_stats_summary, weighted_stats_output, weighted_stats_csv]
917
  )
918
 
919
+ with gr.Tab("🌍 Interactive Job Map"):
920
  with gr.Row():
921
  with gr.Column():
922
+ gr.Markdown("### πŸ“ Map Configuration")
923
+ gr.Markdown("Select columns for geographic visualization:")
924
 
925
+ city_col = gr.Dropdown(
926
+ label="πŸ™οΈ City Column (Required)",
927
+ choices=[],
928
+ value=None,
929
+ info="Column containing city names"
930
+ )
931
+ state_col = gr.Dropdown(
932
+ label="πŸ—ΊοΈ State/Province Column (Optional)",
933
+ choices=[],
934
+ value=None,
935
+ info="Column containing state or province names"
936
+ )
937
+ country_col = gr.Dropdown(
938
+ label="🌍 Country Column (Optional)",
939
+ choices=[],
940
+ value=None,
941
+ info="Column containing country names"
942
+ )
943
+ title_col = gr.Dropdown(
944
+ label="🎯 Title/Job ID Column (Optional)",
945
+ choices=[],
946
+ value=None,
947
+ info="Column containing job titles or reference IDs"
948
+ )
949
 
950
+ with gr.Row():
951
+ map_btn = gr.Button("πŸ—ΊοΈ Generate Interactive Map", variant="primary", size="lg")
952
+ clear_map_btn = gr.Button("🧹 Clear Map", variant="secondary")
953
 
954
  with gr.Column():
955
  map_status = gr.Markdown()
956
+
957
+ with gr.Row():
958
+ map_output = gr.HTML(label="Interactive Job Distribution Map")
959
 
 
960
  def update_map_choices(column_choices):
961
  if not column_choices:
962
  return (
963
+ gr.Dropdown(choices=[]),
964
+ gr.Dropdown(choices=[]),
965
+ gr.Dropdown(choices=[]),
966
+ gr.Dropdown(choices=[])
967
  )
968
+
969
+ optional_choices = ["None"] + column_choices
970
+
971
+ # Auto-detect common column names
972
+ city_default = None
973
+ state_default = "None"
974
+ country_default = "None"
975
+ title_default = "None"
976
+
977
+ for col in column_choices:
978
+ col_lower = col.lower()
979
+
980
+ if any(term in col_lower for term in ['city', 'ciudad', 'ville', 'location']):
981
+ city_default = col
982
+ elif any(term in col_lower for term in ['state', 'province', 'region', 'estado']):
983
+ state_default = col
984
+ elif any(term in col_lower for term in ['country', 'nation', 'pais', 'pays']):
985
+ country_default = col
986
+ elif any(term in col_lower for term in ['title', 'job', 'position', 'req', 'reference', 'id', 'titulo']):
987
+ title_default = col
988
+
989
  return (
990
+ gr.Dropdown(choices=column_choices, value=city_default),
991
+ gr.Dropdown(choices=optional_choices, value=state_default),
992
+ gr.Dropdown(choices=optional_choices, value=country_default),
993
+ gr.Dropdown(choices=optional_choices, value=title_default)
994
  )
995
 
996
  column_choices_state.change(
997
  update_map_choices,
998
  inputs=[column_choices_state],
999
+ outputs=[city_col, state_col, country_col, title_col]
1000
  )
1001
 
1002
def generate_job_count_map(city_col, state_col, country_col, title_col, progress=gr.Progress()):
    """Validate the map selections and delegate to the feed reader.

    The "None" placeholder of the optional dropdowns is translated to ``None``
    before calling ``feed_reader.generate_map_with_job_counts``.

    Returns:
        ``(status_message, map_html)``; ``map_html`` is ``None`` when no city
        column was selected.
    """
    if not city_col:
        return "❌ Please select a city column", None

    def _optional(choice):
        # Handle "None" selections
        return None if choice == "None" else choice

    map_html, msg = feed_reader.generate_map_with_job_counts(
        city_col,
        _optional(state_col),
        _optional(country_col),
        _optional(title_col),
        progress=progress,
    )
    return msg, map_html
1015
 
1016
def clear_map():
    """Return the status text and the empty HTML that reset the map tab."""
    status_text = "🧹 Map cleared"
    empty_map = ""
    return status_text, empty_map
1018
+
1019
  map_btn.click(
1020
+ generate_job_count_map,
1021
+ inputs=[city_col, state_col, country_col, title_col],
1022
  outputs=[map_status, map_output]
1023
  )
 
 
 
 
 
 
 
 
 
1024
 
1025
+ clear_map_btn.click(
1026
+ clear_map,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1027
  outputs=[map_status, map_output]
1028
  )
1029
 
1030
  gr.Markdown("""
1031
  ---
1032
+ ### πŸ“ Enhanced Features:
1033
+
1034
+ **πŸ” Advanced Multi-Filtering:**
1035
+ - Apply up to 4 simultaneous filters on different columns
1036
+ - Real-time progress tracking during filter operations
1037
+ - Smart dropdown population with available values
1038
+ - Clear filter functionality
1039
+
1040
+ **🌍 Interactive Map with Progress:**
1041
+ - Real-time progress bar during map generation
1042
+ - Geocoding progress tracking
1043
+ - Location data processing updates
1044
+ - Performance optimizations with delays to prevent API limits
1045
+
1046
+ **πŸ“Š Enhanced Data Processing:**
1047
+ - Improved error handling
1048
+ - Better memory management
1049
+ - Optimized for large datasets
1050
+ - Smart column auto-detection
1051
+
1052
+ **πŸ’‘ Usage Tips:**
1053
+ - **Multi-Filtering**: Select "None" to skip a filter, "All" to show all values for that column
1054
+ - **Map Generation**: Progress bar shows geocoding status and success/failure rates
1055
+ - **Performance**: Large datasets may take longer to process - progress bars keep you informed
1056
+ - **Column Detection**: Common column names are automatically detected and pre-selected
1057
+
1058
+ **🎯 Common Filter Combinations:**
1059
+ - Filter 1: Company/Client + Filter 2: City
1060
+ - Filter 1: Job Title + Filter 2: State + Filter 3: Country
1061
+ - Filter 1: Category + Filter 2: Experience Level + Filter 3: Salary Range
1062
+
1063
+ **πŸ—ΊοΈ Map Features:**
1064
+ - Marker size = Job count per location
1065
+ - Color coding = Job density (red=high, green=low)
1066
+ - Interactive popups with detailed statistics
1067
+ - Automatic legend and geocoding status
1068
  """)
1069
 
1070
  return app
1071
 
 
1072
  if __name__ == "__main__":
1073
+ app = create_enhanced_gradio_app()
1074
  app.launch(share=True, debug=True)