rogergs94 committed on
Commit
29b8dcc
·
verified ·
1 Parent(s): b0efca2

Created feed.py

Browse files
Files changed (1) hide show
  1. app.py +369 -0
app.py ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import datetime
import gzip
import json
import os
import re
import tempfile
import xml.etree.ElementTree as ET
from io import BytesIO, StringIO
from urllib.parse import urlparse

import gradio as gr
import numpy as np
import pandas as pd
import requests
10
+
11
class FeedReader:
    """Load job feeds (XML, gzipped XML or JSON) into pandas DataFrames.

    The most recently processed feed is kept on ``self.df`` so that
    ``filter_by_column`` and ``get_column_stats`` can work on it afterwards.
    """

    # Tags tried, in order, when the requested job tag matches no element.
    _FALLBACK_TAGS = ("item", "entry", "record", "row")
    # Every gzip stream starts with these two bytes.
    _GZIP_MAGIC = b"\x1f\x8b"

    def __init__(self):
        # DataFrame of the last successfully processed feed; None until then.
        self.df = None

    @staticmethod
    def truncate(value, max_length=49000):
        """Cut ``value`` to ``max_length`` characters if it is a longer string.

        Non-string values and empty strings are returned unchanged.
        """
        if value and isinstance(value, str) and len(value) > max_length:
            return value[:max_length]
        return value

    @staticmethod
    def clean_invalid_numbers(df):
        """Return a new DataFrame in which infinite float cells become NaN.

        NaN cells stay NaN; every non-float cell is passed through untouched.
        """
        def _clean(cell):
            if isinstance(cell, float) and (np.isnan(cell) or np.isinf(cell)):
                return np.nan
            return cell

        return df.apply(lambda col: col.map(_clean))

    @staticmethod
    def _is_textual(series):
        """Tell whether ``series`` holds text/objects rather than numbers."""
        # Checking only ``dtype == object`` misses dedicated string dtypes.
        return (pd.api.types.is_object_dtype(series)
                or pd.api.types.is_string_dtype(series))

    @staticmethod
    def _fill_missing(df):
        """Return a copy of ``df`` with missing cells replaced by 0."""
        # Going through object dtype keeps the fill working for dedicated
        # string dtypes, which reject a non-string fill value;
        # ``infer_objects`` then restores the natural column dtypes.
        return df.astype(object).fillna(0).infer_objects()

    @staticmethod
    def _looks_like_json(url, content_type):
        """Guess from the URL and the Content-Type header whether a feed is JSON."""
        lowered_url = url.lower()
        # Look at the path only, so a query string does not hide the suffix.
        path = urlparse(lowered_url).path
        if path.endswith(".gz"):
            path = path[:-3]
        return ("json" in content_type.lower()
                or path.endswith(".json")
                or lowered_url.endswith(".json")
                or "rest-api" in lowered_url)

    def _json_to_dataframe(self, payload):
        """Build a DataFrame from a JSON document given as bytes or str.

        Accepts a list of records, a dict with a ``"jobs"`` list, or any
        other value (wrapped as a single row).

        Raises:
            ValueError: if ``payload`` is not valid JSON.
        """
        data = json.loads(payload)
        if isinstance(data, list):
            records = data
        elif isinstance(data, dict) and "jobs" in data:
            records = data["jobs"]
        else:
            records = [data]

        df = pd.DataFrame(records)
        # Per-column ``map`` replaces the deprecated ``DataFrame.applymap``.
        df = df.apply(lambda col: col.map(self.truncate))
        return self.clean_invalid_numbers(df)

    def _xml_to_dataframe(self, xml_bytes, job_tag):
        """Build ``(DataFrame, message)`` from an XML document.

        Each element named ``job_tag`` (or, failing that, the first of the
        fallback tags that matches) becomes one row; its direct children
        become the columns.
        """
        # NOTE(security): xml.etree is documented as not secure against
        # maliciously constructed data; consider a hardened parser if feeds
        # come from untrusted sources.
        root = ET.fromstring(xml_bytes)

        # An empty tag would produce the invalid path ".//", so skip it.
        candidates = ([job_tag] if job_tag else []) + list(self._FALLBACK_TAGS)
        items = []
        for tag in candidates:
            items = root.findall(f".//{tag}")
            if items:
                break

        if not items:
            return pd.DataFrame(), f"No <{job_tag}> elements found in the XML. Tried common alternatives too."

        rows = [
            {child.tag: self.truncate(child.text) for child in item}
            for item in items
        ]
        return self.clean_invalid_numbers(pd.DataFrame(rows)), "Success"

    def _parse_feed(self, content, url, content_type, job_tag):
        """Turn a downloaded payload into a DataFrame without any network I/O.

        Returns the same shapes as ``load_feed_to_dataframe``: a bare
        DataFrame for a successfully parsed JSON feed, otherwise a
        ``(DataFrame, message)`` tuple.
        """
        try:
            # Sniff gzip by its magic bytes instead of trusting the URL
            # suffix: it works with query strings and with payloads that
            # were already decompressed in transit.
            if content[:2] == self._GZIP_MAGIC:
                content = gzip.decompress(content)

            if self._looks_like_json(url, content_type):
                return self._json_to_dataframe(content)
            return self._xml_to_dataframe(content, job_tag)

        except ET.ParseError as e:
            return pd.DataFrame(), f"XML parsing error: {str(e)}"
        except ValueError as e:
            return pd.DataFrame(), f"JSON parsing error: {str(e)}"
        except Exception as e:
            # Boundary handler: the UI shows the message instead of crashing.
            return pd.DataFrame(), f"Unexpected error: {str(e)}"

    def load_feed_to_dataframe(self, url, job_tag="job"):
        """
        Load an XML feed (.xml or .xml.gz) or JSON from a URL and convert to DataFrame.

        Args:
            url (str): URL of the feed
            job_tag (str): Name of the XML tag representing each job (only for XML feeds)

        Returns:
            A bare ``pd.DataFrame`` when a JSON feed was parsed successfully;
            in every other case (XML feeds and all errors) a
            ``(pd.DataFrame, message)`` tuple, where the DataFrame is empty
            on error. Callers must accept both shapes.
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            payload = response.content
            content_type = response.headers.get("Content-Type", "")
        except requests.exceptions.RequestException as e:
            return pd.DataFrame(), f"Request error: {str(e)}"

        return self._parse_feed(payload, url, content_type, job_tag)

    def process_feed(self, url, job_tag="job"):
        """Load a feed and return ``(summary, dataframe, csv_path, preview)``.

        On failure the summary carries the error text, the DataFrame is
        ``None`` and the last two items are empty strings. On success the
        raw feed (plus a ``last_update`` column) is stored on ``self.df``.
        """
        if not url or not url.strip():
            return "Please enter a valid URL", None, "", ""

        # A blank tag falls back to the default rather than an invalid path.
        tag = (job_tag or "").strip() or "job"
        result = self.load_feed_to_dataframe(url.strip(), tag)

        if isinstance(result, tuple):
            df, message = result
        else:
            df, message = result, "Success"

        if df.empty:
            # Covers load errors as well as feeds that parsed to zero records.
            if message == "Success":
                message = "The feed contained no records"
            return f"Error: {message}", None, "", ""

        df["last_update"] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        self.df = df

        df_processed = self._fill_missing(df)

        summary = "\n".join([
            "📊 **Feed Processing Results**",
            "",
            f"✅ **Status:** {message}",
            f"📋 **Rows:** {df_processed.shape[0]:,}",
            f"📝 **Columns:** {df_processed.shape[1]}",
            "",
            "🔍 **Column Names:**",
            ", ".join(map(str, df_processed.columns)),
            "",
            "📈 **Data Types:**",
            df_processed.dtypes.to_string(),
        ])

        return summary, df_processed, self.generate_csv(df_processed), self.get_preview(df_processed)

    def filter_by_column(self, column_name, filter_value):
        """Filter the loaded feed and return ``(summary, dataframe, csv_path)``.

        ``column_name`` is matched case-insensitively. Text columns are
        searched for ``filter_value`` as a literal, case-insensitive
        substring. Other columns are compared for equality when the value
        parses as a number, and searched as text otherwise. On failure the
        summary carries the reason, the DataFrame is ``None`` and the path
        is ``""``.
        """
        if self.df is None:
            return "Please load a feed first", None, ""

        if not column_name or not filter_value:
            return "Please specify both column name and filter value", None, ""

        try:
            available_columns = [str(col) for col in self.df.columns]
            wanted = column_name.strip().lower()
            matching = [col for col in self.df.columns if str(col).lower() == wanted]

            if not matching:
                return f"Column '{column_name}' not found. Available columns: {', '.join(available_columns)}", None, ""

            actual_column = matching[0]
            series = self.df[actual_column]

            def contains_text():
                # regex=False: the user types plain text, so characters such
                # as "+" or "(" must not be read as a pattern. Missing cells
                # never match.
                as_text = series.astype(str)
                hits = as_text.str.contains(filter_value, case=False, na=False, regex=False)
                return series.notna() & hits

            relation = "contains"
            if self._is_textual(series):
                mask = contains_text()
            else:
                try:
                    numeric_value = float(filter_value)
                except ValueError:
                    mask = contains_text()
                else:
                    mask = series == numeric_value
                    relation = "equals"

            filtered_df = self.df[mask]
            if filtered_df.empty:
                return f"No records found matching '{filter_value}' in column '{actual_column}'", None, ""

            filtered_df = self._fill_missing(filtered_df)

            summary = "\n".join([
                "🔍 **Filtered Results**",
                "",
                f"📋 **Matching Rows:** {filtered_df.shape[0]:,}",
                f"🎯 **Filter:** {actual_column} {relation} '{filter_value}'",
            ])

            return summary, filtered_df, self.generate_csv(filtered_df, f"filtered_{filter_value}")

        except Exception as e:
            return f"Error filtering data: {str(e)}", None, ""

    def get_column_stats(self):
        """Describe every column of the loaded feed.

        Returns:
            A DataFrame with one row per column (unique count, null count,
            dtype and either the five most common values or the min/max),
            or a message string if no feed is loaded or the computation fails.
        """
        if self.df is None:
            return "Please load a feed first"

        try:
            stats = []
            for column in self.df.columns:
                series = self.df[column]
                try:
                    unique_values = series.nunique()
                except TypeError:
                    # Unhashable cells (e.g. nested dicts/lists from JSON):
                    # count them by their text form, keeping nulls as nulls.
                    series = series.where(series.isna(), series.astype(str))
                    unique_values = series.nunique()

                if self._is_textual(series):
                    top_values = series.value_counts().head(5)
                    top_values_str = ", ".join(
                        f"{val} ({count})" for val, count in top_values.items()
                    )
                else:
                    top_values_str = f"Min: {series.min()}, Max: {series.max()}"

                stats.append({
                    'Column': column,
                    'Unique Values': int(unique_values),
                    'Null Values': int(self.df[column].isnull().sum()),
                    'Data Type': str(self.df[column].dtype),
                    'Top Values/Range': top_values_str
                })

            return pd.DataFrame(stats)

        except Exception as e:
            return f"Error generating statistics: {str(e)}"

    def generate_csv(self, df, filename_prefix="feed"):
        """Write ``df`` to a temporary CSV file and return its path.

        Returns ``None`` when there is nothing to write. The file is created
        with ``delete=False`` so it outlives this call; removing it is left
        to the caller.
        """
        if df is None or df.empty:
            return None

        # The prefix may contain user input (the filter value): strip path
        # separators and other unsafe characters and cap its length so it
        # cannot escape the temp directory or exceed file-name limits.
        safe_prefix = re.sub(r"[^\w.-]+", "_", str(filename_prefix))[:50] or "feed"

        # Write through the open handle rather than re-opening the path,
        # which fails on platforms that lock open files.
        with tempfile.NamedTemporaryFile(
            mode='w', suffix='.csv', prefix=f'{safe_prefix}_',
            delete=False, newline='', encoding='utf-8',
        ) as handle:
            df.to_csv(handle, index=False)
        return handle.name

    def get_preview(self, df, max_rows=10):
        """Return a Markdown code block previewing the first ``max_rows`` rows.

        Text cells longer than 50 characters are shortened and at most eight
        columns are printed.
        """
        if df is None or df.empty:
            return "No data to preview"

        preview_df = df.head(max_rows).copy()
        for col in preview_df.columns:
            column = preview_df[col]
            if self._is_textual(column):
                preview_df[col] = column.astype(str).map(
                    lambda text: text[:50] + '...' if len(text) > 50 else text
                )

        preview = preview_df.to_string(max_cols=8, max_rows=max_rows, show_dimensions=True)
        return f"**Data Preview (First {min(max_rows, len(df))} rows):**\n```\n{preview}\n```"
253
+
254
# Single shared FeedReader; it remembers the last loaded feed between UI actions.
feed_reader = FeedReader()
256
+
257
# Create Gradio interface
def create_gradio_app():
    """Build the Gradio Blocks UI around the shared ``feed_reader``.

    The UI has three tabs (load, filter, statistics) followed by usage
    instructions.

    Returns:
        The assembled ``gr.Blocks`` app; the caller is responsible for
        launching it.
    """
    with gr.Blocks(title="Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
        gr.Markdown("""
        # 📡 Feed Reader & Analyzer

        Load and analyze XML or JSON feeds from URLs. Supports compressed files (.gz) and various data formats.
        """)

        with gr.Tab("📥 Load Feed"):
            with gr.Row():
                with gr.Column():
                    url_input = gr.Textbox(
                        label="Feed URL",
                        placeholder="https://example.com/feed.xml",
                        lines=1
                    )
                    job_tag_input = gr.Textbox(
                        label="XML Job Tag (for XML feeds only)",
                        value="job",
                        placeholder="job, item, entry, etc."
                    )
                    load_btn = gr.Button("🔄 Load Feed", variant="primary")

                with gr.Column():
                    summary_output = gr.Markdown(label="Summary")

            with gr.Row():
                preview_output = gr.Markdown(label="Data Preview")

            with gr.Row():
                csv_download = gr.File(label="📥 Download Full Dataset (CSV)", visible=True)

            # Load feed functionality
            def process_and_download(url, job_tag):
                summary, _df_processed, csv_file, preview = feed_reader.process_feed(url, job_tag)
                # On failure the reader reports "" for the CSV path; hand the
                # File component None so it is cleared instead of being given
                # an empty path.
                return summary, preview, csv_file or None

            load_btn.click(
                process_and_download,
                inputs=[url_input, job_tag_input],
                outputs=[summary_output, preview_output, csv_download]
            )

        with gr.Tab("🔍 Filter Data"):
            with gr.Row():
                with gr.Column():
                    filter_column = gr.Textbox(
                        label="Column Name",
                        placeholder="e.g., clientname, title, category"
                    )
                    filter_value = gr.Textbox(
                        label="Filter Value",
                        placeholder="Value to search for"
                    )
                    filter_btn = gr.Button("🔍 Filter", variant="primary")

                with gr.Column():
                    filter_summary = gr.Markdown(label="Filter Results")

            with gr.Row():
                # Must be visible: nothing ever toggled the hidden component
                # on, so the filtered CSV could never be downloaded.
                filtered_csv = gr.File(label="📥 Download Filtered Data (CSV)", visible=True)

            # Filter functionality
            def filter_and_download(column_name, filter_value):
                summary, _df_filtered, csv_file = feed_reader.filter_by_column(column_name, filter_value)
                return summary, csv_file or None

            filter_btn.click(
                filter_and_download,
                inputs=[filter_column, filter_value],
                outputs=[filter_summary, filtered_csv]
            )

        with gr.Tab("📊 Statistics"):
            with gr.Column():
                stats_btn = gr.Button("📊 Generate Column Statistics", variant="primary")
                stats_output = gr.Dataframe(label="Column Statistics")

            # Statistics functionality
            def column_stats():
                stats = feed_reader.get_column_stats()
                # The reader reports problems ("Please load a feed first",
                # errors) as plain strings; wrap them so the Dataframe
                # component always receives tabular data.
                if isinstance(stats, str):
                    return pd.DataFrame({"Message": [stats]})
                return stats

            stats_btn.click(
                column_stats,
                outputs=[stats_output]
            )

        gr.Markdown("""
        ---
        ### 📝 Instructions:

        1. **Load Feed**: Enter a URL pointing to an XML or JSON feed and click "Load Feed"
        2. **Filter Data**: Use column names to filter the loaded data
        3. **Statistics**: View detailed statistics about each column in your dataset
        4. **Download**: CSV files are automatically generated for download

        **Supported Formats:**
        - XML files (.xml, .xml.gz)
        - JSON files (.json)
        - REST APIs returning JSON

        **Features:**
        - Automatic format detection
        - Data cleaning and validation
        - Column-based filtering
        - Statistical analysis
        - CSV export functionality
        """)

    return app
365
+
366
# Script entry point: build the UI, then serve it with a public share link
# and debug output enabled.
if __name__ == "__main__":
    app = create_gradio_app()
    app.launch(debug=True, share=True)