Spaces:

rogergs94
/

feed

Sleeping

App Files Files Community

rogergs94 committed on Sep 18, 2025

Commit

29b8dcc

verified ·

1 Parent(s): b0efca2

Created feed.py

Browse files

Files changed (1) hide show

app.py +369 -0

app.py ADDED Viewed

	@@ -0,0 +1,369 @@

+import pandas as pd
+import requests
+import xml.etree.ElementTree as ET
+import numpy as np
+from io import BytesIO, StringIO
+import gzip
+import datetime
+import gradio as gr
+import os
+class FeedReader:
+    """Load a job feed (XML, gzipped XML or JSON) from a URL into a pandas DataFrame.
+    The DataFrame loaded by the last successful ``process_feed`` call is kept in
+    ``self.df`` so that ``filter_by_column`` and ``get_column_stats`` can use it.
+    """
+    def __init__(self):
+        # Most recently loaded feed; stays None until process_feed succeeds.
+        self.df = None
+    @staticmethod
+    def truncate(value, max_length=49000):
+        """Cut string values longer than ``max_length``; return anything else unchanged."""
+        if value and isinstance(value, str) and len(value) > max_length:
+            return value[:max_length]
+        return value
+    @staticmethod
+    def clean_invalid_numbers(df):
+        """Return a new DataFrame in which NaN and infinite floats are all NaN."""
+        return df.apply(lambda col: col.map(
+            lambda x: np.nan if isinstance(x, float) and (np.isnan(x) or np.isinf(x)) else x
+        ))
+    def load_feed_to_dataframe(self, url, job_tag="job"):
+        """
+        Load an XML feed (.xml or .xml.gz) or JSON from a URL and convert to DataFrame.
+        Args:
+            url (str): URL of the feed
+            job_tag (str): Name of the XML tag representing each job (only for XML feeds)
+        Returns:
+            A bare pd.DataFrame when a JSON feed was parsed, otherwise a
+            ``(pd.DataFrame, message)`` tuple: ``(df, "Success")`` for XML feeds and
+            ``(empty DataFrame, description)`` for every failure. Callers have to
+            accept both shapes, as ``process_feed`` does.
+        """
+        try:
+            response = requests.get(url, timeout=30)
+            response.raise_for_status()
+            content_type = response.headers.get("Content-Type", "").lower()
+            # Test the extension on the path only so "?query" or "#fragment"
+            # suffixes do not hide it.
+            url_path = url.split("#", 1)[0].split("?", 1)[0].lower()
+            is_json = ("application/json" in content_type or
+                       url_path.endswith(".json") or
+                       "rest-api" in url.lower())
+            if is_json:
+                try:
+                    data = response.json()
+                except ValueError as e:
+                    # requests' JSON decode error is also a RequestException;
+                    # catch it here so it is reported as a parsing problem.
+                    return pd.DataFrame(), f"JSON parsing error: {str(e)}"
+                # Handle different JSON formats
+                if isinstance(data, dict) and "jobs" in data:
+                    df = pd.DataFrame(data["jobs"])
+                elif isinstance(data, list):
+                    df = pd.DataFrame(data)
+                else:
+                    # Any other single object becomes a one-row frame
+                    df = pd.DataFrame([data])
+                # Truncate long strings column by column (truncate ignores
+                # non-strings), then normalise invalid floats.
+                df = df.apply(lambda col: col.map(self.truncate))
+                df = self.clean_invalid_numbers(df)
+                return df
+            # If not JSON, treat as XML. Gzip is recognised by its magic bytes
+            # rather than by the URL suffix, so it still works when the URL has
+            # a query string or the body was already decompressed in transit.
+            xml_content = response.content
+            if xml_content[:2] == b"\x1f\x8b":
+                xml_content = gzip.decompress(xml_content)
+            root = ET.fromstring(xml_content)
+            # Requested tag first, then common alternatives; an empty tag is
+            # skipped instead of producing an invalid search path.
+            candidate_tags = [job_tag] if job_tag else []
+            candidate_tags += [tag for tag in ("item", "entry", "record", "row") if tag != job_tag]
+            items = []
+            for tag in candidate_tags:
+                items = root.findall(f".//{tag}")
+                if items:
+                    break
+            if not items:
+                return pd.DataFrame(), f"No <{job_tag}> elements found in the XML. Tried common alternatives too."
+            # One row per element, one column per direct child tag
+            jobs_data = [
+                {child.tag: self.truncate(child.text) for child in job}
+                for job in items
+            ]
+            df = pd.DataFrame(jobs_data)
+            df = self.clean_invalid_numbers(df)
+            return df, "Success"
+        except requests.exceptions.RequestException as e:
+            return pd.DataFrame(), f"Request error: {str(e)}"
+        except ET.ParseError as e:
+            return pd.DataFrame(), f"XML parsing error: {str(e)}"
+        except ValueError as e:
+            return pd.DataFrame(), f"JSON parsing error: {str(e)}"
+        except Exception as e:
+            return pd.DataFrame(), f"Unexpected error: {str(e)}"
+    def process_feed(self, url, job_tag="job"):
+        """
+        Load a feed, remember it in ``self.df`` and build the UI outputs.
+        Returns:
+            tuple: ``(summary_markdown, processed_df, csv_path, preview_markdown)``;
+            on failure ``(message, None, "", "")``.
+        """
+        if not url or not url.strip():
+            return "Please enter a valid URL", None, "", ""
+        # Load the feed (job_tag may arrive as None/empty from the UI)
+        result = self.load_feed_to_dataframe(url.strip(), (job_tag or "").strip())
+        if isinstance(result, tuple):
+            df, message = result
+            if df.empty:
+                return f"Error: {message}", None, "", ""
+        else:
+            df = result
+            message = "Success"
+        # Store the dataframe
+        self.df = df
+        # Add timestamp
+        df['last_update'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        # Fill NaN values with 0 (with future-proof pandas handling)
+        df_processed = df.fillna(0).infer_objects(copy=False)
+        # Generate summary
+        summary = f"""
+📊 **Feed Processing Results**
+✅ **Status:** {message}
+📋 **Rows:** {df_processed.shape[0]:,}
+📝 **Columns:** {df_processed.shape[1]}
+🔍 **Column Names:**
+{', '.join(map(str, df_processed.columns.tolist()))}
+📈 **Data Types:**
+{df_processed.dtypes.to_string()}
+        """
+        return summary, df_processed, self.generate_csv(df_processed), self.get_preview(df_processed)
+    def filter_by_column(self, column_name, filter_value):
+        """
+        Filter the loaded feed by a column value.
+        The column name is matched case-insensitively. Text columns use a
+        case-insensitive literal substring match; other columns use numeric
+        equality when the value parses as a number, else a substring match on
+        the stringified values.
+        Returns:
+            tuple: ``(summary_markdown, filtered_df, csv_path)``; on failure
+            ``(message, None, "")``.
+        """
+        if self.df is None:
+            return "Please load a feed first", None, ""
+        if not column_name or not filter_value:
+            return "Please specify both column name and filter value", None, ""
+        try:
+            # Check if column exists (case insensitive); str() keeps
+            # non-string column labels from breaking the comparison.
+            available_columns = self.df.columns.tolist()
+            wanted = column_name.strip().lower()
+            matching_columns = [col for col in available_columns if str(col).lower() == wanted]
+            if not matching_columns:
+                return f"Column '{column_name}' not found. Available columns: {', '.join(map(str, available_columns))}", None, ""
+            actual_column = matching_columns[0]
+            column = self.df[actual_column]
+            # regex=False: the user types a plain search term, so characters
+            # such as "+", "(" or "." must be matched literally.
+            if column.dtype == 'object':  # String column
+                mask = column.str.contains(filter_value, na=False, case=False, regex=False)
+            else:  # Numeric column
+                try:
+                    mask = column == float(filter_value)
+                except ValueError:
+                    mask = column.astype(str).str.contains(filter_value, na=False, case=False, regex=False)
+            filtered_df = self.df[mask]
+            if filtered_df.empty:
+                return f"No records found matching '{filter_value}' in column '{actual_column}'", None, ""
+            filtered_df = filtered_df.fillna(0).infer_objects(copy=False)
+            summary = f"""
+🔍 **Filtered Results**
+📋 **Matching Rows:** {filtered_df.shape[0]:,}
+🎯 **Filter:** {actual_column} contains '{filter_value}'
+            """
+            return summary, filtered_df, self.generate_csv(filtered_df, f"filtered_{filter_value}")
+        except Exception as e:
+            return f"Error filtering data: {str(e)}", None, ""
+    def get_column_stats(self):
+        """
+        Summarise every column of the loaded feed.
+        Returns:
+            pd.DataFrame with one row per column (unique count, null count,
+            dtype, and top-5 values for object columns or min/max otherwise),
+            or an error message string if no feed is loaded or the
+            computation fails.
+        """
+        if self.df is None:
+            return "Please load a feed first"
+        try:
+            stats = []
+            for column in self.df.columns:
+                series = self.df[column]
+                # Get top 5 most common values
+                if series.dtype == 'object':
+                    top_values = series.value_counts().head(5)
+                    top_values_str = ", ".join(f"{val} ({count})" for val, count in top_values.items())
+                else:
+                    top_values_str = f"Min: {series.min()}, Max: {series.max()}"
+                stats.append({
+                    'Column': column,
+                    'Unique Values': int(series.nunique()),
+                    'Null Values': int(series.isnull().sum()),
+                    'Data Type': str(series.dtype),
+                    'Top Values/Range': top_values_str
+                })
+            return pd.DataFrame(stats)
+        except Exception as e:
+            return f"Error generating statistics: {str(e)}"
+    def generate_csv(self, df, filename_prefix="feed"):
+        """
+        Write ``df`` to a temporary CSV file for download.
+        Args:
+            df: DataFrame to export
+            filename_prefix: Text used at the start of the file name; anything
+                other than word characters, "." and "-" is replaced by "_"
+        Returns:
+            Path of the CSV file, or None when ``df`` is None or empty. The file
+            is not deleted automatically.
+        """
+        if df is None or df.empty:
+            return None
+        import re
+        import tempfile
+        # The prefix can contain user input (the filter value); path separators
+        # or other special characters would make the temp path invalid.
+        safe_prefix = re.sub(r"[^\w.-]+", "_", str(filename_prefix))[:50] or "feed"
+        # Write through the open handle instead of re-opening the file by name,
+        # which fails on platforms that lock open files.
+        with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False,
+                                         prefix=f'{safe_prefix}_', newline='',
+                                         encoding='utf-8') as temp_file:
+            df.to_csv(temp_file, index=False)
+        return temp_file.name
+    def get_preview(self, df, max_rows=10):
+        """Return a Markdown code block showing the first ``max_rows`` rows of ``df``."""
+        if df is None or df.empty:
+            return "No data to preview"
+        # Limit the preview to avoid overwhelming display; copy so the
+        # shortening below never touches the caller's frame.
+        preview_df = df.head(max_rows).copy()
+        # Truncate long string values for better display
+        for col in preview_df.select_dtypes(include=['object']).columns:
+            preview_df[col] = preview_df[col].astype(str).apply(
+                lambda x: x[:50] + '...' if len(x) > 50 else x
+            )
+        preview = preview_df.to_string(max_cols=8, max_rows=max_rows, show_dimensions=True)
+        return f"**Data Preview (First {min(max_rows, len(df))} rows):**\n```\n{preview}\n```"
+# Module-level FeedReader used by the Gradio callbacks defined below.
+# It keeps the most recently loaded feed in `feed_reader.df`.
+feed_reader = FeedReader()
+# Create Gradio interface
+def create_gradio_app():
+    """Build the Gradio Blocks UI (load, filter and statistics tabs).
+    All callbacks delegate to the module-level ``feed_reader``.
+    Returns:
+        gr.Blocks: the assembled, not yet launched, application
+    """
+    with gr.Blocks(title="Feed Reader & Analyzer", theme=gr.themes.Soft()) as app:
+        gr.Markdown("""
+        # 📡 Feed Reader & Analyzer
+        Load and analyze XML or JSON feeds from URLs. Supports compressed files (.gz) and various data formats.
+        """)
+        with gr.Tab("📥 Load Feed"):
+            with gr.Row():
+                with gr.Column():
+                    url_input = gr.Textbox(
+                        label="Feed URL",
+                        placeholder="https://example.com/feed.xml",
+                        lines=1
+                    )
+                    job_tag_input = gr.Textbox(
+                        label="XML Job Tag (for XML feeds only)",
+                        value="job",
+                        placeholder="job, item, entry, etc."
+                    )
+                    load_btn = gr.Button("🔄 Load Feed", variant="primary")
+                with gr.Column():
+                    summary_output = gr.Markdown(label="Summary")
+            with gr.Row():
+                preview_output = gr.Markdown(label="Data Preview")
+            with gr.Row():
+                csv_download = gr.File(label="📥 Download Full Dataset (CSV)", visible=True)
+            # Load feed functionality
+            def process_and_download(url, job_tag):
+                summary, df_processed, csv_file, preview = feed_reader.process_feed(url, job_tag)
+                # process_feed reports "no file" as ""; the File component
+                # needs None for an empty value.
+                return summary, preview, csv_file or None
+            load_btn.click(
+                process_and_download,
+                inputs=[url_input, job_tag_input],
+                outputs=[summary_output, preview_output, csv_download]
+            )
+        with gr.Tab("🔍 Filter Data"):
+            with gr.Row():
+                with gr.Column():
+                    filter_column = gr.Textbox(
+                        label="Column Name",
+                        placeholder="e.g., clientname, title, category"
+                    )
+                    filter_value = gr.Textbox(
+                        label="Filter Value",
+                        placeholder="Value to search for"
+                    )
+                    filter_btn = gr.Button("🔍 Filter", variant="primary")
+                with gr.Column():
+                    filter_summary = gr.Markdown(label="Filter Results")
+            with gr.Row():
+                # Hidden until a filter run actually produces a file
+                filtered_csv = gr.File(label="📥 Download Filtered Data (CSV)", visible=False)
+            # Filter functionality
+            def filter_and_download(column_name, filter_value):
+                summary, df_filtered, csv_file = feed_reader.filter_by_column(column_name, filter_value)
+                # Reveal the download only when there is a file; otherwise
+                # clear and hide it so a stale result is not offered.
+                return summary, gr.update(value=csv_file or None, visible=bool(csv_file))
+            filter_btn.click(
+                filter_and_download,
+                inputs=[filter_column, filter_value],
+                outputs=[filter_summary, filtered_csv]
+            )
+        with gr.Tab("📊 Statistics"):
+            with gr.Column():
+                stats_btn = gr.Button("📊 Generate Column Statistics", variant="primary")
+                stats_output = gr.Dataframe(label="Column Statistics")
+            # Statistics functionality
+            def show_column_stats():
+                stats = feed_reader.get_column_stats()
+                # get_column_stats returns a plain message when no feed is
+                # loaded or on failure; wrap it so the table can show it.
+                if isinstance(stats, str):
+                    return pd.DataFrame({"Message": [stats]})
+                return stats
+            stats_btn.click(
+                show_column_stats,
+                outputs=[stats_output]
+            )
+        gr.Markdown("""
+        ---
+        ### 📝 Instructions:
+        1. **Load Feed**: Enter a URL pointing to an XML or JSON feed and click "Load Feed"
+        2. **Filter Data**: Use column names to filter the loaded data
+        3. **Statistics**: View detailed statistics about each column in your dataset
+        4. **Download**: CSV files are automatically generated for download
+        **Supported Formats:**
+        - XML files (.xml, .xml.gz)
+        - JSON files (.json)
+        - REST APIs returning JSON
+        **Features:**
+        - Automatic format detection
+        - Data cleaning and validation
+        - Column-based filtering
+        - Statistical analysis
+        - CSV export functionality
+        """)
+    return app
+# Script entry point: build the interface, then serve it
+if __name__ == "__main__":
+    app = create_gradio_app()
+    launch_options = {"share": True, "debug": True}
+    app.launch(**launch_options)