Spaces:

shamilcoded
/

Data-Excel

Sleeping

App Files Community

SHAMIL SHAHBAZ AWAN committed on Jan 1, 2025

Commit

0779acb

verified ·

1 Parent(s): f45cec7

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -21

app.py CHANGED Viewed

@@ -2,7 +2,6 @@ import streamlit as st
 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
-import os
 from io import StringIO
 from transformers import pipeline
@@ -16,7 +15,8 @@ def load_file(uploaded_file):
         if uploaded_file.type == "text/csv":
             data = pd.read_csv(uploaded_file)
         elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-            data = pd.read_excel(uploaded_file)
         else:
             st.error("Unsupported file type.")
             return None
@@ -25,17 +25,9 @@ def load_file(uploaded_file):
         st.error(f"Error loading file: {e}")
         return None
-# Function to infer column names based on synonyms
-def infer_column(data, synonyms):
-    """Infer a column name based on synonyms."""
-    for column in data.columns:
-        if column.lower() in synonyms:
-            return column
-    return None
 # Function to classify the user query
 def classify_query(query, candidate_labels):
-    """Classify the user query into graph types."""
     results = nlp(query, candidate_labels)
     if results:
         return results['labels'][0]
@@ -47,17 +39,23 @@ def generate_graph(data, query):
     try:
         fig, ax = plt.subplots(figsize=(10, 6))
-        # Infer column names dynamically
         numerical_columns = data.select_dtypes(include=['number']).columns.tolist()
         categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
         datetime_columns = data.select_dtypes(include=['datetime']).columns.tolist()
         # Define possible graph types
-        candidate_labels = ["bar chart", "line chart", "scatter plot", "histogram"]
         query_type = classify_query(query, candidate_labels)
         if query_type == "bar chart" and categorical_columns and numerical_columns:
-            # Bar chart for categorical vs numerical
             x_col = st.selectbox("Select the categorical column:", categorical_columns)
             y_col = st.selectbox("Select the numerical column:", numerical_columns)
             aggregated_data = data[[x_col, y_col]].groupby(x_col).sum().reset_index()
@@ -67,7 +65,7 @@ def generate_graph(data, query):
             st.pyplot(fig)
         elif query_type == "line chart" and datetime_columns and numerical_columns:
-            # Line chart for numerical trend over time
             x_col = st.selectbox("Select the datetime column:", datetime_columns)
             y_col = st.selectbox("Select the numerical column:", numerical_columns)
             data[x_col] = pd.to_datetime(data[x_col])
@@ -77,7 +75,7 @@ def generate_graph(data, query):
             st.pyplot(fig)
         elif query_type == "scatter plot" and len(numerical_columns) >= 2:
-            # Scatter plot for numerical relationships
             x_col = st.selectbox("Select the x-axis numerical column:", numerical_columns)
             y_col = st.selectbox("Select the y-axis numerical column:", numerical_columns)
             sns.scatterplot(x=x_col, y=y_col, data=data, ax=ax)
@@ -85,17 +83,44 @@ def generate_graph(data, query):
             st.pyplot(fig)
         elif query_type == "histogram" and numerical_columns:
-            # Histogram for a numerical column
             hist_col = st.selectbox("Select the numerical column:", numerical_columns)
             sns.histplot(data[hist_col], bins=20, kde=True, ax=ax, color='green')
             ax.set_title(f"Histogram of {hist_col}")
             st.pyplot(fig)
         else:
-            st.error("Unsupported graph type or insufficient data. Try asking for a bar chart, line chart, scatter plot, or histogram.")
     except Exception as e:
         st.error(f"Error generating graph: {e}")
 # Streamlit App Interface
 def main():
     st.set_page_config(page_title="Data Visualization App", page_icon="📊", layout="wide")
@@ -123,13 +148,17 @@ def main():
         data = load_file(uploaded_file)
         if data is not None:
             st.write("Dataset preview:", data.head())
-            # User input for graph generation
-            query = st.text_input("Enter your query (e.g., 'Generate a bar chart for countries and gross sales')")
             if query:
-                # Generate the graph based on the query
                 generate_graph(data, query)
 if __name__ == "__main__":

 import pandas as pd
 import matplotlib.pyplot as plt
 import seaborn as sns
 from io import StringIO
 from transformers import pipeline
         if uploaded_file.type == "text/csv":
             data = pd.read_csv(uploaded_file)
         elif uploaded_file.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
+            # Load all sheets if it's an Excel file
+            data = pd.read_excel(uploaded_file, sheet_name=None)  # Load all sheets into a dictionary
         else:
             st.error("Unsupported file type.")
             return None
         st.error(f"Error loading file: {e}")
         return None
 # Function to classify the user query
 def classify_query(query, candidate_labels):
+    """Classify the user query into graph types or general analysis queries."""
     results = nlp(query, candidate_labels)
     if results:
         return results['labels'][0]
     try:
         fig, ax = plt.subplots(figsize=(10, 6))
+        # Extract columns from data (if it's a dictionary of sheets, flatten it)
+        if isinstance(data, dict):
+            data = pd.concat(data.values(), ignore_index=True)  # Combine all sheets into a single dataframe
+        # Infer column types
         numerical_columns = data.select_dtypes(include=['number']).columns.tolist()
         categorical_columns = data.select_dtypes(include=['object', 'category']).columns.tolist()
         datetime_columns = data.select_dtypes(include=['datetime']).columns.tolist()
         # Define possible graph types
+        candidate_labels = ["bar chart", "line chart", "scatter plot", "histogram", "sales question"]
         query_type = classify_query(query, candidate_labels)
+        # Provide text-based query response
+        response = ""
         if query_type == "bar chart" and categorical_columns and numerical_columns:
+            response = f"Generating a bar chart for {query}"
             x_col = st.selectbox("Select the categorical column:", categorical_columns)
             y_col = st.selectbox("Select the numerical column:", numerical_columns)
             aggregated_data = data[[x_col, y_col]].groupby(x_col).sum().reset_index()
             st.pyplot(fig)
         elif query_type == "line chart" and datetime_columns and numerical_columns:
+            response = f"Generating a line chart for {query}"
             x_col = st.selectbox("Select the datetime column:", datetime_columns)
             y_col = st.selectbox("Select the numerical column:", numerical_columns)
             data[x_col] = pd.to_datetime(data[x_col])
             st.pyplot(fig)
         elif query_type == "scatter plot" and len(numerical_columns) >= 2:
+            response = f"Generating a scatter plot for {query}"
             x_col = st.selectbox("Select the x-axis numerical column:", numerical_columns)
             y_col = st.selectbox("Select the y-axis numerical column:", numerical_columns)
             sns.scatterplot(x=x_col, y=y_col, data=data, ax=ax)
             st.pyplot(fig)
         elif query_type == "histogram" and numerical_columns:
+            response = f"Generating a histogram for {query}"
             hist_col = st.selectbox("Select the numerical column:", numerical_columns)
             sns.histplot(data[hist_col], bins=20, kde=True, ax=ax, color='green')
             ax.set_title(f"Histogram of {hist_col}")
             st.pyplot(fig)
+        elif query_type == "sales question":
+            # General sales-related question (e.g., "Which department has the most sales?")
+            response = "Analyzing the sales data for your query."
+            # Assuming the file has columns like "Department" and "Sales"
+            department_column = infer_column(data, ["department", "dept"])
+            sales_column = infer_column(data, ["sales", "revenue"])
+            if department_column and sales_column:
+                # Answer the query: Which department has the most sales?
+                top_department = data.groupby(department_column)[sales_column].sum().idxmax()
+                top_sales = data.groupby(department_column)[sales_column].sum().max()
+                response += f" The department with the most sales is {top_department} with total sales of {top_sales:.2f}."
+            else:
+                response += " Could not find relevant 'department' or 'sales' columns in the dataset."
         else:
+            response = "Unsupported graph type or insufficient data. Try asking for a bar chart, line chart, scatter plot, histogram, or sales-related question."
+        # Show text-based response
+        st.write(response)
     except Exception as e:
         st.error(f"Error generating graph: {e}")
+# Helper function to infer column names based on synonyms
+def infer_column(data, synonyms):
+    """Infer a column name based on synonyms."""
+    for column in data.columns:
+        if column.lower() in synonyms:
+            return column
+    return None
 # Streamlit App Interface
 def main():
     st.set_page_config(page_title="Data Visualization App", page_icon="📊", layout="wide")
         data = load_file(uploaded_file)
         if data is not None:
+            if isinstance(data, dict):  # For Excel with multiple sheets
+                st.write("Sheets in Excel file:", list(data.keys()))
+                sheet_name = st.selectbox("Select a sheet", list(data.keys()))
+                data = data[sheet_name]  # Use the selected sheet
             st.write("Dataset preview:", data.head())
+            # User input for graph generation or general questions
+            query = st.text_input("Enter your query (e.g., 'Generate a bar chart for countries and gross sales', or 'Which department has the most sales?')")
             if query:
+                # Generate the graph based on the query or handle general questions
                 generate_graph(data, query)
 if __name__ == "__main__":