Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from pygwalker.api.streamlit import StreamlitRenderer | |
| from io import BytesIO | |
| import requests | |
| import streamlit as st | |
| from pymongo import MongoClient | |
| import os | |
| from dotenv import load_dotenv | |
| import json | |
| # Load environment variables | |
| load_dotenv() | |
| MONGO_URI = os.getenv("MONGO_URI") | |
| DB_NAME = os.getenv("DB_NAME") | |
| COLLECTION_NAME = os.getenv("COLLECTION_NAME") | |
| mongo_client = MongoClient(MONGO_URI) | |
| db = mongo_client[DB_NAME] | |
| collection = db[COLLECTION_NAME] | |
| # Load the CSV from a URL (replace with actual CSV download from S3) | |
| def load_csv_from_url(object_url): | |
| response = requests.get(object_url) | |
| response.raise_for_status() # Ensure the request was successful | |
| csv_data = pd.read_csv(BytesIO(response.content)) | |
| return csv_data | |
| # Analyzing each column based on data type | |
| def analyze_column_data(df): | |
| analysis = {} | |
| for col in df.columns: | |
| if pd.api.types.is_numeric_dtype(df[col]): | |
| analysis[col] = { | |
| "Mean": df[col].mean(), | |
| "Median": df[col].median(), | |
| "Mode": df[col].mode()[0] if not df[col].mode().empty else None, | |
| "Unique Values": df[col].nunique(), | |
| "Null Values": df[col].isnull().sum() | |
| } | |
| else: | |
| analysis[col] = { | |
| "Unique Values": df[col].nunique(), | |
| "Null Values": df[col].isnull().sum(), | |
| "Top Categories": df[col].value_counts().head(5).to_dict() | |
| } | |
| return analysis | |
| # Main function to render the View Table Analysis page | |
| def view_table_analysis_page(url): | |
| if st.button("Back",key="back_button"): | |
| st.session_state.page="view_image" | |
| st.rerun() | |
| image=collection.find_one({"object_url":url}) | |
| csv_url=image.get("csv_object_url") | |
| # Load CSV data | |
| df = load_csv_from_url(csv_url) | |
| # Check if the last row has any cell containing the word "total" (case-insensitive) | |
| if df.iloc[-1].apply(lambda x: "total" in str(x).lower()).any(): | |
| df = df.iloc[:-1] # Drop the last row if "total" is found in any cell | |
| # Page title | |
| st.title("Table Analysis") | |
| # CSV Preview | |
| st.subheader("CSV Preview") | |
| st.write("Below is a preview of the uploaded CSV file:") | |
| st.dataframe(df) # Interactive, scrollable table | |
| # Download Button | |
| excel_buffer = BytesIO() | |
| with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer: | |
| df.to_excel(writer, index=False, sheet_name="Sheet1") | |
| excel_buffer.seek(0) # Reset buffer position | |
| # Download Button | |
| st.download_button( | |
| label="Download Full Excel Sheet", | |
| data=excel_buffer, | |
| file_name="table_data.xlsx", | |
| mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | |
| ) | |
| st.markdown("<hr>", unsafe_allow_html=True) | |
| table_description=image.get("table_data").get("description",None) | |
| if table_description: | |
| # Table Description | |
| st.subheader("Table Description") | |
| st.write(table_description) | |
| # Column Summary | |
| st.markdown("<hr>",unsafe_allow_html=True) | |
| st.subheader("Column Summary") | |
| with st.container(height=400, border=False): | |
| column_summary = image.get("table_data").get("column_summary", None) | |
| if column_summary: | |
| # Column-level descriptions and analysis | |
| column_analysis = analyze_column_data(df) | |
| col1, col2 = st.columns(2) | |
| for idx, (col_name, col_description) in enumerate(column_summary.items()): | |
| # Determine which column to use based on the index | |
| with col1 if idx % 2 == 0 else col2: | |
| st.markdown(f"Column Name : **{col_name}**") | |
| st.write(f"Column Description : {col_description}") | |
| # Display basic analysis | |
| analysis = column_analysis.get(col_name, {}) | |
| if pd.api.types.is_numeric_dtype(df[col_name]): | |
| # Numeric column analysis | |
| st.write({ | |
| "Mean": analysis.get("Mean"), | |
| "Median": analysis.get("Median"), | |
| "Mode": analysis.get("Mode"), | |
| "Unique Values": analysis.get("Unique Values"), | |
| "Null Values": analysis.get("Null Values") | |
| }) | |
| else: | |
| # Categorical column analysis | |
| st.write({ | |
| "Unique Values": analysis.get("Unique Values"), | |
| "Null Values": analysis.get("Null Values"), | |
| "Top Categories": analysis.get("Top Categories") | |
| }) | |
| st.markdown("<hr>", unsafe_allow_html=True) | |
| st.subheader("Graphical Analysis of Table") | |
| # Default configuration for initial visualization | |
| best_col1=image.get("table_data").get("best_col1") | |
| best_col2 = image.get("table_data").get("best_col2") | |
| default_chart_config = { | |
| "mark": "bar", | |
| "encoding": { | |
| "x": {"field": best_col1, "type": "nominal"}, | |
| "y": {"field": best_col2, "type": "quantitative"} | |
| } | |
| } | |
| # Convert default_chart_config to JSON string for Pygwalker spec parameter | |
| pyg_app = StreamlitRenderer(df, spec=json.dumps(default_chart_config)) | |
| pyg_app.explorer() | |