Spaces:

youdata-ai
/

MOSPI_analysis_tool

Sleeping

App Files Files Community

MOSPI_analysis_tool / table_analysis_for_image.py

akshansh36

Upload 10 files

eef9e83 verified about 1 year ago

raw

history blame contribute delete

5.59 kB

	import pandas as pd
	from pygwalker.api.streamlit import StreamlitRenderer
	from io import BytesIO
	import requests
	import streamlit as st
	from pymongo import MongoClient
	import os
	from dotenv import load_dotenv
	import json

	# Populate the process environment first so the lookups below can see
	# whatever load_dotenv() provides.
	load_dotenv()
	MONGO_URI = os.environ.get("MONGO_URI")
	DB_NAME = os.environ.get("DB_NAME")
	COLLECTION_NAME = os.environ.get("COLLECTION_NAME")

	# Module-level MongoDB handles; `collection` is the one queried by the
	# page-rendering function further down in this file.
	mongo_client = MongoClient(MONGO_URI)
	db = mongo_client[DB_NAME]
	collection = db[COLLECTION_NAME]




	def load_csv_from_url(object_url, timeout=30):
	    """Download a CSV file over HTTP(S) and parse it into a DataFrame.

	    Args:
	        object_url: URL of the CSV object to fetch.
	        timeout: Seconds to wait for the server before giving up. It is
	            passed straight to ``requests.get``; without it a stalled
	            server would block the caller indefinitely.

	    Returns:
	        pandas.DataFrame parsed from the response body.

	    Raises:
	        requests.HTTPError: If the server answers with an error status.
	        requests.Timeout: If the server does not respond within ``timeout``.
	    """
	    response = requests.get(object_url, timeout=timeout)
	    # Surface 4xx/5xx responses instead of trying to parse an error page.
	    response.raise_for_status()
	    return pd.read_csv(BytesIO(response.content))


	def _to_native(value):
	    """Return the plain Python equivalent of a NumPy scalar; pass anything else through."""
	    # Zero-dimensional objects exposing .item() are NumPy scalars; Series and
	    # DataFrames also have .item but report ndim >= 1, so they are left alone.
	    if hasattr(value, "item") and getattr(value, "ndim", None) == 0:
	        return value.item()
	    return value


	def analyze_column_data(df):
	    """Compute a small summary for every column of ``df``.

	    Numeric columns (as judged by ``pd.api.types.is_numeric_dtype``) get
	    "Mean", "Median", "Mode", "Unique Values" and "Null Values". All other
	    columns get "Unique Values", "Null Values" and "Top Categories" (the
	    five most frequent values with their counts).

	    Scalar results are converted to built-in Python numbers so the returned
	    structure can be serialized with ``json.dumps`` without NumPy types
	    leaking through.

	    Args:
	        df: pandas.DataFrame to summarize.

	    Returns:
	        dict mapping each column label to its summary dict. Empty when
	        ``df`` has no columns.
	    """
	    analysis = {}
	    for col in df.columns:
	        series = df[col]
	        unique_count = _to_native(series.nunique())
	        null_count = _to_native(series.isnull().sum())

	        if pd.api.types.is_numeric_dtype(series):
	            # mode() can be empty (e.g. an all-NaN column); compute it once.
	            mode_values = series.mode()
	            most_common = None if mode_values.empty else _to_native(mode_values.iloc[0])
	            analysis[col] = {
	                "Mean": _to_native(series.mean()),
	                "Median": _to_native(series.median()),
	                "Mode": most_common,
	                "Unique Values": unique_count,
	                "Null Values": null_count,
	            }
	        else:
	            analysis[col] = {
	                "Unique Values": unique_count,
	                "Null Values": null_count,
	                "Top Categories": series.value_counts().head(5).to_dict(),
	            }
	    return analysis


	def view_table_analysis_page(url):
	    """Render the "Table Analysis" Streamlit page for one stored image.

	    Looks up the MongoDB document whose ``object_url`` equals ``url``, loads
	    the CSV it references and renders, in order: a data preview, an Excel
	    download button, the stored table description, per-column summaries and
	    a PyGWalker explorer.

	    Args:
	        url: Value matched against the ``object_url`` field in ``collection``.

	    Returns:
	        None. All output goes through Streamlit. When the document or its
	        CSV link is missing, an error message is shown and rendering stops.
	    """
	    if st.button("Back", key="back_button"):
	        st.session_state.page = "view_image"
	        st.rerun()

	    image = collection.find_one({"object_url": url})
	    if image is None:
	        # find_one yields None when nothing matches; .get() on it would raise.
	        st.error("No record was found for this image.")
	        return

	    csv_url = image.get("csv_object_url")
	    if not csv_url:
	        st.error("No CSV file is linked to this image.")
	        return

	    # "table_data" may be absent or null; read it once and fall back to an
	    # empty mapping so the optional sections below are simply skipped.
	    table_data = image.get("table_data") or {}

	    # Load CSV data
	    df = load_csv_from_url(csv_url)

	    # Drop the last row when any of its cells contains "total"
	    # (case-insensitive). The emptiness check keeps iloc[-1] from raising
	    # IndexError on a CSV without data rows.
	    has_total_row = (
	        not df.empty
	        and df.iloc[-1].apply(lambda cell: "total" in str(cell).lower()).any()
	    )
	    if has_total_row:
	        df = df.iloc[:-1]

	    # Page title
	    st.title("Table Analysis")

	    # CSV Preview
	    st.subheader("CSV Preview")
	    st.write("Below is a preview of the uploaded CSV file:")
	    st.dataframe(df)  # Interactive, scrollable table

	    # Build the Excel file in memory for the download button
	    excel_buffer = BytesIO()
	    with pd.ExcelWriter(excel_buffer, engine='openpyxl') as writer:
	        df.to_excel(writer, index=False, sheet_name="Sheet1")
	    excel_buffer.seek(0)  # Reset buffer position before handing it out

	    # Download Button
	    st.download_button(
	        label="Download Full Excel Sheet",
	        data=excel_buffer,
	        file_name="table_data.xlsx",
	        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
	    )

	    st.markdown("<hr>", unsafe_allow_html=True)
	    table_description = table_data.get("description", None)

	    if table_description:
	        # Table Description
	        st.subheader("Table Description")
	        st.write(table_description)

	    # Column Summary
	    st.markdown("<hr>", unsafe_allow_html=True)
	    st.subheader("Column Summary")
	    with st.container(height=400, border=False):

	        column_summary = table_data.get("column_summary", None)

	        if column_summary:
	            # Column-level descriptions and analysis
	            column_analysis = analyze_column_data(df)

	            col1, col2 = st.columns(2)
	            for idx, (col_name, col_description) in enumerate(column_summary.items()):
	                # Alternate between the two layout columns
	                with col1 if idx % 2 == 0 else col2:
	                    st.markdown(f"Column Name : {col_name}")
	                    st.write(f"Column Description : {col_description}")

	                    # analyze_column_data already returns exactly the keys
	                    # shown for numeric vs. categorical columns, so the
	                    # summary is displayed as-is. Columns described in the
	                    # stored summary but absent from the CSV have no entry
	                    # and are skipped instead of raising KeyError.
	                    analysis = column_analysis.get(col_name)
	                    if analysis is not None:
	                        st.write(analysis)

	    st.markdown("<hr>", unsafe_allow_html=True)
	    st.subheader("Graphical Analysis of Table")

	    # Default configuration for the initial visualization. It is only
	    # supplied when both suggested columns are stored; otherwise the
	    # explorer opens without a preset chart.
	    best_col1 = table_data.get("best_col1")
	    best_col2 = table_data.get("best_col2")
	    renderer_kwargs = {}
	    if best_col1 and best_col2:
	        default_chart_config = {
	            "mark": "bar",
	            "encoding": {
	                "x": {"field": best_col1, "type": "nominal"},
	                "y": {"field": best_col2, "type": "quantitative"}
	            }
	        }
	        # Pygwalker takes the spec as a JSON string
	        renderer_kwargs["spec"] = json.dumps(default_chart_config)

	    pyg_app = StreamlitRenderer(df, **renderer_kwargs)
	    pyg_app.explorer()