# shvy's Space — app.py
# Update app.py
# commit 28fce0e (verified)
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy.stats as stats
import base64
def advanced_analysis(file):
    """Analyze an uploaded tabular file and produce a markdown report plus a figure.

    Parameters
    ----------
    file : str | None
        Filesystem path to the uploaded file (Gradio ``gr.File`` with
        ``type="filepath"``); ``None`` when nothing was uploaded.

    Returns
    -------
    tuple[str, str | None]
        ``(markdown_report, png_path)`` on success, or
        ``(error_report, None)`` when loading or analysis fails.
    """
    try:
        df = _load_dataframe(file)
        report = _build_report(df)
        image_path = _build_figure(df)
        return report, image_path
    except Exception as e:
        # Top-level boundary: turn any failure into a readable markdown report
        # instead of surfacing a traceback in the UI.
        error_report = f"## ❌ Analysis Failed\n\n**Error:** {str(e)}\n\n"
        error_report += "Possible reasons:\n"
        error_report += "- Incorrect file format\n"
        error_report += "- Unsupported data types\n"
        error_report += "- Corrupted or incomplete dataset"
        return error_report, None


def _load_dataframe(file):
    """Validate the upload's extension and load it into a DataFrame.

    Uses a case-insensitive extension check for BOTH validation and loader
    dispatch (the original validated with ``.lower()`` but dispatched
    case-sensitively, so ``DATA.CSV`` crashed with an UnboundLocalError).
    """
    supported_extensions = ['.csv', '.xlsx', '.xls', '.txt']
    if not file:
        raise ValueError("No file was uploaded.")
    name = file.lower()
    if not any(name.endswith(ext) for ext in supported_extensions):
        raise ValueError(f"Unsupported file type. Please upload a file with one of these extensions: {', '.join(supported_extensions)}")
    if name.endswith('.csv'):
        return pd.read_csv(file)
    if name.endswith(('.xlsx', '.xls')):
        return pd.read_excel(file)
    # .txt is assumed to be tab-separated.
    return pd.read_csv(file, sep='\t')


def _numeric_columns(df):
    """Return the numeric column index (any numeric dtype, not just 64-bit)."""
    return df.select_dtypes(include=[np.number]).columns


def _top_correlations(correlation_matrix, k=5):
    """Return the *k* largest off-diagonal correlations, each pair counted once.

    Only the strict upper triangle is kept, so (a, b) and (b, a) are not
    reported as two separate entries.
    """
    mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
    pairs = correlation_matrix.where(mask).stack()
    return pairs.nlargest(k)


def _build_report(df):
    """Assemble the markdown analysis report for *df*."""
    report = "# πŸ“Š Comprehensive Data Analysis Report\n\n"

    # 1. Basic dataset information
    report += "## 1. Dataset Overview\n"
    report += f"- **Total Rows:** {len(df)}\n"
    report += f"- **Total Columns:** {len(df.columns)}\n"
    report += f"- **Column Types:**\n"
    for col, dtype in df.dtypes.items():
        report += f"  - `{col}`: {dtype}\n"

    # 2. Missing value analysis
    report += "\n## 2. Missing Value Analysis\n"
    missing_data = df.isnull().sum()
    missing_percentage = 100 * df.isnull().sum() / len(df)
    missing_table = pd.concat([missing_data, missing_percentage], axis=1,
                              keys=['Missing Count', 'Missing Percentage'])
    report += "```\n" + missing_table.to_string() + "\n```\n"

    # 3. Statistical summary (numeric columns only)
    report += "\n## 3. Statistical Summary\n"
    numeric_cols = _numeric_columns(df)
    if len(numeric_cols) > 0:
        stats_summary = df[numeric_cols].describe()
        report += "### Numerical Columns Statistics\n"
        report += "```\n" + stats_summary.to_string() + "\n```\n"

    # 4. Outlier detection via the 1.5*IQR rule
    report += "\n## 4. Outlier Analysis\n"
    for col in numeric_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        column_outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        if len(column_outliers) > 0:
            report += f"- **{col}:** {len(column_outliers)} outliers detected\n"

    # 5. Correlation analysis (needs at least two numeric columns)
    report += "\n## 5. Correlation Analysis\n"
    if len(numeric_cols) > 1:
        correlation_matrix = df[numeric_cols].corr()
        report += "### Top Correlations\n"
        for (col1, col2), corr_value in _top_correlations(correlation_matrix).items():
            report += f"- **{col1}** & **{col2}**: {corr_value:.2f}\n"

    return report


def _build_figure(df):
    """Render the 2x2 visualization grid, save it as a PNG, return its path."""
    numeric_cols = _numeric_columns(df)
    correlation_matrix = df[numeric_cols].corr() if len(numeric_cols) > 1 else None

    plt.close('all')
    fig, axs = plt.subplots(2, 2, figsize=(20, 15))
    plt.subplots_adjust(hspace=0.4, wspace=0.3)

    # Correlation heatmap
    if correlation_matrix is not None:
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', ax=axs[0, 0],
                    square=True, cbar=True, linewidths=0.5)
        axs[0, 0].set_title('Correlation Heatmap', fontsize=14, fontweight='bold')

    # Box plot of numeric columns (skipped when there are none — an empty
    # selection makes DataFrame.boxplot raise).
    if len(numeric_cols) > 0:
        df[numeric_cols].boxplot(ax=axs[0, 1])
        axs[0, 1].set_title('Box Plot of Numeric Columns', fontsize=14, fontweight='bold')
        axs[0, 1].tick_params(axis='x', rotation=45)

    # Distribution of the first categorical column
    categorical_cols = df.select_dtypes(include=['object']).columns
    if len(categorical_cols) > 0:
        cat_value_counts = df[categorical_cols[0]].value_counts()
        cat_value_counts.plot(kind='bar', ax=axs[1, 0])
        axs[1, 0].set_title(f'Distribution of {categorical_cols[0]}',
                            fontsize=14, fontweight='bold')
        axs[1, 0].tick_params(axis='x', rotation=45)

    # Scatter of the single most-correlated pair. (pd.plotting.scatter_matrix
    # cannot draw into one Axes of an existing grid — it clears the whole
    # figure, wiping the other three panels — so a plain scatter is used.)
    if correlation_matrix is not None and len(numeric_cols) > 2:
        (col1, col2), _ = next(iter(_top_correlations(correlation_matrix, 1).items()))
        axs[1, 1].scatter(df[col1], df[col2], alpha=0.6)
        axs[1, 1].set_xlabel(col1)
        axs[1, 1].set_ylabel(col2)
        axs[1, 1].set_title('Scatter Plot of Top Correlated Features',
                            fontsize=14, fontweight='bold')

    plt.suptitle('πŸ” Data Analysis Visualizations', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('data_analysis_advanced.png', dpi=300, bbox_inches='tight')
    plt.close(fig)
    return 'data_analysis_advanced.png'
# Custom CSS for a more modern look.
# The string is passed verbatim to gr.Interface(css=...) below: it styles the
# overall container, the scrollable markdown report panel, and the upload
# button's hover state. Do not reformat — it is runtime data, not code.
css = """
.gradio-container {
background-color: #f0f2f6;
font-family: 'Inter', 'Helvetica Neue', Arial, sans-serif;
}
.output-markdown {
background-color: white;
border-radius: 10px;
padding: 20px;
box-shadow: 0 4px 6px rgba(0,0,0,0.1);
max-height: 500px;
overflow-y: auto;
}
.file-upload {
background-color: #4CAF50;
color: white;
border-radius: 5px;
padding: 10px 20px;
transition: background-color 0.3s ease;
}
.file-upload:hover {
background-color: #45a049;
}
"""
# Gradio Interface with Enhanced UI.
demo = gr.Interface(
    fn=advanced_analysis,
    inputs=gr.File(
        type="filepath",
        label="πŸ“€ Upload File",
        file_count="single",
        # Gradio expects extensions with the leading dot; bare names like
        # "csv" do not filter the browser's file picker.
        file_types=[".csv", ".xlsx", ".xls", ".txt"],
    ),
    outputs=[
        gr.Markdown(),
        gr.Image(label="πŸ“Š Advanced Visualizations"),
    ],
    title="🧠 Smart Data Analyzer",
    description="Upload a CSV, Excel, or Text file for comprehensive data analysis, statistical insights, and interactive visualizations.",
    theme='default',
    css=css,
)

# Launch only when executed directly (`python app.py`). Hugging Face Spaces
# using the Gradio SDK import this module and serve `demo` themselves.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)