import re
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
| |
|
def create_world_map(docs_df):
    """Create an interactive world map of study distribution for conflict-affected countries.

    Counts how many rows of ``docs_df['study_countries']`` mention each country in a
    fixed target list, then renders a choropleth whose color encodes the *target*
    study count while the hover text also shows the count actually found in the
    dataset.

    Parameters
    ----------
    docs_df : pd.DataFrame
        Study-level dataset; must contain a 'study_countries' column holding
        comma- or semicolon-separated country names.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The rendered figure, or None when the input is empty or lacks the
        'study_countries' column.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None

    # Reference study counts per conflict-affected country; counts above 400
    # are labeled "Nationwide" conflicts below, the rest "Partial".
    target_countries = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11
    }

    # Count dataset mentions of each target country.
    # NOTE: .dropna() already removes real NaN values, so only literal junk
    # strings ('nan'/'none'/'') need skipping here.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        if str(countries_str).lower() in ('nan', 'none', ''):
            continue
        # Entries may be separated by ',' or ';'.
        for country in (c.strip() for c in str(countries_str).replace(';', ',').split(',')):
            if country in target_countries:
                country_counts[country] += 1

    # One row per target country: target count, observed count, conflict label.
    map_df = pd.DataFrame([
        {
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': "Nationwide" if target_count > 400 else "Partial",
        }
        for country, target_count in target_countries.items()
    ])

    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")

    # Choropleth colored by target counts; observed counts shown via customdata.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )

    fig.show()
    return fig
| |
|
def create_interactive_data_explorer(docs_df):
    """Summarize per-variable data completeness and plot it as a stacked bar chart.

    For a fixed list of expected numeric and categorical columns, computes the
    number of valid vs. missing values plus a short summary string for each,
    prints the findings, and shows a stacked Valid/Missing bar chart.

    Parameters
    ----------
    docs_df : pd.DataFrame
        Study-level dataset.

    Returns
    -------
    (plotly.graph_objects.Figure, pd.DataFrame) or None
        The figure and the per-variable summary table, or None when there is
        nothing to summarize (empty input, or no recognized column has data).
    """
    if docs_df.empty:
        print("No data available")
        return None

    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")

    # Expected analysis variables; only those actually present are summarized.
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]

    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]

    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")

    summary_data = []

    # Numeric columns: coerce to numbers and report mean/range.
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })

    # Categorical columns: report cardinality and the most frequent category.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            # Guarded by len(values) > 0, so value_counts() is non-empty.
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })

    summary_df = pd.DataFrame(summary_data)

    # Guard: an empty summary DataFrame has no 'Variable'/'Valid_Values'
    # columns, so the plotting code below would raise KeyError.
    if summary_df.empty:
        print("No recognized variables with data to summarize")
        return None

    fig = go.Figure()

    # Valid-value bars carry the per-variable summary in their hover text.
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))

    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )

    fig.show()

    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")

    return fig, summary_df
| |
|
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Build a pivot (or crosstab) of two variables and show it as a heatmap.

    When ``value_var`` names an existing column, aggregates it with
    ``agg_func``; otherwise falls back to a simple study-count crosstab.
    Returns ``(figure, pivot_df)`` on success, or None when the input is
    empty, a variable is missing, or pivoting fails.
    """
    if docs_df.empty:
        return None

    # Both axes must exist in the dataset.
    have_both = row_var in docs_df.columns and col_var in docs_df.columns
    if not have_both:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None

    try:
        aggregate = bool(value_var) and value_var in docs_df.columns
        if aggregate:
            table = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0,
            )
            heading = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            table = pd.crosstab(docs_df[row_var], docs_df[col_var])
            heading = f"Study Count by {row_var} and {col_var}"

        # Heatmap sized proportionally to the table's dimensions.
        heatmap = px.imshow(
            table.values,
            x=table.columns,
            y=table.index,
            color_continuous_scale='Viridis',
            title=heading,
        )
        heatmap.update_layout(
            height=max(400, 30 * len(table.index)),
            width=max(600, 50 * len(table.columns)),
        )
        heatmap.show()

        print(f"\nPivot Table: {row_var} × {col_var}")
        print(table.head(10))

        return heatmap, table

    except Exception as e:
        # Best-effort: report the failure rather than propagate it.
        print(f"Error creating pivot: {e}")
        return None
| |
|
| | |
def explore_methodology_patterns(docs_df):
    """Explore common methodology patterns"""
    if docs_df.empty:
        return None

    # Needs both the sector and the design columns to cross-tabulate.
    required = ('research_design', 'world_bank_sector')
    if all(column in docs_df.columns for column in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
| |
|
def explore_data_collection(docs_df):
    """Explore data collection patterns"""
    if docs_df.empty:
        return None

    # Needs both the collection-method and author-income columns.
    required = ('data_collection_method', 'author_income_group')
    if all(column in docs_df.columns for column in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
| |
|
def filter_and_analyze(docs_df, **filters):
    """Filter the study dataset and print a short profile of the subset.

    Supported keyword filters (all optional, combined with AND):
      countries (str | list[str])  -- case-insensitive substring match against
                                      'study_countries'
      sectors (str | list[str])    -- exact match against 'world_bank_sector'
      min_year / max_year (int)    -- bounds on 'publication_year'
      has_rct (bool)               -- keep rows whose 'has_randomization' text
                                      is 'true'/'yes'/'1'
      min_sample_size (int)        -- lower bound on 'sample_numeric'

    Returns
    -------
    pd.DataFrame or None
        The filtered rows, or None when no data is available or no rows match.
    """
    if docs_df.empty:
        print("No data available")
        return None

    filtered = docs_df.copy()
    filter_summary = []

    def _as_list(value):
        # Accept a single value or a list of values.
        return value if isinstance(value, list) else [value]

    if filters.get('countries'):
        countries = _as_list(filters['countries'])
        # re.escape keeps regex metacharacters in country names from
        # corrupting the alternation pattern passed to str.contains.
        pattern = '|'.join(re.escape(c) for c in countries)
        country_mask = filtered['study_countries'].str.contains(pattern, case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")

    if filters.get('sectors'):
        sectors = _as_list(filters['sectors'])
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")

    if filters.get('min_year'):
        years = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[years >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")

    if filters.get('max_year'):
        years = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[years <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")

    if filters.get('has_rct'):
        # NOTE(review): assumes 'has_randomization' is stored as text
        # ('True'/'yes'/'1'), not as a boolean dtype -- confirm upstream.
        filtered = filtered[filtered['has_randomization'].str.lower().isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")

    if filters.get('min_sample_size'):
        samples = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[samples >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")

    print("=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")

    if filtered.empty:
        print("No studies match the criteria.")
        return None

    # Only profile subsets big enough for the stats to be meaningful.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")

    return filtered
| |
|
| | |
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis of the dataset.

    Runs the data-completeness explorer, the world map, and the two pivot
    explorations in sequence.

    Returns
    -------
    tuple
        (explorer_fig, map_fig, summary_df); explorer_fig and summary_df are
        None when the explorer had nothing to summarize.
    """
    print("Starting comprehensive data analysis...")

    # create_interactive_data_explorer returns None (not a tuple) on empty
    # input; unpack defensively so this wrapper does not raise a TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result

    map_fig = create_world_map(docs_df)

    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)

    return explorer_fig, map_fig, summary_df