import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import numpy as np

# Countries whose conflict is nationwide (vs. partial/regional), per the
# grouping used in create_world_map's target_countries table.
_NATIONWIDE_CONFLICT = {
    'Burkina Faso', 'Afghanistan', 'Mali', 'Sudan', 'Haiti', 'Somalia',
    'Niger', 'Syria', 'South Sudan', 'Libya', 'Palestinian Territories',
    'Central African Republic',
}


def create_world_map(docs_df):
    """Create an interactive world map showing study distribution for
    conflict-affected countries only.

    Args:
        docs_df: DataFrame with a 'study_countries' column containing
            comma- or semicolon-separated country names per study.

    Returns:
        The plotly choropleth Figure, or None if data is missing.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None

    # Define the specific countries we want to show with their study counts
    # (externally sourced target totals).
    target_countries = {
        # Nationwide conflict
        'Burkina Faso': 1098, 'Afghanistan': 697, 'Mali': 496,
        'Sudan': 470, 'Haiti': 394, 'Somalia': 373, 'Niger': 352,
        'Syria': 323, 'South Sudan': 294, 'Libya': 119,
        'Palestinian Territories': 81, 'Central African Republic': 72,
        # Partial conflict
        'Iraq': 128, 'Nigeria': 121, 'Lebanon': 102, 'Ethiopia': 81,
        'Democratic Republic of the Congo': 71, 'Cameroon': 54,
        'Chad': 36, 'Mozambique': 30, 'Myanmar': 11,
    }

    # Count actual studies in our dataset for these countries.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        # .dropna() already removed real NaNs; skip string placeholders.
        if str(countries_str).lower() in ('nan', 'none', ''):
            continue
        countries = [c.strip()
                     for c in str(countries_str).replace(';', ',').split(',')]
        for country in countries:
            if country in target_countries:
                country_counts[country] += 1

    # BUG FIX: conflict type was previously inferred from target_count > 400,
    # which mislabelled Haiti (394), Syria (323) and South Sudan (294) —
    # all nationwide-conflict countries per the table above. Use the
    # explicit grouping instead of a count threshold.
    map_data = [
        {
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': ('Nationwide' if country in _NATIONWIDE_CONFLICT
                              else 'Partial'),
        }
        for country, target_count in target_countries.items()
    ]
    map_df = pd.DataFrame(map_data)

    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: "
          f"{map_df[map_df['actual_studies'] > 0]['country'].tolist()}")

    # Create choropleth map using target study counts. NOTE: plotly hover
    # text is HTML — line breaks must be <br>, and <extra></extra>
    # suppresses the secondary trace-name box.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate=(
            '<b>%{location}</b><br>'
            'Studies (Target): %{z}<br>'
            'Studies (In Dataset): %{customdata}'
            '<extra></extra>'
        ),
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies",
    ))
    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18},
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth',
        ),
        height=600,
        width=1000,
    )
    fig.show()
    return fig


def create_interactive_data_explorer(docs_df):
    """Create an interactive data explorer for methodology analysis.

    Prints a dataset overview, builds a per-variable summary table and a
    stacked data-completeness bar chart.

    Args:
        docs_df: the studies DataFrame.

    Returns:
        (figure, summary_df) tuple, or None if docs_df is empty.
    """
    if docs_df.empty:
        print("No data available")
        return None

    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")

    # Key columns for analysis; filtered below to those actually present.
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score',
                    'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population',
        'author_income_group', 'has_validation', 'has_randomization',
        'has_mixed_methods', 'has_advanced_analysis',
    ]
    available_numeric = [c for c in numeric_cols if c in docs_df.columns]
    available_categorical = [c for c in categorical_cols
                             if c in docs_df.columns]

    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")

    summary_data = []

    # Numeric summaries: coerce to numeric so stray strings count as missing.
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': (f"Mean: {values.mean():.1f}, "
                            f"Range: {values.min()}-{values.max()}"),
            })

    # Categorical summaries.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            # len(values) > 0 is guaranteed here, so index[0] is safe.
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}",
            })

    summary_df = pd.DataFrame(summary_data)

    # Stacked bar chart of data completeness per variable.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        # Plotly hover text is HTML: use <br> for line breaks.
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}',
        customdata=summary_df['Summary'],
    ))
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral',
    ))
    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45},
    )
    fig.show()

    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        total = row['Valid_Values'] + row['Missing']
        print(f"{row['Variable']} ({row['Type']}): "
              f"{row['Valid_Values']}/{total} values - {row['Summary']}")

    return fig, summary_df


def create_pivot_analysis(docs_df, row_var, col_var, value_var=None,
                          agg_func='count'):
    """Create a pivot table analysis with a heatmap visualization.

    Args:
        docs_df: the studies DataFrame.
        row_var: column used for pivot rows.
        col_var: column used for pivot columns.
        value_var: optional numeric column to aggregate; when None, cells
            are simple study counts (crosstab).
        agg_func: pandas aggregation name used when value_var is given.

    Returns:
        (figure, pivot_df) tuple, or None on missing columns / error.
    """
    if docs_df.empty:
        return None
    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None

    try:
        if value_var and value_var in docs_df.columns:
            # Numeric aggregation over value_var.
            pivot_df = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0,
            )
            title = (f"{agg_func.title()} of {value_var} "
                     f"by {row_var} and {col_var}")
        else:
            # Plain count aggregation.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"

        # Heatmap sized to the pivot's dimensions.
        fig = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title,
        )
        fig.update_layout(
            height=max(400, len(pivot_df.index) * 30),
            width=max(600, len(pivot_df.columns) * 50),
        )
        fig.show()

        print(f"\nPivot Table: {row_var} × {col_var}")
        print(pivot_df.head(10))
        return fig, pivot_df
    except Exception as e:
        # Best-effort: report and return None rather than crash callers.
        print(f"Error creating pivot: {e}")
        return None


# Example usage functions

def explore_methodology_patterns(docs_df):
    """Explore common methodology patterns (research design by sector)."""
    if docs_df.empty:
        return None
    if ('research_design' in docs_df.columns
            and 'world_bank_sector' in docs_df.columns):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector',
                                     'research_design')


def explore_data_collection(docs_df):
    """Explore data collection patterns by author income group."""
    if docs_df.empty:
        return None
    if ('data_collection_method' in docs_df.columns
            and 'author_income_group' in docs_df.columns):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group',
                                     'data_collection_method')


def filter_and_analyze(docs_df, **filters):
    """Filter data and run a quick analysis on the subset.

    Supported filters (keyword args): countries (str or list),
    sectors (str or list), min_year, max_year, has_rct (truthy),
    min_sample_size.

    Returns:
        The filtered DataFrame, or None if no data / no matches.
    """
    if docs_df.empty:
        print("No data available")
        return None

    filtered = docs_df.copy()
    filter_summary = []

    # Countries: substring match (case-insensitive) on study_countries.
    if filters.get('countries'):
        countries = (filters['countries']
                     if isinstance(filters['countries'], list)
                     else [filters['countries']])
        country_mask = filtered['study_countries'].str.contains(
            '|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")

    # Sectors: exact membership.
    if filters.get('sectors'):
        sectors = (filters['sectors']
                   if isinstance(filters['sectors'], list)
                   else [filters['sectors']])
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")

    # Year range (coerced so non-numeric years are excluded).
    if filters.get('min_year'):
        year_col = pd.to_numeric(filtered['publication_year'],
                                 errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")
    if filters.get('max_year'):
        year_col = pd.to_numeric(filtered['publication_year'],
                                 errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")

    # RCT flag stored as strings like 'true'/'yes'/'1'.
    if filters.get('has_rct'):
        filtered = filtered[filtered['has_randomization'].str.lower()
                            .isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")

    if filters.get('min_sample_size'):
        sample_col = pd.to_numeric(filtered['sample_numeric'],
                                   errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(
            f"Sample size >= {filters['min_sample_size']}")

    # Show results.
    print(f"=== FILTERED ANALYSIS ===")
    print(f"Filters applied: "
          f"{'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")

    if filtered.empty:
        print("No studies match the criteria.")
        return None

    # Quick analysis of filtered data: key distributions when the subset
    # is large enough to be meaningful.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: "
                  f"{dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: "
                  f"{dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'],
                                         errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, "
                      f"range={rigor_scores.min()}-{rigor_scores.max()}")

    return filtered


# Quick start function

def quick_analysis(docs_df):
    """Run a quick analysis of the dataset.

    Returns:
        (explorer_fig, map_fig, summary_df); elements are None when the
        underlying step could not run.
    """
    print("Starting comprehensive data analysis...")

    # 1. Data overview. BUG FIX: create_interactive_data_explorer returns
    # None for an empty dataset; the old code unconditionally unpacked a
    # 2-tuple and raised TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is not None:
        explorer_fig, summary_df = explorer_result
    else:
        explorer_fig, summary_df = None, None

    # 2. Map
    map_fig = create_world_map(docs_df)

    # 3. Sample pivot analyses
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)

    return explorer_fig, map_fig, summary_df