import re
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
| |
|
def create_world_map(docs_df):
    """Create an interactive world map of study distribution for conflict-affected countries.

    Counts how many rows of ``docs_df['study_countries']`` mention each country in a
    fixed target list, then renders a choropleth whose color encodes the *target*
    study count while the hover text also shows the count actually found in the
    dataset.

    Parameters
    ----------
    docs_df : pd.DataFrame
        Study-level dataset; must contain a 'study_countries' column holding
        comma- or semicolon-separated country names.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The rendered figure, or None when the input is empty or lacks the
        'study_countries' column.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None

    # Reference study counts per conflict-affected country; counts above 400
    # are labeled "Nationwide" conflicts below, the rest "Partial".
    target_countries = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11
    }

    # Count dataset mentions of each target country.
    # NOTE: .dropna() already removes real NaN values, so only literal junk
    # strings ('nan'/'none'/'') need skipping here.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        if str(countries_str).lower() in ('nan', 'none', ''):
            continue
        # Entries may be separated by ',' or ';'.
        for country in (c.strip() for c in str(countries_str).replace(';', ',').split(',')):
            if country in target_countries:
                country_counts[country] += 1

    # One row per target country: target count, observed count, conflict label.
    map_df = pd.DataFrame([
        {
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': "Nationwide" if target_count > 400 else "Partial",
        }
        for country, target_count in target_countries.items()
    ])

    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")

    # Choropleth colored by target counts; observed counts shown via customdata.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))

    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )

    fig.show()
    return fig
| |
|
def create_interactive_data_explorer(docs_df):
    """Summarize per-variable data completeness and plot it as a stacked bar chart.

    For a fixed list of expected numeric and categorical columns, computes the
    number of valid vs. missing values plus a short summary string for each,
    prints the findings, and shows a stacked Valid/Missing bar chart.

    Parameters
    ----------
    docs_df : pd.DataFrame
        Study-level dataset.

    Returns
    -------
    (plotly.graph_objects.Figure, pd.DataFrame) or None
        The figure and the per-variable summary table, or None when there is
        nothing to summarize (empty input, or no recognized column has data).
    """
    if docs_df.empty:
        print("No data available")
        return None

    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")

    # Expected analysis variables; only those actually present are summarized.
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]

    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]

    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")

    summary_data = []

    # Numeric columns: coerce to numbers and report mean/range.
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })

    # Categorical columns: report cardinality and the most frequent category.
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            # Guarded by len(values) > 0, so value_counts() is non-empty.
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })

    summary_df = pd.DataFrame(summary_data)

    # Guard: an empty summary DataFrame has no 'Variable'/'Valid_Values'
    # columns, so the plotting code below would raise KeyError.
    if summary_df.empty:
        print("No recognized variables with data to summarize")
        return None

    fig = go.Figure()

    # Valid-value bars carry the per-variable summary in their hover text.
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))

    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))

    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )

    fig.show()

    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")

    return fig, summary_df
| |
|
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Build a pivot (or crosstab) of two variables and show it as a heatmap.

    When ``value_var`` names an existing column, aggregates it with
    ``agg_func``; otherwise falls back to a simple study-count crosstab.
    Returns ``(figure, pivot_df)`` on success, or None when the input is
    empty, a variable is missing, or pivoting fails.
    """
    if docs_df.empty:
        return None

    # Both axes must exist in the dataset.
    have_both = row_var in docs_df.columns and col_var in docs_df.columns
    if not have_both:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None

    try:
        aggregate = bool(value_var) and value_var in docs_df.columns
        if aggregate:
            table = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0,
            )
            heading = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            table = pd.crosstab(docs_df[row_var], docs_df[col_var])
            heading = f"Study Count by {row_var} and {col_var}"

        # Heatmap sized proportionally to the table's dimensions.
        heatmap = px.imshow(
            table.values,
            x=table.columns,
            y=table.index,
            color_continuous_scale='Viridis',
            title=heading,
        )
        heatmap.update_layout(
            height=max(400, 30 * len(table.index)),
            width=max(600, 50 * len(table.columns)),
        )
        heatmap.show()

        print(f"\nPivot Table: {row_var} × {col_var}")
        print(table.head(10))

        return heatmap, table

    except Exception as e:
        # Best-effort: report the failure rather than propagate it.
        print(f"Error creating pivot: {e}")
        return None
| |
|
| | |
def explore_methodology_patterns(docs_df):
    """Explore common methodology patterns"""
    if docs_df.empty:
        return None

    # Needs both the sector and the design columns to cross-tabulate.
    required = ('research_design', 'world_bank_sector')
    if all(column in docs_df.columns for column in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
| |
|
def explore_data_collection(docs_df):
    """Explore data collection patterns"""
    if docs_df.empty:
        return None

    # Needs both the collection-method and author-income columns.
    required = ('data_collection_method', 'author_income_group')
    if all(column in docs_df.columns for column in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
| |
|
def filter_and_analyze(docs_df, **filters):
    """Filter the study dataset and print a short profile of the subset.

    Supported keyword filters (all optional, combined with AND):
      countries (str | list[str])  -- case-insensitive substring match against
                                      'study_countries'
      sectors (str | list[str])    -- exact match against 'world_bank_sector'
      min_year / max_year (int)    -- bounds on 'publication_year'
      has_rct (bool)               -- keep rows whose 'has_randomization' text
                                      is 'true'/'yes'/'1'
      min_sample_size (int)        -- lower bound on 'sample_numeric'

    Returns
    -------
    pd.DataFrame or None
        The filtered rows, or None when no data is available or no rows match.
    """
    if docs_df.empty:
        print("No data available")
        return None

    filtered = docs_df.copy()
    filter_summary = []

    def _as_list(value):
        # Accept a single value or a list of values.
        return value if isinstance(value, list) else [value]

    if filters.get('countries'):
        countries = _as_list(filters['countries'])
        # re.escape keeps regex metacharacters in country names from
        # corrupting the alternation pattern passed to str.contains.
        pattern = '|'.join(re.escape(c) for c in countries)
        country_mask = filtered['study_countries'].str.contains(pattern, case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")

    if filters.get('sectors'):
        sectors = _as_list(filters['sectors'])
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")

    if filters.get('min_year'):
        years = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[years >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")

    if filters.get('max_year'):
        years = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[years <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")

    if filters.get('has_rct'):
        # NOTE(review): assumes 'has_randomization' is stored as text
        # ('True'/'yes'/'1'), not as a boolean dtype -- confirm upstream.
        filtered = filtered[filtered['has_randomization'].str.lower().isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")

    if filters.get('min_sample_size'):
        samples = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[samples >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")

    print("=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")

    if filtered.empty:
        print("No studies match the criteria.")
        return None

    # Only profile subsets big enough for the stats to be meaningful.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")

    return filtered
| |
|
| | |
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis of the dataset.

    Runs the data-completeness explorer, the world map, and the two pivot
    explorations in sequence.

    Returns
    -------
    tuple
        (explorer_fig, map_fig, summary_df); explorer_fig and summary_df are
        None when the explorer had nothing to summarize.
    """
    print("Starting comprehensive data analysis...")

    # create_interactive_data_explorer returns None (not a tuple) on empty
    # input; unpack defensively so this wrapper does not raise a TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result

    map_fig = create_world_map(docs_df)

    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)

    return explorer_fig, map_fig, summary_df