import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import numpy as np
def create_world_map(docs_df):
    """Create an interactive choropleth of study counts for conflict-affected countries.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Must contain a 'study_countries' column holding comma- or
        semicolon-separated country names per row.

    Returns
    -------
    plotly.graph_objects.Figure or None
        None when the frame is empty or lacks 'study_countries'.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None
    # Countries experiencing nationwide conflict, with external target study counts.
    nationwide = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
    }
    # Countries experiencing partial (sub-national) conflict.
    partial = {
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11,
    }
    target_countries = {**nationwide, **partial}
    # Count how many studies in our dataset mention each target country.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        text = str(countries_str).strip()
        # dropna() removed real NaNs; this catches string placeholders like "nan".
        if text.lower() in ('nan', 'none', ''):
            continue
        for country in (c.strip() for c in text.replace(';', ',').split(',')):
            if country in target_countries:
                country_counts[country] += 1
    # BUG FIX: the old code classified by `target_count > 400`, which mislabeled
    # nationwide-conflict countries with fewer studies (e.g. Haiti, Somalia,
    # Libya) as "Partial". Classify by explicit membership instead.
    map_data = [
        {
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': 'Nationwide' if country in nationwide else 'Partial',
        }
        for country, target_count in target_countries.items()
    ]
    map_df = pd.DataFrame(map_data)
    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")
    # Choropleth is colored by the external target counts; hover also shows
    # how many of those studies appear in this dataset.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))
    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )
    fig.show()
    return fig
def create_interactive_data_explorer(docs_df):
    """Summarize data completeness per variable and plot a stacked bar chart.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level dataset; only a known subset of columns is summarized.

    Returns
    -------
    (plotly.graph_objects.Figure, pandas.DataFrame) or None
        The completeness figure and per-variable summary table, or None
        when the input frame is empty.
    """
    if docs_df.empty:
        print("No data available")
        return None
    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")
    # Key numeric columns for analysis
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]
    # Filter to columns actually present in this dataset.
    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]
    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")
    # Build one summary row per variable (valid/missing counts + short stats).
    summary_data = []
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            # Guarded by len(values) > 0, so value_counts() is never empty.
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })
    summary_df = pd.DataFrame(summary_data)
    # Stacked bar chart: valid vs. missing record counts per variable.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))
    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )
    fig.show()
    # Print summary table
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")
    return fig, summary_df
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Cross-tabulate two variables and render the result as a heatmap.

    When `value_var` names an existing column, it is aggregated with
    `agg_func`; otherwise a plain row-count crosstab is built.
    Returns (figure, pivot DataFrame) on success, else None.
    """
    if docs_df.empty:
        return None
    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None
    try:
        use_value_column = value_var and value_var in docs_df.columns
        if use_value_column:
            # Aggregate the chosen value column across the two dimensions.
            pivot_df = docs_df.pivot_table(
                index=row_var, columns=col_var, values=value_var,
                aggfunc=agg_func, fill_value=0,
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # Fall back to simple frequency counts.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"
        fig = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title,
        )
        # Scale the figure with the table so labels stay legible.
        n_rows, n_cols = len(pivot_df.index), len(pivot_df.columns)
        fig.update_layout(height=max(400, n_rows * 30), width=max(600, n_cols * 50))
        fig.show()
        print(f"\nPivot Table: {row_var} × {col_var}")
        print(pivot_df.head(10))
        return fig, pivot_df
    except Exception as e:
        # Best-effort helper: report the problem rather than propagate it.
        print(f"Error creating pivot: {e}")
        return None
# Example usage functions
def explore_methodology_patterns(docs_df):
    """Cross-tabulate research design against World Bank sector, if present."""
    if docs_df.empty:
        return None
    needed = ('research_design', 'world_bank_sector')
    # Only run the pivot when both dimensions exist in this dataset.
    if all(col in docs_df.columns for col in needed):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
def explore_data_collection(docs_df):
    """Cross-tabulate data-collection method against author income group."""
    if docs_df.empty:
        return None
    needed = ('data_collection_method', 'author_income_group')
    # Only run the pivot when both dimensions exist in this dataset.
    if all(col in docs_df.columns for col in needed):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
def filter_and_analyze(docs_df, **filters):
    """Filter the study dataset by optional criteria and summarize the subset.

    Keyword filters (all optional; each is skipped if its column is absent):
        countries: str or list -- case-insensitive substring match on 'study_countries'
        sectors: str or list -- exact match on 'world_bank_sector'
        min_year / max_year: int -- bounds on 'publication_year'
        has_rct: bool -- keep rows whose 'has_randomization' is true/yes/1
        min_sample_size: number -- lower bound on 'sample_numeric'

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame, or None when the input is empty or nothing matches.
    """
    if docs_df.empty:
        print("No data available")
        return None
    filtered = docs_df.copy()
    filter_summary = []
    # Country substring filter (OR across the requested countries).
    # Each filter now also checks its column exists, so partial datasets
    # no longer raise KeyError.
    if filters.get('countries') and 'study_countries' in filtered.columns:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")
    if filters.get('sectors') and 'world_bank_sector' in filtered.columns:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")
    # Year bounds: `is not None` so a literal 0 bound is not silently ignored.
    if filters.get('min_year') is not None and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")
    if filters.get('max_year') is not None and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")
    # BUG FIX: cast to str before .str.lower() so boolean-dtype columns
    # (True/False) don't break the .str accessor; 'True' -> 'true' still matches.
    if filters.get('has_rct') and 'has_randomization' in filtered.columns:
        rct_flags = filtered['has_randomization'].astype(str).str.lower()
        filtered = filtered[rct_flags.isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")
    if filters.get('min_sample_size') is not None and 'sample_numeric' in filtered.columns:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")
    # Show results
    print("=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")
    if filtered.empty:
        print("No studies match the criteria.")
        return None
    # Quick distributional summary, only worthwhile for non-trivial subsets.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")
    return filtered
# Quick start function
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis: overview, map, and sample pivots.

    Returns
    -------
    tuple
        (explorer figure or None, map figure or None, summary DataFrame or None).
    """
    print("Starting comprehensive data analysis...")
    # 1. Data overview.
    # BUG FIX: create_interactive_data_explorer returns None for an empty
    # dataset; unpacking None directly raised TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    explorer_fig, summary_df = explorer_result if explorer_result is not None else (None, None)
    # 2. Map
    map_fig = create_world_map(docs_df)
    # 3. Sample pivot analyses (figures are shown; return values not needed).
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)
    return explorer_fig, map_fig, summary_df