File size: 12,665 Bytes
9c062cd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import numpy as np

def create_world_map(docs_df):
    """Create an interactive choropleth of study coverage for conflict-affected countries.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level records; must contain a 'study_countries' column with
        comma- or semicolon-separated country names.

    Returns
    -------
    plotly.graph_objects.Figure or None
        The rendered map, or None when the input is empty or the
        'study_countries' column is missing.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None
    
    # Target study counts per country, grouped explicitly by conflict extent.
    # Keeping the two groups as separate dicts lets us classify each country
    # correctly below instead of guessing from the study count.
    nationwide_conflict = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
    }
    partial_conflict = {
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11,
    }
    target_countries = {**nationwide_conflict, **partial_conflict}
    
    # Count how many studies in THIS dataset mention each target country.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        # dropna() already removed real NaN; this skips string placeholders.
        if str(countries_str).lower() in ['nan', 'none', '']:
            continue
        countries = [c.strip() for c in str(countries_str).replace(';', ',').split(',')]
        for country in countries:
            if country in target_countries:
                country_counts[country] += 1
    
    # BUG FIX: conflict type was previously inferred from target_count > 400,
    # which mislabelled nationwide-conflict countries with fewer than 400
    # studies (Haiti, Somalia, Niger, Syria, South Sudan, Libya, Palestinian
    # Territories, CAR) as "Partial". Classify from the explicit groups.
    map_data = []
    for country, target_count in target_countries.items():
        map_data.append({
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': "Nationwide" if country in nationwide_conflict else "Partial"
        })
    
    map_df = pd.DataFrame(map_data)
    
    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")
    
    # Choropleth coloured by the target study counts; the hover text also
    # shows how many of those studies are present in this dataset.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                     'Studies (Target): %{z}<br>' +
                     'Studies (In Dataset): %{customdata}<br>' +
                     '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))
    
    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )
    
    fig.show()
    return fig

def create_interactive_data_explorer(docs_df):
    """Summarise variable completeness and render a stacked completeness chart.

    Prints a dataset overview, builds per-variable summary statistics for a
    fixed set of numeric and categorical columns, shows a stacked bar chart of
    valid vs. missing values, and prints a textual summary table.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level records.

    Returns
    -------
    (plotly Figure, pandas.DataFrame) or None
        The completeness chart and the summary table, or None on empty input.
    """
    if docs_df.empty:
        print("No data available")
        return None
    
    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")
    
    # Candidate analysis variables; only those present in the frame are used.
    numeric_candidates = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_candidates = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]
    
    available_numeric = [c for c in numeric_candidates if c in docs_df.columns]
    available_categorical = [c for c in categorical_candidates if c in docs_df.columns]
    
    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")
    
    total_records = len(docs_df)
    summary_rows = []
    
    # Numeric variables: coerce to numbers, then report mean and range.
    for col in available_numeric:
        series = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if series.empty:
            continue
        summary_rows.append({
            'Variable': col,
            'Type': 'Numeric',
            'Valid_Values': len(series),
            'Missing': total_records - len(series),
            'Summary': f"Mean: {series.mean():.1f}, Range: {series.min()}-{series.max()}"
        })
    
    # Categorical variables: report category count and the modal value.
    for col in available_categorical:
        series = docs_df[col].dropna()
        if series.empty:
            continue
        summary_rows.append({
            'Variable': col,
            'Type': 'Categorical',
            'Valid_Values': len(series),
            'Missing': total_records - len(series),
            'Summary': f"{series.nunique()} categories, Top: {series.value_counts().index[0]}"
        })
    
    summary_df = pd.DataFrame(summary_rows)
    
    # Stacked bars: valid (blue) on the bottom, missing (red) on top.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))
    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )
    fig.show()
    
    # Textual companion to the chart.
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        total_vals = row['Valid_Values'] + row['Missing']
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{total_vals} values - {row['Summary']}")
    
    return fig, summary_df

def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Pivot two variables against each other and display the result as a heatmap.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level records.
    row_var, col_var : str
        Column names used as the pivot rows and columns.
    value_var : str, optional
        Column to aggregate; when omitted (or not present) a plain
        frequency crosstab is produced instead.
    agg_func : str
        Aggregation function name for pivot_table (default 'count').

    Returns
    -------
    (plotly Figure, pandas.DataFrame) or None
        The heatmap and the pivot table, or None on empty input, unknown
        variables, or a pivot failure.
    """
    if docs_df.empty:
        return None
    
    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None
    
    try:
        aggregate_values = bool(value_var) and value_var in docs_df.columns
        if aggregate_values:
            # Aggregate the requested numeric column.
            pivot_df = docs_df.pivot_table(
                index=row_var,
                columns=col_var,
                values=value_var,
                aggfunc=agg_func,
                fill_value=0
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # No value column: fall back to a simple frequency crosstab.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"
        
        heatmap = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title
        )
        # Scale the canvas with the table dimensions so labels stay legible.
        heatmap.update_layout(
            height=max(400, len(pivot_df.index) * 30),
            width=max(600, len(pivot_df.columns) * 50)
        )
        heatmap.show()
        
        print(f"\nPivot Table: {row_var} × {col_var}")
        print(pivot_df.head(10))
        
        return heatmap, pivot_df
        
    except Exception as e:
        print(f"Error creating pivot: {e}")
        return None

# Example usage functions
def explore_methodology_patterns(docs_df):
    """Cross-tabulate World Bank sector against research design, when both exist.

    Returns the result of create_pivot_analysis, or None when the input is
    empty or either column is missing.
    """
    if docs_df.empty:
        return None
    
    required = ('research_design', 'world_bank_sector')
    if all(col in docs_df.columns for col in required):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')

def explore_data_collection(docs_df):
    """Cross-tabulate author income group against data-collection method.

    Returns the result of create_pivot_analysis, or None when the input is
    empty or either column is missing.
    """
    if docs_df.empty:
        return None
    
    required = ('data_collection_method', 'author_income_group')
    if all(col in docs_df.columns for col in required):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')

def filter_and_analyze(docs_df, **filters):
    """Filter the dataset by keyword criteria and print a quick summary.

    Supported filters (all optional, combined with AND):
        countries : str or list[str] — substring match on 'study_countries'
        sectors : str or list[str] — exact match on 'world_bank_sector'
        min_year / max_year : int — bounds on 'publication_year'
        has_rct : bool — keep rows where 'has_randomization' is truthy-like
        min_sample_size : int — lower bound on 'sample_numeric'

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level records.

    Returns
    -------
    pandas.DataFrame or None
        The filtered subset, or None when the input is empty or no rows match.
    """
    if docs_df.empty:
        print("No data available")
        return None
    
    filtered = docs_df.copy()
    filter_summary = []
    
    # BUG FIX: each filter previously indexed its column unconditionally,
    # raising KeyError on datasets missing that column. Guard each filter
    # on column presence so partial datasets degrade gracefully.
    if 'countries' in filters and filters['countries'] and 'study_countries' in filtered.columns:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")
    
    if 'sectors' in filters and filters['sectors'] and 'world_bank_sector' in filtered.columns:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        sector_mask = filtered['world_bank_sector'].isin(sectors)
        filtered = filtered[sector_mask]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")
    
    if 'min_year' in filters and filters['min_year'] and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")
    
    if 'max_year' in filters and filters['max_year'] and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")
    
    if 'has_rct' in filters and filters['has_rct'] and 'has_randomization' in filtered.columns:
        # BUG FIX: .str accessor raises on a bool-dtype column; normalise to
        # string first so both 'true'/'yes'/'1' strings and real booleans work.
        rct_mask = filtered['has_randomization'].astype(str).str.lower().isin(['true', 'yes', '1'])
        filtered = filtered[rct_mask]
        filter_summary.append("RCT studies only")
    
    if 'min_sample_size' in filters and filters['min_sample_size'] and 'sample_numeric' in filtered.columns:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")
    
    # Show results
    print(f"=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")
    
    if filtered.empty:
        print("No studies match the criteria.")
        return None
    
    # Quick distributions, only worth printing for a non-trivial subset.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")
    
    return filtered

# Quick start function
def quick_analysis(docs_df):
    """Run the full analysis pipeline: overview chart, world map, sample pivots.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level records.

    Returns
    -------
    tuple
        (explorer figure or None, map figure or None, summary DataFrame or None).
    """
    print("Starting comprehensive data analysis...")
    
    # 1. Data overview
    # BUG FIX: create_interactive_data_explorer returns None (not a 2-tuple)
    # for an empty dataset, so unconditional unpacking raised TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    if explorer_result is None:
        explorer_fig, summary_df = None, None
    else:
        explorer_fig, summary_df = explorer_result
    
    # 2. Map
    map_fig = create_world_map(docs_df)
    
    # 3. Sample pivot analyses
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)
    
    return explorer_fig, map_fig, summary_df