import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import numpy as np
def create_world_map(docs_df):
    """Create an interactive choropleth of study counts for conflict-affected countries.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Must contain a 'study_countries' column holding comma- or
        semicolon-separated country names per row.

    Returns
    -------
    plotly.graph_objects.Figure or None
        None when the frame is empty or lacks 'study_countries'.
    """
    if docs_df.empty or 'study_countries' not in docs_df.columns:
        print("No data or missing 'study_countries' column")
        return None
    # Countries experiencing nationwide conflict, with external target study counts.
    nationwide = {
        'Burkina Faso': 1098,
        'Afghanistan': 697,
        'Mali': 496,
        'Sudan': 470,
        'Haiti': 394,
        'Somalia': 373,
        'Niger': 352,
        'Syria': 323,
        'South Sudan': 294,
        'Libya': 119,
        'Palestinian Territories': 81,
        'Central African Republic': 72,
    }
    # Countries experiencing partial (sub-national) conflict.
    partial = {
        'Iraq': 128,
        'Nigeria': 121,
        'Lebanon': 102,
        'Ethiopia': 81,
        'Democratic Republic of the Congo': 71,
        'Cameroon': 54,
        'Chad': 36,
        'Mozambique': 30,
        'Myanmar': 11,
    }
    target_countries = {**nationwide, **partial}
    # Count how many studies in our dataset mention each target country.
    country_counts = Counter()
    for countries_str in docs_df['study_countries'].dropna():
        text = str(countries_str).strip()
        # dropna() removed real NaNs; this catches string placeholders like "nan".
        if text.lower() in ('nan', 'none', ''):
            continue
        for country in (c.strip() for c in text.replace(';', ',').split(',')):
            if country in target_countries:
                country_counts[country] += 1
    # BUG FIX: the old code classified by `target_count > 400`, which mislabeled
    # nationwide-conflict countries with fewer studies (e.g. Haiti, Somalia,
    # Libya) as "Partial". Classify by explicit membership instead.
    map_data = [
        {
            'country': country,
            'actual_studies': country_counts.get(country, 0),
            'target_studies': target_count,
            'conflict_type': 'Nationwide' if country in nationwide else 'Partial',
        }
        for country, target_count in target_countries.items()
    ]
    map_df = pd.DataFrame(map_data)
    print(f"Mapping {len(map_df)} conflict-affected countries")
    print(f"Countries with data: {map_df[map_df['actual_studies'] > 0]['country'].tolist()}")
    # Choropleth is colored by the external target counts; hover also shows
    # how many of those studies appear in this dataset.
    fig = go.Figure(data=go.Choropleth(
        locations=map_df['country'],
        z=map_df['target_studies'],
        locationmode='country names',
        colorscale='Reds',
        hovertemplate='<b>%{location}</b><br>' +
                      'Studies (Target): %{z}<br>' +
                      'Studies (In Dataset): %{customdata}<br>' +
                      '<extra></extra>',
        customdata=map_df['actual_studies'],
        colorbar_title="Number of Studies"
    ))
    fig.update_layout(
        title={
            'text': 'Research Coverage: Conflict-Affected Countries',
            'x': 0.5,
            'xanchor': 'center',
            'font': {'size': 18}
        },
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type='natural earth'
        ),
        height=600,
        width=1000
    )
    fig.show()
    return fig
def create_interactive_data_explorer(docs_df):
    """Summarize data completeness per variable and plot a stacked bar chart.

    Parameters
    ----------
    docs_df : pandas.DataFrame
        Study-level dataset; only a known subset of columns is summarized.

    Returns
    -------
    (plotly.graph_objects.Figure, pandas.DataFrame) or None
        The completeness figure and per-variable summary table, or None
        when the input frame is empty.
    """
    if docs_df.empty:
        print("No data available")
        return None
    print("=== DATASET OVERVIEW ===")
    print(f"Total studies: {len(docs_df)}")
    print(f"Columns available: {len(docs_df.columns)}")
    # Key numeric columns for analysis
    numeric_cols = ['publication_year', 'sample_numeric', 'rigor_score', 'sdg_number']
    categorical_cols = [
        'world_bank_sector', 'research_design', 'data_collection_method',
        'analysis_type', 'study_countries', 'population', 'author_income_group',
        'has_validation', 'has_randomization', 'has_mixed_methods', 'has_advanced_analysis'
    ]
    # Filter to columns actually present in this dataset.
    available_numeric = [col for col in numeric_cols if col in docs_df.columns]
    available_categorical = [col for col in categorical_cols if col in docs_df.columns]
    print(f"Numeric variables: {available_numeric}")
    print(f"Categorical variables: {available_categorical}")
    # Build one summary row per variable (valid/missing counts + short stats).
    summary_data = []
    for col in available_numeric:
        values = pd.to_numeric(docs_df[col], errors='coerce').dropna()
        if len(values) > 0:
            summary_data.append({
                'Variable': col,
                'Type': 'Numeric',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"Mean: {values.mean():.1f}, Range: {values.min()}-{values.max()}"
            })
    for col in available_categorical:
        values = docs_df[col].dropna()
        if len(values) > 0:
            unique_count = values.nunique()
            # Guarded by len(values) > 0, so value_counts() is never empty.
            top_category = values.value_counts().index[0]
            summary_data.append({
                'Variable': col,
                'Type': 'Categorical',
                'Valid_Values': len(values),
                'Missing': len(docs_df) - len(values),
                'Summary': f"{unique_count} categories, Top: {top_category}"
            })
    summary_df = pd.DataFrame(summary_data)
    # Stacked bar chart: valid vs. missing record counts per variable.
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Valid_Values'],
        name='Valid Values',
        marker_color='steelblue',
        hovertemplate='<b>%{x}</b><br>Valid: %{y}<br>%{customdata}<extra></extra>',
        customdata=summary_df['Summary']
    ))
    fig.add_trace(go.Bar(
        x=summary_df['Variable'],
        y=summary_df['Missing'],
        name='Missing Values',
        marker_color='lightcoral'
    ))
    fig.update_layout(
        title='Data Completeness by Variable',
        xaxis_title='Variables',
        yaxis_title='Number of Records',
        barmode='stack',
        height=500,
        xaxis={'tickangle': 45}
    )
    fig.show()
    # Print summary table
    print("\n=== VARIABLE SUMMARY ===")
    for _, row in summary_df.iterrows():
        print(f"{row['Variable']} ({row['Type']}): {row['Valid_Values']}/{row['Valid_Values'] + row['Missing']} values - {row['Summary']}")
    return fig, summary_df
def create_pivot_analysis(docs_df, row_var, col_var, value_var=None, agg_func='count'):
    """Cross-tabulate two variables and render the result as a heatmap.

    When `value_var` names an existing column, it is aggregated with
    `agg_func`; otherwise a plain row-count crosstab is built.
    Returns (figure, pivot DataFrame) on success, else None.
    """
    if docs_df.empty:
        return None
    if row_var not in docs_df.columns or col_var not in docs_df.columns:
        print(f"Variables not found. Available: {list(docs_df.columns)}")
        return None
    try:
        use_value_column = value_var and value_var in docs_df.columns
        if use_value_column:
            # Aggregate the chosen value column across the two dimensions.
            pivot_df = docs_df.pivot_table(
                index=row_var, columns=col_var, values=value_var,
                aggfunc=agg_func, fill_value=0,
            )
            title = f"{agg_func.title()} of {value_var} by {row_var} and {col_var}"
        else:
            # Fall back to simple frequency counts.
            pivot_df = pd.crosstab(docs_df[row_var], docs_df[col_var])
            title = f"Study Count by {row_var} and {col_var}"
        fig = px.imshow(
            pivot_df.values,
            x=pivot_df.columns,
            y=pivot_df.index,
            color_continuous_scale='Viridis',
            title=title,
        )
        # Scale the figure with the table so labels stay legible.
        n_rows, n_cols = len(pivot_df.index), len(pivot_df.columns)
        fig.update_layout(height=max(400, n_rows * 30), width=max(600, n_cols * 50))
        fig.show()
        print(f"\nPivot Table: {row_var} × {col_var}")
        print(pivot_df.head(10))
        return fig, pivot_df
    except Exception as e:
        # Best-effort helper: report the problem rather than propagate it.
        print(f"Error creating pivot: {e}")
        return None
# Example usage functions
def explore_methodology_patterns(docs_df):
    """Cross-tabulate research design against World Bank sector, if present."""
    if docs_df.empty:
        return None
    needed = ('research_design', 'world_bank_sector')
    # Only run the pivot when both dimensions exist in this dataset.
    if all(col in docs_df.columns for col in needed):
        print("=== RESEARCH DESIGN BY SECTOR ===")
        return create_pivot_analysis(docs_df, 'world_bank_sector', 'research_design')
def explore_data_collection(docs_df):
    """Cross-tabulate data-collection method against author income group."""
    if docs_df.empty:
        return None
    needed = ('data_collection_method', 'author_income_group')
    # Only run the pivot when both dimensions exist in this dataset.
    if all(col in docs_df.columns for col in needed):
        print("=== DATA COLLECTION BY AUTHOR INCOME GROUP ===")
        return create_pivot_analysis(docs_df, 'author_income_group', 'data_collection_method')
def filter_and_analyze(docs_df, **filters):
    """Filter the study dataset by optional criteria and summarize the subset.

    Keyword filters (all optional; each is skipped if its column is absent):
        countries: str or list -- case-insensitive substring match on 'study_countries'
        sectors: str or list -- exact match on 'world_bank_sector'
        min_year / max_year: int -- bounds on 'publication_year'
        has_rct: bool -- keep rows whose 'has_randomization' is true/yes/1
        min_sample_size: number -- lower bound on 'sample_numeric'

    Returns
    -------
    pandas.DataFrame or None
        The filtered frame, or None when the input is empty or nothing matches.
    """
    if docs_df.empty:
        print("No data available")
        return None
    filtered = docs_df.copy()
    filter_summary = []
    # Country substring filter (OR across the requested countries).
    # Each filter now also checks its column exists, so partial datasets
    # no longer raise KeyError.
    if filters.get('countries') and 'study_countries' in filtered.columns:
        countries = filters['countries'] if isinstance(filters['countries'], list) else [filters['countries']]
        country_mask = filtered['study_countries'].str.contains('|'.join(countries), case=False, na=False)
        filtered = filtered[country_mask]
        filter_summary.append(f"Countries: {', '.join(countries)}")
    if filters.get('sectors') and 'world_bank_sector' in filtered.columns:
        sectors = filters['sectors'] if isinstance(filters['sectors'], list) else [filters['sectors']]
        filtered = filtered[filtered['world_bank_sector'].isin(sectors)]
        filter_summary.append(f"Sectors: {', '.join(sectors)}")
    # Year bounds: `is not None` so a literal 0 bound is not silently ignored.
    if filters.get('min_year') is not None and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col >= filters['min_year']]
        filter_summary.append(f"Year >= {filters['min_year']}")
    if filters.get('max_year') is not None and 'publication_year' in filtered.columns:
        year_col = pd.to_numeric(filtered['publication_year'], errors='coerce')
        filtered = filtered[year_col <= filters['max_year']]
        filter_summary.append(f"Year <= {filters['max_year']}")
    # BUG FIX: cast to str before .str.lower() so boolean-dtype columns
    # (True/False) don't break the .str accessor; 'True' -> 'true' still matches.
    if filters.get('has_rct') and 'has_randomization' in filtered.columns:
        rct_flags = filtered['has_randomization'].astype(str).str.lower()
        filtered = filtered[rct_flags.isin(['true', 'yes', '1'])]
        filter_summary.append("RCT studies only")
    if filters.get('min_sample_size') is not None and 'sample_numeric' in filtered.columns:
        sample_col = pd.to_numeric(filtered['sample_numeric'], errors='coerce')
        filtered = filtered[sample_col >= filters['min_sample_size']]
        filter_summary.append(f"Sample size >= {filters['min_sample_size']}")
    # Show results
    print("=== FILTERED ANALYSIS ===")
    print(f"Filters applied: {'; '.join(filter_summary) if filter_summary else 'None'}")
    print(f"Studies found: {len(filtered)}/{len(docs_df)}")
    if filtered.empty:
        print("No studies match the criteria.")
        return None
    # Quick distributional summary, only worthwhile for non-trivial subsets.
    if len(filtered) > 5:
        if 'world_bank_sector' in filtered.columns:
            print(f"\nTop sectors: {dict(filtered['world_bank_sector'].value_counts().head(3))}")
        if 'research_design' in filtered.columns:
            print(f"Research designs: {dict(filtered['research_design'].value_counts().head(3))}")
        if 'rigor_score' in filtered.columns:
            rigor_scores = pd.to_numeric(filtered['rigor_score'], errors='coerce').dropna()
            if len(rigor_scores) > 0:
                print(f"Rigor score: mean={rigor_scores.mean():.1f}, range={rigor_scores.min()}-{rigor_scores.max()}")
    return filtered
# Quick start function
def quick_analysis(docs_df):
    """Run a quick end-to-end analysis: overview, map, and sample pivots.

    Returns
    -------
    tuple
        (explorer figure or None, map figure or None, summary DataFrame or None).
    """
    print("Starting comprehensive data analysis...")
    # 1. Data overview.
    # BUG FIX: create_interactive_data_explorer returns None for an empty
    # dataset; unpacking None directly raised TypeError.
    explorer_result = create_interactive_data_explorer(docs_df)
    explorer_fig, summary_df = explorer_result if explorer_result is not None else (None, None)
    # 2. Map
    map_fig = create_world_map(docs_df)
    # 3. Sample pivot analyses (figures are shown; return values not needed).
    if len(docs_df) > 0:
        explore_methodology_patterns(docs_df)
        explore_data_collection(docs_df)
    return explorer_fig, map_fig, summary_df