"""Gradio dashboard for exploring data-quality scores across cleansing scenarios."""
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image

# ───────────────────────── 1. scenario scores ──────────────────────────
# Pre-computed quality scores for each cleansing scenario, indexed by the
# one-character scenario id ("0" / "A" / "B").
scores = pd.DataFrame(
    {
        "scenario": ["0", "A", "B"],
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    }
).set_index("scenario")

# Human-readable labels for the scenario ids above (used in plot titles).
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent + Low-urgency cleansing",
}

# ─────────────────── 2. long Markdown shown by the button ──────────────
# NOTE(review): this source arrived whitespace-mangled; the line breaks
# inside the markdown below are a best-effort reconstruction — confirm
# against the original file before relying on exact rendering.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919

#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:
- Completeness check
- Dist-outlier check
- Free-text check
- Model-based outlier

**Key questions with many issues**

- `enumerator_name` – 98 issues (free-text)
- `household_average_income_female_members` – 81 issues (outliers & completeness)
- `household_average_income` – 72 issues (outliers & completeness)
- `household_average_income_male_members` – 39 issues (completeness)
- `household_average_expenses_education` – 29 issues (outliers & completeness)
- `impact_contributions_other_factors` – 23 issues (completeness)
- `monthly_spend_on_healthcare` – 21 issues (completeness)

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions

| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions
No enumerator bias detected.
"""
# ───────────────────── 3. traffic-light plot helper ────────────────────
def traffic_plot(metric, scen):
    """Return a matplotlib Figure: a vertical traffic-light gauge.

    Parameters
    ----------
    metric : str
        Column name in the module-level ``scores`` DataFrame.
    scen : str
        Scenario id ("0", "A" or "B") — a row index of ``scores`` and a
        key of ``scenario_map``.

    The gauge shows red / amber / green bands (0–0.6 / 0.6–0.8 / 0.8–1)
    with a black horizontal marker at the scenario's score.
    """
    y = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # coloured bands
    ax.axhspan(0.00, 0.60, color="#ff4d4f", alpha=0.30)
    ax.axhspan(0.60, 0.80, color="#ffe58f", alpha=0.30)
    ax.axhspan(0.80, 1.00, color="#52c41a", alpha=0.30)

    # black line marker (xmin/xmax are axes fractions, so the line is centred)
    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

    # axes styling: keep only the left spine as a scale
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)
    for s in ax.spines.values():
        s.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)
    ax.set_title(
        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )
    plt.tight_layout()
    return fig


def make_images(scen):
    """Render the three score gauges for scenario *scen* as PIL images."""
    imgs = []
    for met in [
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ]:
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
        # BUG FIX: close *this* figure. The previous code called
        # plt.close() once after the loop, which closed only the current
        # (last) figure and leaked two figures per call.
        plt.close(fig)
        buf.seek(0)
        imgs.append(Image.open(buf))
    return imgs


# ───────────────────── 4. CSV-table simple filter ──────────────────────
CSV_FILE = "issues_log.csv"  # rename here if needed
# Loaded once at import time; raises FileNotFoundError if the CSV is absent.
df_full = pd.read_csv(CSV_FILE)


def filter_csv(col, val):
    """Return the rows of ``df_full`` whose *col* contains *val*.

    Matching is case-insensitive and NaNs never match. If either *col*
    or *val* is empty/falsy, the full table is returned unfiltered.
    NOTE(review): ``str.contains`` treats *val* as a regex by default —
    special characters in the filter value will be interpreted as such.
    """
    if col and val:
        mask = df_full[col].astype(str).str.contains(val, case=False, na=False)
        return df_full[mask]
    return df_full
# ───────────────────── 5. Gradio interface ─────────────────────────────
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario picker: label shown to the user, one-char id passed to callbacks.
    scen = gr.Dropdown(
        label="Scenario",
        choices=[
            ("No cleansing", "0"),
            ("Urgent cleansing", "A"),
            ("Urgent + Low-urgency cleansing", "B"),
        ],
        value="0",
    )

    # Three gauge images side-by-side, refreshed on scenario change and
    # pre-rendered for the baseline scenario when the page loads.
    with gr.Row():
        gauge_images = [gr.Image(), gr.Image(), gr.Image()]
    scen.change(make_images, scen, gauge_images)
    demo.load(lambda: make_images("0"), outputs=gauge_images)

    # Long markdown summary, hidden until the button is pressed.
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(
        lambda: gr.update(value=QUALITY_TEXT, visible=True),
        inputs=None,
        outputs=summary_md,
    )

    # ──────── table viewer with universal filter ─────────
    gr.Markdown("### Table 1-2 Viewer")
    column_names = list(df_full.columns)
    with gr.Row():
        col_dd = gr.Dropdown(
            label="Column", choices=column_names, value=column_names[0]
        )
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")
    table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")
    apply_b.click(filter_csv, [col_dd, val_tb], table_df)
    reset_b.click(lambda: df_full, None, table_df)

if __name__ == "__main__":
    demo.launch()