import io

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from PIL import Image


# Quality scores per cleansing scenario, indexed by scenario key
# ("0", "A", "B"); one column per metric.
scores = pd.DataFrame(
    {
        "scenario": ["0", "A", "B"],
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    }
).set_index("scenario")


# Human-readable label for each scenario key used to index `scores`.
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent + Low-urgency cleansing",
}


# Static Markdown report text. The figures are written for the
# "urgent cleansing" scenario and do not change with the selected scenario.
# NOTE(review): the dash/minus characters below were reconstructed from
# mis-encoded text (em dash for separators, U+2212 for the negative delta);
# confirm against the original report if exact typography matters.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919

#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:
- Completeness check
- Dist-outlier check
- Free-text check
- Model-based outlier

**Key questions with many issues**
- `enumerator_name` — 98 issues (free-text)
- `household_average_income_female_members` — 81 issues (outliers & completeness)
- `household_average_income` — 72 issues (outliers & completeness)
- `household_average_income_male_members` — 39 issues (completeness)
- `household_average_expenses_education` — 29 issues (outliers & completeness)
- `impact_contributions_other_factors` — 23 issues (completeness)
- `monthly_spend_on_healthcare` — 21 issues (completeness)

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions
| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions
No enumerator bias detected.
"""


def traffic_plot(metric, scen):
    """Draw a vertical traffic-light gauge for one metric of one scenario.

    Args:
        metric: Column name in the module-level ``scores`` frame.
        scen: Scenario key; a row label of ``scores`` and a key of
            ``scenario_map``.

    Returns:
        The matplotlib figure. The caller is responsible for closing it.

    Raises:
        KeyError: If ``scen`` or ``metric`` is not present in ``scores``.
    """
    y = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # Background bands: red for 0-0.60, amber for 0.60-0.80, green for 0.80-1.
    ax.axhspan(0.00, 0.60, color="#ff4d4f", alpha=0.30)
    ax.axhspan(0.60, 0.80, color="#ffe58f", alpha=0.30)
    ax.axhspan(0.80, 1.00, color="#52c41a", alpha=0.30)

    # Score marker: a thick horizontal bar spanning the middle of the axes.
    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

    # Fixed unit axes; only the band boundaries are labelled on the y axis.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # Hide every spine, then bring back a heavier left spine as the scale.
    for s in ax.spines.values():
        s.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)

    ax.set_title(
        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )
    # Lay out this figure explicitly instead of pyplot's "current figure".
    fig.tight_layout()
    return fig


def make_images(scen):
    """Render the three metric gauges for a scenario as PIL images.

    Args:
        scen: Scenario key passed through to ``traffic_plot``.

    Returns:
        A list of three ``PIL.Image`` objects, in the order consistency,
        representativity, integrity.
    """
    imgs = []
    for met in (
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ):
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close this specific figure, even if saving fails, so figures
            # do not accumulate in pyplot's registry across callbacks.
            plt.close(fig)
        buf.seek(0)
        imgs.append(Image.open(buf))
    return imgs


# CSV shown in the table viewer; read once when the module is imported.
# The path is relative, so it is resolved against the working directory.
CSV_FILE = "issues_log.csv"
df_full = pd.read_csv(CSV_FILE)


def filter_csv(col, val, df=None):
    """Return the rows whose ``col`` contains ``val`` as a literal substring.

    The match is case-insensitive and is done on the string form of each
    cell. The search text is taken literally, not as a regular expression,
    so input such as ``(`` or ``.`` neither raises nor over-matches.

    Args:
        col: Column name to search.
        val: Text to look for. If ``col`` or ``val`` is empty/``None`` the
            frame is returned unfiltered.
        df: Frame to filter; defaults to the module-level ``df_full``.

    Returns:
        The filtered frame, or the unfiltered frame when no filter applies.

    Raises:
        KeyError: If ``col`` is given but is not a column of the frame.
    """
    if df is None:
        df = df_full
    if col and val:
        mask = df[col].astype(str).str.contains(
            val, case=False, na=False, regex=False
        )
        return df[mask]
    return df


with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario selector; the values are the row labels of `scores`.
    scen = gr.Dropdown(
        label="Scenario",
        choices=[
            ("No cleansing", "0"),
            ("Urgent cleansing", "A"),
            ("Urgent + Low-urgency cleansing", "B"),
        ],
        value="0",
    )

    # One gauge image per metric, redrawn whenever the scenario changes and
    # drawn once for scenario "0" when the page loads.
    with gr.Row():
        im1, im2, im3 = gr.Image(), gr.Image(), gr.Image()
    scen.change(make_images, scen, [im1, im2, im3])
    demo.load(lambda: make_images("0"), outputs=[im1, im2, im3])

    # Summary report: hidden until the button is clicked.
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(
        lambda: gr.update(value=QUALITY_TEXT, visible=True),
        inputs=None,
        outputs=summary_md,
    )

    # CSV viewer with an optional substring filter on one column.
    gr.Markdown("### Table 1-2 Viewer")

    # Guard against a CSV with no columns so the default value cannot raise.
    columns = list(df_full.columns)
    with gr.Row():
        col_dd = gr.Dropdown(
            label="Column",
            choices=columns,
            value=columns[0] if columns else None,
        )
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")

    # NOTE(review): the label says "table_1_2.csv" but the data comes from
    # CSV_FILE ("issues_log.csv") - confirm which name is intended.
    table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")

    apply_b.click(filter_csv, [col_dd, val_tb], table_df)
    reset_b.click(lambda: df_full, None, table_df)


# Start the Gradio server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()