| | import gradio as gr |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | import io |
| | from PIL import Image |
| |
|
| | |
# Quality scores per cleansing scenario. Rows are labelled by scenario code
# ("0", "A", "B"); each column holds one quality dimension.
scores = pd.DataFrame(
    data={
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    },
    index=pd.Index(["0", "A", "B"], name="scenario"),
)
| |
|
# Display label for each scenario code; the codes are the row labels of
# `scores`.
scenario_map = dict(
    zip(
        ("0", "A", "B"),
        (
            "No cleansing",
            "Urgent cleansing",
            "Urgent + Low-urgency cleansing",
        ),
    )
)
| |
|
| | |
# Static Markdown report shown when the "Data Validation Summary" button is
# clicked. The figures quoted are those of scenario "A" (urgent cleansing).
# NOTE(review): the dash, delta, plus-minus and minus characters below were
# restored from mis-decoded UTF-8; confirm the exact dash style against the
# original report if it matters.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919

#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:
- Completeness check
- Dist-outlier check
- Free-text check
- Model-based outlier

**Key questions with many issues**
- `enumerator_name` — 98 issues (free-text)
- `household_average_income_female_members` — 81 issues (outliers & completeness)
- `household_average_income` — 72 issues (outliers & completeness)
- `household_average_income_male_members` — 39 issues (completeness)
- `household_average_expenses_education` — 29 issues (outliers & completeness)
- `impact_contributions_other_factors` — 23 issues (completeness)
- `monthly_spend_on_healthcare` — 21 issues (completeness)

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions
| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions
No enumerator bias detected.
"""
| |
|
| | |
def traffic_plot(metric, scen):
    """Draw a vertical traffic-light gauge for one score.

    Args:
        metric: Column name in ``scores`` (e.g. ``"consistency_score"``).
        scen: Scenario code; a row label of ``scores`` and a key of
            ``scenario_map``.

    Returns:
        The Matplotlib figure. It is created through pyplot, so the caller
        is responsible for closing it.
    """
    score = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # Background bands: red up to 0.60, amber up to 0.80, green up to 1.00.
    bands = (
        (0.00, 0.60, "#ff4d4f"),
        (0.60, 0.80, "#ffe58f"),
        (0.80, 1.00, "#52c41a"),
    )
    for lower, upper, colour in bands:
        ax.axhspan(lower, upper, color=colour, alpha=0.30)

    # Marker for the score itself, covering the middle 40% of the width.
    ax.axhline(score, color="black", lw=3, xmin=0.30, xmax=0.70)

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # Keep only the left spine, drawn thicker, as the gauge's scale.
    for side, spine in ax.spines.items():
        spine.set_visible(side == "left")
    ax.spines["left"].set_linewidth(2)

    heading = metric.replace("_", " ").title()
    ax.set_title(
        f"{heading}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )
    fig.tight_layout()
    return fig
| |
|
| |
|
def make_images(scen):
    """Render the three score gauges of a scenario as PIL images.

    Args:
        scen: Scenario code passed through to ``traffic_plot``.

    Returns:
        A list of three ``PIL.Image`` objects, in the order consistency,
        representativity, integrity.
    """
    metrics = (
        "consistency_score",
        "representativity_score",
        "integrity_score",
    )
    imgs = []
    for met in metrics:
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close this exact figure rather than pyplot's "current" one, and
            # do it even when saving fails, so figures never pile up in
            # pyplot's global registry.
            plt.close(fig)
        buf.seek(0)
        # Image.open reads lazily; the image keeps `buf` alive until decoded.
        imgs.append(Image.open(buf))
    return imgs
| |
|
| |
|
| | |
# Path of the CSV shown in the table viewer; it is read once, at import time.
CSV_FILE = "issues_log.csv"
df_full = pd.read_csv(filepath_or_buffer=CSV_FILE)
| |
|
def filter_csv(col, val):
    """Return the rows of ``df_full`` whose ``col`` contains ``val``.

    The match is a case-insensitive, literal substring test against the
    string form of each cell. Missing cells never match.

    Args:
        col: Name of the column to search. A falsy value disables filtering.
        val: Text to look for. A falsy value disables filtering.

    Returns:
        The matching rows, or ``df_full`` itself when no filter is given.
    """
    if not (col and val):
        return df_full

    column = df_full[col]
    # regex=False: `val` is typed by the user, so characters such as "(" or
    # "." must be matched literally instead of being compiled as a pattern
    # (an unbalanced "(" would otherwise raise re.error).
    hits = column.astype(str).str.contains(
        val, case=False, na=False, regex=False
    )
    # astype(str) can turn missing cells into text such as "nan" or "None";
    # mask them out so they cannot match by accident.
    return df_full[hits & column.notna()]
| |
|
| | |
# Build the Gradio UI: scenario selector, three score gauges, an on-demand
# summary report, and a filterable view of the issues table.
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # (label, value) pairs; the values are the scenario codes.
    scen = gr.Dropdown(
        label="Scenario",
        choices=[(label, code) for code, label in scenario_map.items()],
        value="0",
    )

    # One gauge per quality dimension, redrawn when the scenario changes and
    # drawn once for scenario "0" when the page loads.
    with gr.Row():
        im1, im2, im3 = (gr.Image() for _ in range(3))
    scen.change(fn=make_images, inputs=scen, outputs=[im1, im2, im3])
    demo.load(fn=lambda: make_images("0"), outputs=[im1, im2, im3])

    # The report stays hidden until the button is clicked.
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(
        fn=lambda: gr.update(value=QUALITY_TEXT, visible=True),
        inputs=None,
        outputs=summary_md,
    )

    gr.Markdown("### Table 1-2 Viewer")

    column_names = list(df_full.columns)
    with gr.Row():
        col_dd = gr.Dropdown(
            label="Column",
            choices=column_names,
            value=column_names[0],
        )
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")

    table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")

    apply_b.click(fn=filter_csv, inputs=[col_dd, val_tb], outputs=table_df)
    reset_b.click(fn=lambda: df_full, inputs=None, outputs=table_df)


# Start the server only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()
| |
|