File size: 6,658 Bytes
44e4f59 f71a2f6 363a986 f71a2f6 c973292 9d688cb c973292 f71a2f6 e83e816 f71a2f6 e83e816 f71a2f6 e83e816 c973292 44e4f59 f71a2f6 e83e816 c973292 e83e816 c973292 f71a2f6 c973292 e83e816 c973292 e83e816 f71a2f6 e83e816 f71a2f6 c973292 e83e816 c973292 730ab8c c973292 730ab8c c973292 e83e816 c973292 730ab8c c973292 e83e816 c973292 9d688cb c973292 730ab8c c973292 730ab8c e83e816 730ab8c c973292 d427422 361f65b c973292 361f65b c973292 e83e816 f71a2f6 c973292 e83e816 361f65b f71a2f6 361f65b c973292 361f65b c973292 361f65b c973292 112056c 44e4f59 f71a2f6 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 | import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image
# ───────────────────────── 1. scenario scores ──────────────────────────
# Quality scores per cleansing scenario; rows are keyed by the scenario code
# ("0", "A", "B") and each column holds one quality dimension.
scores = pd.DataFrame(
    data={
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    },
    index=pd.Index(["0", "A", "B"], name="scenario"),
)

# Human-readable label for every scenario code, used in plot titles.
scenario_map = dict(
    zip(
        ["0", "A", "B"],
        [
            "No cleansing",
            "Urgent cleansing",
            "Urgent + Low-urgency cleansing",
        ],
    )
)
# ─────────────────── 2. long Markdown shown by the button ──────────────
# Static Markdown report shown when the "Data Validation Summary" button is
# clicked. The figures are hard-coded for the "urgent cleansing only" scenario
# and do not follow the scenario dropdown.
#
# Blank lines between block elements are intentional: in CommonMark a "---"
# line directly under a paragraph turns that paragraph into a heading instead
# of drawing a horizontal rule, and tables/lists render most reliably when
# they are separated from the preceding paragraph.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919

#### Summary

The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions

*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:

- Completeness check
- Dist-outlier check
- Free-text check
- Model-based outlier

**Key questions with many issues**

- `enumerator_name` — 98 issues (free-text)
- `household_average_income_female_members` — 81 issues (outliers & completeness)
- `household_average_income` — 72 issues (outliers & completeness)
- `household_average_income_male_members` — 39 issues (completeness)
- `household_average_expenses_education` — 29 issues (outliers & completeness)
- `impact_contributions_other_factors` — 23 issues (completeness)
- `monthly_spend_on_healthcare` — 21 issues (completeness)

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions

Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions

| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions

No enumerator bias detected.
"""
# ───────────────────── 3. traffic-light plot helper ────────────────────
def traffic_plot(metric, scen):
    """Draw a vertical traffic-light gauge for one score of one scenario.

    The axis runs from 0 to 1 and is split into three shaded bands
    (0-0.6 red, 0.6-0.8 yellow, 0.8-1 green); a short black horizontal line
    marks the score read from ``scores``.

    Args:
        metric: Column name in ``scores`` (e.g. ``"consistency_score"``).
        scen: Scenario code used as row label in ``scores`` and as key in
            ``scenario_map``.

    Returns:
        The matplotlib figure holding the gauge. The caller is responsible
        for closing it.

    Raises:
        KeyError: If ``scen`` or ``metric`` is not present in ``scores``.
    """
    y = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # coloured bands
    for lower, upper, colour in (
        (0.00, 0.60, "#ff4d4f"),
        (0.60, 0.80, "#ffe58f"),
        (0.80, 1.00, "#52c41a"),
    ):
        ax.axhspan(lower, upper, color=colour, alpha=0.30)

    # black line marker, drawn across the middle 40 % of the axis width
    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

    # axes styling
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # hide every spine, then bring back a thick left spine as the scale
    for s in ax.spines.values():
        s.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)

    ax.set_title(
        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )

    # Lay out this figure explicitly rather than pyplot's implicit "current
    # figure", which may be a different one when callbacks run concurrently.
    fig.tight_layout()
    return fig
def make_images(scen):
    """Render the three score gauges of a scenario as PIL images.

    Args:
        scen: Scenario code passed through to ``traffic_plot``.

    Returns:
        A list of three PIL images, in the order consistency,
        representativity, integrity.
    """
    imgs = []
    for met in (
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ):
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close exactly this figure (even if saving fails) so figures do
            # not pile up in pyplot's registry across repeated callbacks.
            plt.close(fig)
        buf.seek(0)
        imgs.append(Image.open(buf))
    return imgs
# ───────────────────── 4. CSV-table simple filter ──────────────────────
# Path of the CSV displayed in the table viewer; edit this one constant to
# point the app at a differently named file.
CSV_FILE = "issues_log.csv"
# Parsed a single time at import so every callback shares the same frame.
df_full = pd.read_csv(filepath_or_buffer=CSV_FILE)
def filter_csv(col, val, df=None):
    """Return the rows whose ``col`` cell contains ``val`` (case-insensitive).

    Args:
        col: Name of the column to search. A falsy value disables filtering.
        val: Text to look for. It is matched literally, so characters such
            as ``(`` or ``*`` are not interpreted as a regular expression.
            A falsy value disables filtering.
        df: Frame to filter; defaults to the module-level ``df_full``.

    Returns:
        The matching rows, or the unfiltered frame when ``col`` or ``val``
        is empty.

    Raises:
        KeyError: If ``col`` is not a column of the frame.
    """
    frame = df_full if df is None else df
    if not (col and val):
        return frame

    column = frame[col]
    # Missing cells become "nan"/"None" once cast to str; exclude them up
    # front so a filter such as "n" does not match empty cells.
    mask = column.notna() & column.astype(str).str.contains(
        val, case=False, na=False, regex=False
    )
    return frame[mask]
# ───────────────────── 5. Gradio interface ─────────────────────────────
# Build the Gradio UI: a scenario dropdown that drives three gauge images, a
# button that reveals the static summary text, and a filterable view of the
# CSV loaded into df_full.
# NOTE(review): indentation was lost in this copy of the file, so which
# widgets sit inside each gr.Row() cannot be determined from here — restore it
# from the original before running.
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
gr.Markdown("## Data Quality Scenario Explorer")
# Each choice is a (label, value) pair; the value is the scenario code that
# make_images receives.
scen = gr.Dropdown(
label="Scenario",
choices=[
("No cleansing", "0"),
("Urgent cleansing", "A"),
("Urgent + Low-urgency cleansing", "B"),
],
value="0",
)
# plots side-by-side
with gr.Row():
im1, im2, im3 = gr.Image(), gr.Image(), gr.Image()
# Re-render the three gauges whenever the dropdown value changes.
scen.change(make_images, scen, [im1, im2, im3])
# Render scenario "0" (the dropdown's initial value) when the page loads.
demo.load(lambda: make_images("0"), outputs=[im1, im2, im3])
# summary text button
summary_btn = gr.Button("Data Validation Summary")
# Created hidden; the click handler below fills in QUALITY_TEXT and makes it
# visible.
summary_md = gr.Markdown(visible=False)
summary_btn.click(lambda: gr.update(value=QUALITY_TEXT, visible=True),
inputs=None, outputs=summary_md)
# ──────── table viewer with universal filter ─────────
gr.Markdown("### Table 1-2 Viewer")
with gr.Row():
# Column selector defaults to the first column of df_full.
col_dd = gr.Dropdown(label="Column",
choices=list(df_full.columns),
value=list(df_full.columns)[0])
val_tb = gr.Textbox(label="Filter value (optional)")
apply_b = gr.Button("Apply Filter")
reset_b = gr.Button("Show All")
# NOTE(review): the label reads "table_1_2.csv" while the data comes from
# CSV_FILE ("issues_log.csv") — confirm which name is intended.
table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")
# "Apply Filter" passes the chosen column and text to filter_csv.
apply_b.click(filter_csv, [col_dd, val_tb], table_df)
# "Show All" takes no inputs and puts the unfiltered frame back in the table.
reset_b.click(lambda: df_full, None, table_df)
# Launch the app only when the file is executed as a script.
if __name__ == "__main__":
demo.launch()
|