File size: 6,658 Bytes
44e4f59
 
f71a2f6
 
363a986
f71a2f6
c973292
 
 
 
 
9d688cb
c973292
 
 
 
 
 
 
 
 
 
 
 
f71a2f6
 
 
 
e83e816
 
 
 
f71a2f6
e83e816
f71a2f6
 
 
 
 
 
 
e83e816
c973292
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44e4f59
f71a2f6
 
 
e83e816
 
c973292
 
e83e816
 
 
 
 
 
 
 
 
c973292
f71a2f6
 
 
 
c973292
 
e83e816
c973292
e83e816
f71a2f6
 
 
 
e83e816
f71a2f6
 
c973292
 
 
 
e83e816
 
c973292
 
 
 
 
 
 
 
730ab8c
c973292
730ab8c
c973292
 
e83e816
 
 
 
 
c973292
 
 
 
 
 
730ab8c
 
 
c973292
 
e83e816
c973292
 
9d688cb
c973292
 
730ab8c
c973292
730ab8c
e83e816
 
 
 
730ab8c
c973292
d427422
361f65b
c973292
 
361f65b
 
 
 
c973292
 
e83e816
f71a2f6
 
c973292
 
 
 
 
 
 
e83e816
 
 
361f65b
f71a2f6
361f65b
c973292
 
 
361f65b
 
 
 
 
c973292
361f65b
c973292
 
112056c
 
 
 
 
 
 
 
 
 
 
 
44e4f59
 
f71a2f6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image

# ───────────────────────── 1. scenario scores ──────────────────────────
# Quality scores per cleansing scenario: one row per scenario code (the
# index, named "scenario") and one column per quality dimension.
scores = pd.DataFrame(
    data={
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    },
    index=pd.Index(["0", "A", "B"], name="scenario"),
)

# Human-readable label for each scenario code used as a row label above.
scenario_map = dict(
    zip(
        ("0", "A", "B"),
        (
            "No cleansing",
            "Urgent cleansing",
            "Urgent + Low-urgency cleansing",
        ),
    )
)

# ─────────────────── 2. long Markdown shown by the button ──────────────
# Markdown report revealed by the "Data Validation Summary" button.
# Lines that end in two spaces are Markdown hard line breaks; keep that
# trailing whitespace when editing. The representativity table uses real
# Unicode symbols (delta, em dash, plus-minus, minus) — they were previously
# stored as mis-decoded byte sequences and rendered as garbage.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score**              : 0.980  
- **Overall Representativity Score** : 0.795  
- **Integrity Score**                : 0.984  
- **Overall Data Quality Score**     : 0.919  

#### Summary  
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:  
- Completeness check  
- Dist-outlier check  
- Free-text check  
- Model-based outlier  

**Key questions with many issues**  
- `enumerator_name` – 98 issues (free-text)  
- `household_average_income_female_members` – 81 issues (outliers & completeness)  
- `household_average_income` – 72 issues (outliers & completeness)  
- `household_average_income_male_members` – 39 issues (completeness)  
- `household_average_expenses_education` – 29 issues (outliers & completeness)  
- `impact_contributions_other_factors` – 23 issues (completeness)  
- `monthly_spend_on_healthcare` – 21 issues (completeness)  

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions
| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions
No enumerator bias detected.
"""

# ───────────────────── 3. traffic-light plot helper ────────────────────
def traffic_plot(metric, scen):
    """Draw a traffic-light gauge for one metric of one scenario.

    The gauge is a tall, narrow axes with three translucent horizontal bands
    (red 0-0.6, yellow 0.6-0.8, green 0.8-1) and a short black horizontal
    line at the score looked up in ``scores``.

    Args:
        metric: Column label in ``scores`` (e.g. ``"consistency_score"``).
        scen: Row label in ``scores``; also the key used to look up the
            title text in ``scenario_map``.

    Returns:
        The matplotlib figure. It is created through ``plt.subplots``, so it
        stays registered with pyplot until the caller closes it.

    Raises:
        KeyError: If ``metric`` or ``scen`` is not present in ``scores`` /
            ``scenario_map``.
    """
    y = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # Coloured bands: (lower bound, upper bound, colour).
    bands = (
        (0.00, 0.60, "#ff4d4f"),
        (0.60, 0.80, "#ffe58f"),
        (0.80, 1.00, "#52c41a"),
    )
    for lower, upper, colour in bands:
        ax.axhspan(lower, upper, color=colour, alpha=0.30)

    # Black marker line; xmin/xmax are axes fractions, so it spans the
    # middle 40% of the width.
    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

    # Axes styling: no x ticks, y ticks only at the band boundaries.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # Hide every spine, then re-enable the left one as a thicker scale bar.
    for s in ax.spines.values():
        s.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)

    ax.set_title(
        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )

    # Lay out this figure explicitly: plt.tight_layout() targets pyplot's
    # global "current figure", which may be a different figure when several
    # callbacks run at the same time.
    fig.tight_layout()
    return fig


def make_images(scen):
    """Render the three traffic-light gauges for a scenario as PIL images.

    Each gauge is drawn with ``traffic_plot``, saved as PNG into an in-memory
    buffer and reopened with PIL.

    Args:
        scen: Scenario code passed through to ``traffic_plot``.

    Returns:
        A list of three PIL images, in the order consistency,
        representativity, integrity.
    """
    imgs = []
    for met in (
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ):
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close exactly the figure drawn above, even if saving failed.
            # A bare plt.close() closes pyplot's current figure, which is
            # not guaranteed to be this one.
            plt.close(fig)
        buf.seek(0)
        imgs.append(Image.open(buf))
    return imgs


# ───────────────────── 4. CSV-table simple filter ──────────────────────
# Path of the CSV shown in the table viewer; rename here if needed.
CSV_FILE = "issues_log.csv"
# Loaded once at import time; the callbacks below reuse this frame.
df_full = pd.read_csv(CSV_FILE)

def filter_csv(col, val):
    """Return the rows of ``df_full`` whose ``col`` contains ``val``.

    Cells are converted to strings and tested with a case-insensitive,
    literal substring match. The value is deliberately NOT interpreted as a
    regular expression, so input such as ``(`` or ``.`` is matched verbatim
    instead of raising ``re.error`` or matching every row.

    Args:
        col: Name of the column to search.
        val: Text to look for.

    Returns:
        The matching rows, or ``df_full`` itself when ``col`` or ``val`` is
        empty/None.

    Raises:
        KeyError: If ``col`` is not a column of ``df_full``.
    """
    if not (col and val):
        return df_full
    cells = df_full[col].astype(str)
    mask = cells.str.contains(val, case=False, na=False, regex=False)
    return df_full[mask]

# ───────────────────── 5. Gradio interface ─────────────────────────────
# Layout: scenario dropdown -> three gauge images -> summary button ->
# filterable view of the issues CSV.
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Choices are (label, value) pairs built from scenario_map so the
    # dropdown labels cannot drift from the plot titles.
    scen = gr.Dropdown(
        label="Scenario",
        choices=[(label, code) for code, label in scenario_map.items()],
        value="0",
    )

    # plots side-by-side
    with gr.Row():
        im1, im2, im3 = gr.Image(), gr.Image(), gr.Image()
    scen.change(make_images, scen, [im1, im2, im3])
    # The initial render reads the dropdown's own value instead of repeating
    # the default scenario code here.
    demo.load(make_images, scen, [im1, im2, im3])

    # summary text button: the Markdown starts hidden and is filled and
    # revealed on click
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(
        lambda: gr.update(value=QUALITY_TEXT, visible=True),
        inputs=None,
        outputs=summary_md,
    )

    # ──────── table viewer with universal filter ─────────
    gr.Markdown("### Table 1-2 Viewer")

    columns = list(df_full.columns)
    with gr.Row():
        col_dd = gr.Dropdown(
            label="Column",
            choices=columns,
            # Guard against a CSV with no columns instead of raising
            # IndexError while the UI is being built.
            value=columns[0] if columns else None,
        )
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")

    # Label the table with the file that was actually loaded.
    table_df = gr.Dataframe(value=df_full, label=CSV_FILE)

    apply_b.click(filter_csv, [col_dd, val_tb], table_df)
    reset_b.click(lambda: df_full, None, table_df)

if __name__ == "__main__":
    demo.launch()