fortuala committed on
Commit
c973292
·
verified ·
1 Parent(s): e83e816

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +105 -114
app.py CHANGED
@@ -4,25 +4,24 @@ import matplotlib.pyplot as plt
4
  import io
5
  from PIL import Image
6
 
7
- # ───────────────────────────────────────────
8
- # 1. Static data for the three scenarios
9
- # ───────────────────────────────────────────
10
- data = pd.DataFrame({
11
- "scenario": ["0", "A", "B"],
12
- "consistency_score": [0.954451, 0.979592, 1.000000],
13
- "overall_representativity_score": [0.79486, 0.79486, 0.75695],
14
- "integrity_score": [0.983921, 0.983921, 0.983921],
15
- "data_quality_score": [0.911077, 0.919457, 0.913624]
16
- }).set_index("scenario")
17
-
18
- scenario_map = {"0": "No cleansing",
19
- "A": "Urgent cleansing",
20
- "B": "Urgent + Low-urgency cleansing"}
21
-
22
- # ───────────────────────────────────────────
23
- # 2. Full analysis text (shown for Scenario A)
24
- # ───────────────────────────────────────────
25
- QUALITY_TEXT = """\
26
  ### Overall Data Quality Analysis
27
 
28
  After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
@@ -41,29 +40,29 @@ The overall data quality score is satisfactory, but the following dimensions req
41
  *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
42
 
43
  The following dimensions are evaluated for consistency:
44
- - Completeness check (missing answers)
45
- - Dist-outlier check (extreme values)
46
- - Free-text check (short answers)
47
- - Model-based outlier (inconsistent values)
48
-
49
- **Key questions with many issues:**
50
- - `enumerator_name` — 98 issues (mainly free-text)
51
- - `household_average_income_female_members` — 81 issues (outliers & completeness)
52
- - `household_average_income` — 72 issues (outliers & completeness)
53
- - `household_average_income_male_members` — 39 issues (completeness)
54
- - `household_average_expenses_education` — 29 issues (outliers & completeness)
55
- - `impact_contributions_other_factors` — 23 issues (completeness)
56
- - `monthly_spend_on_healthcare` — 21 issues (completeness)
57
-
58
- For a detailed view of consistency issues, see the **Data Consistency Issues Deep Dive** tab.
59
 
60
  ---
61
 
62
  ### Integrity Action Suggestions
63
  Respondent `_index: 1` shows low integrity scores:
64
 
65
- | Check | Score | |
66
- |-------|-------|---|
67
  | Payment for Survey | 0/1 |
68
  | Respondent Influenced | 0/1 |
69
  | Response Time Integrity | 0.0/1 |
@@ -73,15 +72,15 @@ Respondent `_index: 1` shows low integrity scores:
73
  | Name Check | 0.0/1 |
74
  | Location Check | 0/1 |
75
 
76
- For definitions and more respondents, open the **Integrity Issues Deep Dive** tab.
77
 
78
  ---
79
 
80
  ### Representativity Action Suggestions
81
- | Scenario | Score | Δ vs. Baseline |
82
- |----------|-------|----------------|
83
  | Baseline (0) | 0.795 | — |
84
- | Urgent cleansing (A) | 0.795 | ±0.000 |
85
  | +Low-urgency cleansing (B) | 0.757 | −0.038 |
86
 
87
  ---
@@ -90,111 +89,103 @@ For definitions and more respondents, open the **Integrity Issues Deep Dive** ta
90
  No enumerator bias detected.
91
  """
92
 
93
- # ───────────────────────────────────────────
94
- # 3. Traffic-light plot helper
95
- # ───────────────────────────────────────────
96
- def make_plot(dim, scen):
97
- val = data.loc[scen, dim]
98
- fig, ax = plt.subplots(figsize=(4, 7))
99
 
100
  # coloured bands
101
- ax.axhspan(0, 0.60, color="#FF4D4F", alpha=0.3) # red
102
- ax.axhspan(0.60, 0.80, color="#FFE58F", alpha=0.3) # yellow
103
- ax.axhspan(0.80, 0.95, color="#52C41A", alpha=0.3) # green
104
-
105
- # black marker
106
- ax.axhline(val, color="black", lw=2, xmin=0.35, xmax=0.65)
107
- ax.annotate(f"{val:.3f}", (0.5, val),
108
- xycoords=("axes fraction", "data"),
109
- ha="center", va="bottom",
110
- fontsize=22, weight="bold",
111
- bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.85))
112
-
113
- # cosmetics
114
  ax.set_xlim(0, 1)
115
- ax.set_ylim(0, 0.95)
116
  ax.set_xticks([])
117
- ax.set_yticks([0, 0.6, 0.8, 0.95])
118
- ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=14)
119
  for s in ax.spines.values():
120
  s.set_visible(False)
121
  ax.spines["left"].set_visible(True)
122
  ax.spines["left"].set_linewidth(2)
123
 
124
- ax.set_title(f"{dim.replace('_', ' ').title()}\n({scenario_map[scen]})",
125
- fontsize=15, weight="bold", pad=8)
 
 
 
 
126
  plt.tight_layout()
127
  return fig
128
 
129
- def get_plots(scen):
 
130
  imgs = []
131
- for dim in ["consistency_score",
132
- "overall_representativity_score",
133
- "integrity_score"]:
 
 
134
  buf = io.BytesIO()
135
- make_plot(dim, scen).savefig(buf, format="png", bbox_inches="tight")
136
  buf.seek(0)
137
  imgs.append(Image.open(buf))
138
  plt.close()
139
  return imgs
140
 
141
- # ───────────────────────────────────────────
142
- # 4. CSV-table filter helper
143
- # ───────────────────────────────────────────
144
- CSV_FILE = "table_1_2.csv" # change if your file has a different name
145
 
146
- def filter_table(col, val):
 
 
 
 
147
  df = pd.read_csv(CSV_FILE)
148
  if col and val and col in df.columns:
149
- return df[df[col].astype(str).str.contains(str(val), case=False, na=False)]
150
  return df
151
 
152
- # ───────────────────────────────────────────
153
- # 5. Gradio UI
154
- # ───────────────────────────────────────────
155
  with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
156
  gr.Markdown("## Data Quality Scenario Explorer")
157
 
158
- # Scenario selector
159
- scenario = gr.Dropdown(
160
- label="Select Scenario",
161
- choices=[("No cleansing", "0"),
162
- ("Urgent cleansing", "A"),
163
- ("Urgent + Low-urgency cleansing", "B")],
 
164
  value="0",
165
  )
166
 
167
- # Three traffic-light plots
168
- img1 = gr.Image(label="Consistency")
169
- img2 = gr.Image(label="Representativity")
170
- img3 = gr.Image(label="Integrity")
171
- scenario.change(get_plots, scenario, [img1, img2, img3])
172
- demo.load(lambda: get_plots("0"), outputs=[img1, img2, img3])
173
-
174
- # Button → show full analysis text
175
- show_btn = gr.Button("Show Overall Data Quality Analysis")
176
- analysis_md = gr.Markdown(visible=False)
177
-
178
- def show_analysis(scen):
179
- return gr.update(
180
- value=QUALITY_TEXT if scen == "A" else
181
- "Please select **Urgent cleansing (Scenario A)** to view the detailed analysis.",
182
- visible=True
183
- )
184
- show_btn.click(show_analysis, scenario, analysis_md)
185
-
186
- # ───────── table section ─────────
187
- gr.Markdown("### Data Consistency Issues – Table 1.2")
188
-
189
  with gr.Row():
190
- col_in = gr.Textbox(label="Column (optional)", placeholder="e.g. question")
191
- val_in = gr.Textbox(label="Value (optional)", placeholder="e.g. income")
192
- apply_btn = gr.Button("Apply / Refresh")
193
-
194
- table_out = gr.Dataframe(label="Filtered table_1_2.csv")
195
-
196
- apply_btn.click(filter_table, [col_in, val_in], table_out)
197
- demo.load(lambda: filter_table("", ""), outputs=table_out)
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
  if __name__ == "__main__":
200
  demo.launch()
 
4
  import io
5
  from PIL import Image
6
 
7
+ # ───────────────────────── 1. scenario scores ──────────────────────────
8
+ scores = pd.DataFrame(
9
+ {
10
+ "scenario": ["0", "A", "B"],
11
+ "consistency_score": [0.954451, 0.979592, 1.000000],
12
+ "overall_representativity_score": [0.79486, 0.79486, 0.75695],
13
+ "integrity_score": [0.983921, 0.983921, 0.983921],
14
+ }
15
+ ).set_index("scenario")
16
+
17
+ scenario_map = {
18
+ "0": "No cleansing",
19
+ "A": "Urgent cleansing",
20
+ "B": "Urgent + Low-urgency cleansing",
21
+ }
22
+
23
+ # ─────────────────── 2. long Markdown shown by the button ──────────────
24
+ QUALITY_TEXT = """
 
25
  ### Overall Data Quality Analysis
26
 
27
  After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
 
40
  *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
41
 
42
  The following dimensions are evaluated for consistency:
43
+ - Completeness check
44
+ - Dist-outlier check
45
+ - Free-text check
46
+ - Model-based outlier
47
+
48
+ **Key questions with many issues**
49
+ - `enumerator_name` – 98 issues (free-text)
50
+ - `household_average_income_female_members` – 81 issues (outliers & completeness)
51
+ - `household_average_income` – 72 issues (outliers & completeness)
52
+ - `household_average_income_male_members` – 39 issues (completeness)
53
+ - `household_average_expenses_education` – 29 issues (outliers & completeness)
54
+ - `impact_contributions_other_factors` – 23 issues (completeness)
55
+ - `monthly_spend_on_healthcare` – 21 issues (completeness)
56
+
57
+ For full details see the **Data Consistency Issues Deep Dive** tab.
58
 
59
  ---
60
 
61
  ### Integrity Action Suggestions
62
  Respondent `_index: 1` shows low integrity scores:
63
 
64
+ | Check | Score |
65
+ |-------|------:|
66
  | Payment for Survey | 0/1 |
67
  | Respondent Influenced | 0/1 |
68
  | Response Time Integrity | 0.0/1 |
 
72
  | Name Check | 0.0/1 |
73
  | Location Check | 0/1 |
74
 
75
+ See **Integrity Issues Deep Dive** for more respondents.
76
 
77
  ---
78
 
79
  ### Representativity Action Suggestions
80
+ | Scenario | Score | Δ vs Baseline |
81
+ |----------|-------|--------------|
82
  | Baseline (0) | 0.795 | — |
83
+ | Urgent cleansing (A) | 0.795 | ±0.000 |
84
  | +Low-urgency cleansing (B) | 0.757 | −0.038 |
85
 
86
  ---
 
89
  No enumerator bias detected.
90
  """
91
 
92
+ # ───────────────────── 3. traffic-light plot helper ────────────────────
93
+ def traffic_plot(metric, scen):
94
+ y = scores.loc[scen, metric]
95
+ fig, ax = plt.subplots(figsize=(3, 6))
 
 
96
 
97
  # coloured bands
98
+ ax.axhspan(0.00, 0.60, color="#ff4d4f", alpha=0.30)
99
+ ax.axhspan(0.60, 0.80, color="#ffe58f", alpha=0.30)
100
+ ax.axhspan(0.80, 1.00, color="#52c41a", alpha=0.30)
101
+
102
+ # black line marker
103
+ ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)
104
+
105
+ # axes styling
 
 
 
 
 
106
  ax.set_xlim(0, 1)
107
+ ax.set_ylim(0, 1)
108
  ax.set_xticks([])
109
+ ax.set_yticks([0, .6, .8, 1])
110
+ ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)
111
  for s in ax.spines.values():
112
  s.set_visible(False)
113
  ax.spines["left"].set_visible(True)
114
  ax.spines["left"].set_linewidth(2)
115
 
116
+ ax.set_title(
117
+ f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
118
+ fontsize=12,
119
+ weight="bold",
120
+ pad=6,
121
+ )
122
  plt.tight_layout()
123
  return fig
124
 
125
+
126
+ def make_images(scen):
127
  imgs = []
128
+ for met in [
129
+ "consistency_score",
130
+ "overall_representativity_score",
131
+ "integrity_score",
132
+ ]:
133
  buf = io.BytesIO()
134
+ traffic_plot(met, scen).savefig(buf, format="png", bbox_inches="tight")
135
  buf.seek(0)
136
  imgs.append(Image.open(buf))
137
  plt.close()
138
  return imgs
139
 
 
 
 
 
140
 
141
+ # ───────────────────── 4. CSV-table simple filter ──────────────────────
142
+ CSV_FILE = "table_1_2.csv" # rename if needed
143
+
144
+
145
+ def filter_csv(col, val):
146
  df = pd.read_csv(CSV_FILE)
147
  if col and val and col in df.columns:
148
+ return df[df[col].astype(str).str.contains(val, case=False, na=False)]
149
  return df
150
 
151
+
152
+ # ───────────────────── 5. Gradio interface ─────────────────────────────
 
153
  with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
154
  gr.Markdown("## Data Quality Scenario Explorer")
155
 
156
+ scen = gr.Dropdown(
157
+ label="Scenario",
158
+ choices=[
159
+ ("No cleansing", "0"),
160
+ ("Urgent cleansing", "A"),
161
+ ("Urgent + Low-urgency cleansing", "B"),
162
+ ],
163
  value="0",
164
  )
165
 
166
+ # three images side-by-side
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
  with gr.Row():
168
+ im1 = gr.Image()
169
+ im2 = gr.Image()
170
+ im3 = gr.Image()
171
+ scen.change(make_images, scen, [im1, im2, im3])
172
+ demo.load(lambda: make_images("0"), outputs=[im1, im2, im3])
173
+
174
+ # button -> full markdown
175
+ show_btn = gr.Button("Data Validation Summary")
176
+ summary_md = gr.Markdown(visible=False)
177
+ show_btn.click(lambda: gr.update(value=QUALITY_TEXT, visible=True),
178
+ inputs=None, outputs=summary_md)
179
+
180
+ # table viewer
181
+ gr.Markdown("### Table 1-2 Viewer")
182
+ col_in = gr.Textbox(label="Column (optional)")
183
+ val_in = gr.Textbox(label="Value (optional)")
184
+ tbl_btn = gr.Button("Show / Filter Table")
185
+ tbl_out = gr.Dataframe()
186
+
187
+ tbl_btn.click(filter_csv, [col_in, val_in], tbl_out)
188
+ demo.load(lambda: filter_csv("", ""), outputs=tbl_out)
189
 
190
  if __name__ == "__main__":
191
  demo.launch()