fortuala committed on
Commit
e83e816
·
verified ·
1 Parent(s): 730ab8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -149
app.py CHANGED
@@ -4,213 +4,197 @@ import matplotlib.pyplot as plt
4
  import io
5
  from PIL import Image
6
 
7
- # --- Data (hardcoded as requested) ---
 
 
8
  data = pd.DataFrame({
9
  "scenario": ["0", "A", "B"],
10
- "consistency_score": [0.954451, 0.979592, 1.0],
11
  "overall_representativity_score": [0.79486, 0.79486, 0.75695],
12
  "integrity_score": [0.983921, 0.983921, 0.983921],
13
  "data_quality_score": [0.911077, 0.919457, 0.913624]
14
- })
15
 
16
- scenario_map = {"0": "No cleansing", "A": "Urgent cleansing", "B": "Urgent+Low urgency cleansing"}
 
 
17
 
18
- QUALITY_TEXT = """
 
 
 
19
  ### Overall Data Quality Analysis
20
 
21
  After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
22
 
23
- - **Consistency Score** : 0.980
24
- - **Overall Representativity Score** : 0.795
25
- - **Integrity Score** : 0.984
26
- - **Overall Data Quality Score** : 0.919
27
 
28
- #### Summary
29
  The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
30
 
31
  ---
32
 
33
  ### Consistency Action Suggestions
34
-
35
  *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
36
 
37
- The following dimensions are evaluated for consistency:
38
- - Completeness check: An answer was expected but not provided.
39
- - Dist outlier check: A value outside the range of reasonable values.
40
- - Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
41
- - Model-based outlier: An inconsistent or extreme value compared to typical responses.
42
-
43
- Question: 'enumerator_name' has 98 issues.
44
- - The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.
45
-
46
- Question: 'household_average_income_female_members' has 81 issues.
47
- - The dimension(s) with the most issues: model based outlier with 41 issues.
48
- - The second dimension with issues: completeness check with 40 issues.
49
-
50
- Question: 'household_average_income' has 72 issues.
51
- - The dimension(s) with the most issues: model based outlier with 39 issues.
52
- - The second dimension with issues: completeness check with 33 issues.
53
-
54
- Question: 'household_average_income_male_members' has 39 issues.
55
- - The dimension with the most issues: completeness check with 39 issues.
56
-
57
- Question: 'household_average_expenses_education' has 29 issues.
58
- - The dimension(s) with the most issues: model based outlier with 23 issues.
59
- - The second dimension with issues: completeness check with 6 issues.
60
-
61
- Question: 'impact_contributions_other_factors' has 23 issues.
62
- - The dimension with the most issues: completeness check with 23 issues.
63
 
64
- Question: 'monthly_spend_on_healthcare' has 21 issues.
65
- - The dimension with the most issues: completeness check with 21 issues.
 
 
 
 
 
 
66
 
67
- For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.
68
 
69
  ---
70
 
71
  ### Integrity Action Suggestions
72
- The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.
73
-
74
- **Respondent with _index: 1**
75
-
76
- The following checks scored below the maximum value:
77
- Payment For Survey (score: 0/1)
78
- Respondent Influenced (score: 0/1)
79
- Response Time Integrity (score: 0.0/1)
80
- Questions Which Were Difficult (score: 0.0/2)
81
- Respondent Suspicious (score: 0/2)
82
- Phone Number Check (score: 0.0/1)
83
- Name Check (score: 0.0/1)
84
- Location Check (score: 0/1)
85
-
86
- The following checks are evaluated for integrity:
87
- - **Payment for Survey:** Less integrity if the respondent was paid to do it.
88
- - **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
89
- - **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
90
- - **Audio Verification:** More integrity if audio verification is in place.
91
- - **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
92
- - **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
93
- - **Phone Number Check:** More integrity if a realistic phone number is provided.
94
- - **Response Uniqueness:** More integrity if the response is truly unique.
95
- - **Name Check:** More integrity if the name is realistic.
96
- - **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
97
- - **Enumerator Bias:** Less integrity if enumerator responses are biased.
98
- - **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.
99
-
100
- For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.
101
 
102
  ---
103
 
104
  ### Representativity Action Suggestions
105
-
106
- Baseline (no cleansing) overall representativity score: 0.795
107
- After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
108
- After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).
 
109
 
110
  ---
111
 
112
  ### Enumerator Action Suggestions
113
- No enumerator bias has been found.
114
  """
115
 
116
def plot_dimension(dim_col, scenario):
    """Draw a traffic-light gauge for one score of one scenario.

    The gauge is a vertical 0-1 axis with red (0-0.6), yellow (0.6-0.8) and
    green (0.8-1.0) bands, a short black marker at the score, and the score
    printed next to the marker with three decimals.

    Args:
        dim_col: Name of a score column in the module-level ``data`` frame,
            e.g. ``"consistency_score"``.
        scenario: Scenario key; looked up in ``data["scenario"]`` and in
            ``scenario_map`` (for the title).

    Returns:
        The matplotlib figure holding the gauge. The caller must close it.

    Raises:
        KeyError: If ``scenario`` or ``dim_col`` cannot be found.
    """
    value = data.set_index("scenario").loc[scenario, dim_col]
    fig, ax = plt.subplots(figsize=(4, 7))

    # Bands and limits use the real 0-1 scale. The axis used to stop at 0.95
    # with that tick relabelled "1.0", which pushed every score above 0.95
    # outside the axes and hid its marker.
    ax.axhspan(0, 0.6, color="#FF4D4F", alpha=0.30)
    ax.axhspan(0.6, 0.8, color="#FFE58F", alpha=0.30)
    ax.axhspan(0.8, 1.0, color="#52C41A", alpha=0.30)

    # clip_on=False keeps the full line width visible for a score of exactly 1.0.
    ax.axhline(value, color='black', lw=2, xmin=0.35, xmax=0.65, clip_on=False)
    ax.annotate(
        f"{value:.3f}",
        xy=(0.5, value),
        xycoords=('axes fraction', 'data'),
        ha='center',
        # Hang the label below the marker near the top so it stays inside
        # the axes instead of running into the title.
        va='top' if value > 0.9 else 'bottom',
        fontsize=22, weight='bold',
        color='black',
        bbox=dict(facecolor='white', edgecolor='none', alpha=0.8, boxstyle='round,pad=0.2')
    )

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1.0])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=15)

    # Only the left spine is kept, as the gauge's scale bar.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['left'].set_linewidth(2)

    ax.set_title(
        f"{dim_col.replace('_', ' ').title()}\n({scenario_map[scenario]})",
        fontsize=15, weight='bold', pad=10
    )
    # Lay out this figure explicitly rather than pyplot's "current" one.
    fig.tight_layout()
    return fig
148
 
149
def show_plots(scenario):
    """Render the three score gauges for ``scenario`` as PIL images.

    Args:
        scenario: Scenario key forwarded to ``plot_dimension``.

    Returns:
        A list of three PIL images, in the order consistency,
        overall representativity, integrity.
    """
    score_columns = (
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
    )
    # All figures are created before any of them is rendered.
    figures = [plot_dimension(column, scenario) for column in score_columns]

    def _to_image(figure):
        # Serialise to an in-memory PNG, reopen it with PIL, then free the figure.
        png = io.BytesIO()
        figure.savefig(png, format='png', bbox_inches='tight')
        png.seek(0)
        picture = Image.open(png)
        plt.close(figure)
        return picture

    return [_to_image(figure) for figure in figures]
 
 
 
 
162
 
163
# --- Simple table filter function ---
def filter_table(col, val, csv_path="issues_log.csv"):
    """Load the issues CSV and optionally keep rows whose ``col`` contains ``val``.

    Args:
        col: Column name to filter on. Empty or ``None`` disables filtering.
        val: Text to search for, matched case-insensitively as a literal
            substring. Empty or ``None`` disables filtering.
        csv_path: CSV file to read. Defaults to ``"issues_log.csv"``.

    Returns:
        The full table when no filter is given, the matching rows when the
        column exists, or a one-row frame with an ``error`` column when the
        requested column is not in the table.
    """
    df = pd.read_csv(csv_path)
    if not (col and val):
        return df
    if col not in df.columns:
        return pd.DataFrame({"error": [f"Column '{col}' not in table."]})
    # regex=False: the value comes straight from a textbox, so characters such
    # as "(" or "." must be matched literally instead of being compiled as a
    # regular expression (which can raise or over-match).
    mask = df[col].astype(str).str.contains(str(val), case=False, na=False, regex=False)
    return df[mask]
173
 
174
def get_quality_text(selected_scenario):
    """Return the Markdown for the analysis panel.

    Args:
        selected_scenario: Scenario key chosen in the dropdown.

    Returns:
        ``QUALITY_TEXT`` for scenario ``"A"``; for any other value, a hint
        telling the user to select the 'Urgent cleansing' scenario.
    """
    if selected_scenario == "A":
        return QUALITY_TEXT
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    return "Select scenario 'Urgent cleansing' to see the detailed data quality analysis."
179
-
180
- # --- Gradio UI ---
181
- with gr.Blocks() as demo:
182
  gr.Markdown("## Data Quality Scenario Explorer")
183
 
184
- with gr.Row():
185
- scenario = gr.Dropdown(
186
- choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
187
- value="0",
188
- label="Select Scenario"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
  )
 
190
 
191
- with gr.Row():
192
- out1 = gr.Image(label="Consistency Score Traffic Light")
193
- out2 = gr.Image(label="Overall Representativity Score Traffic Light")
194
- out3 = gr.Image(label="Integrity Score Traffic Light")
195
- scenario.change(show_plots, scenario, [out1, out2, out3])
196
 
197
  with gr.Row():
198
- gr.Markdown("### Overall Data Quality Analysis")
199
- analysis_text = gr.Markdown(value=get_quality_text("0"), visible=True)
200
- scenario.change(get_quality_text, scenario, analysis_text)
201
 
202
- with gr.Row():
203
- gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
204
- with gr.Row():
205
- filter_col = gr.Textbox(label="Column (optional)")
206
- filter_val = gr.Textbox(label="Value (optional)")
207
- table_out = gr.Dataframe(label="Filtered Table 1.2 (issues_log.csv)")
208
 
209
- filter_col.change(filter_table, [filter_col, filter_val], table_out)
210
- filter_val.change(filter_table, [filter_col, filter_val], table_out)
211
  demo.load(lambda: filter_table("", ""), outputs=table_out)
212
 
213
  if __name__ == "__main__":
214
  demo.launch()
215
-
216
-
 
4
  import io
5
  from PIL import Image
6
 
7
+ # ───────────────────────────────────────────
8
+ # 1. Static data for the three scenarios
9
+ # ───────────────────────────────────────────
10
  data = pd.DataFrame({
11
  "scenario": ["0", "A", "B"],
12
+ "consistency_score": [0.954451, 0.979592, 1.000000],
13
  "overall_representativity_score": [0.79486, 0.79486, 0.75695],
14
  "integrity_score": [0.983921, 0.983921, 0.983921],
15
  "data_quality_score": [0.911077, 0.919457, 0.913624]
16
+ }).set_index("scenario")
17
 
18
+ scenario_map = {"0": "No cleansing",
19
+ "A": "Urgent cleansing",
20
+ "B": "Urgent + Low-urgency cleansing"}
21
 
22
+ # ───────────────────────────────────────────
23
+ # 2. Full analysis text (shown for Scenario A)
24
+ # ───────────────────────────────────────────
25
+ QUALITY_TEXT = """\
26
  ### Overall Data Quality Analysis
27
 
28
  After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
29
 
30
+ - **Consistency Score** : 0.980
31
+ - **Overall Representativity Score** : 0.795
32
+ - **Integrity Score** : 0.984
33
+ - **Overall Data Quality Score** : 0.919
34
 
35
+ #### Summary
36
  The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
37
 
38
  ---
39
 
40
  ### Consistency Action Suggestions
 
41
  *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
42
 
43
+ The following dimensions are evaluated for consistency:
44
+ - Completeness check (missing answers)
45
+ - Dist-outlier check (extreme values)
46
+ - Free-text check (short answers)
47
+ - Model-based outlier (inconsistent values)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
+ **Key questions with many issues:**
50
+ - `enumerator_name` — 98 issues (mainly free-text)
51
+ - `household_average_income_female_members` — 81 issues (outliers & completeness)
52
+ - `household_average_income` — 72 issues (outliers & completeness)
53
+ - `household_average_income_male_members` — 39 issues (completeness)
54
+ - `household_average_expenses_education` — 29 issues (outliers & completeness)
55
+ - `impact_contributions_other_factors` — 23 issues (completeness)
56
+ - `monthly_spend_on_healthcare` — 21 issues (completeness)
57
 
58
+ For a detailed view of consistency issues, see the **Data Consistency Issues Deep Dive** tab.
59
 
60
  ---
61
 
62
  ### Integrity Action Suggestions
63
+ Respondent `_index: 1` shows low integrity scores:
64
+
65
+ | Check | Score |
66
+ |-------|-------|
67
+ | Payment for Survey | 0/1 |
68
+ | Respondent Influenced | 0/1 |
69
+ | Response Time Integrity | 0.0/1 |
70
+ | Questions Were Difficult | 0.0/2 |
71
+ | Respondent Suspicious | 0/2 |
72
+ | Phone Number Check | 0.0/1 |
73
+ | Name Check | 0.0/1 |
74
+ | Location Check | 0/1 |
75
+
76
+ For definitions and more respondents, open the **Integrity Issues Deep Dive** tab.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  ---
79
 
80
  ### Representativity Action Suggestions
81
+ | Scenario | Score | Δ vs. Baseline |
82
+ |----------|-------|----------------|
83
+ | Baseline (0) | 0.795 | — |
84
+ | Urgent cleansing (A) | 0.795 | ±0.000 |
85
+ | +Low-urgency cleansing (B) | 0.757 | −0.038 |
86
 
87
  ---
88
 
89
  ### Enumerator Action Suggestions
90
+ No enumerator bias detected.
91
  """
92
 
93
+ # ───────────────────────────────────────────
94
+ # 3. Traffic-light plot helper
95
+ # ───────────────────────────────────────────
96
def make_plot(dim, scen):
    """Draw a traffic-light gauge for one score of one scenario.

    The gauge is a vertical 0-1 axis with red (0-0.6), yellow (0.6-0.8) and
    green (0.8-1.0) bands, a short black marker at the score, and the score
    printed next to the marker with three decimals.

    Args:
        dim: Name of a score column in the module-level ``data`` frame,
            e.g. ``"consistency_score"``.
        scen: Scenario key; used as the row label in ``data`` and as the key
            into ``scenario_map`` (for the title).

    Returns:
        The matplotlib figure holding the gauge. The caller must close it.

    Raises:
        KeyError: If ``scen`` or ``dim`` cannot be found.
    """
    val = data.loc[scen, dim]
    fig, ax = plt.subplots(figsize=(4, 7))

    # Coloured bands on the real 0-1 scale. The axis used to stop at 0.95
    # with that tick relabelled "1.0", which pushed every score above 0.95
    # outside the axes and hid its marker.
    ax.axhspan(0, 0.60, color="#FF4D4F", alpha=0.3)     # red
    ax.axhspan(0.60, 0.80, color="#FFE58F", alpha=0.3)  # yellow
    ax.axhspan(0.80, 1.00, color="#52C41A", alpha=0.3)  # green

    # Black marker; clip_on=False keeps the full line width visible for a
    # score of exactly 1.0.
    ax.axhline(val, color="black", lw=2, xmin=0.35, xmax=0.65, clip_on=False)
    ax.annotate(f"{val:.3f}", (0.5, val),
                xycoords=("axes fraction", "data"),
                ha="center",
                # Hang the label below the marker near the top so it stays
                # inside the axes instead of running into the title.
                va="top" if val > 0.9 else "bottom",
                fontsize=22, weight="bold",
                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.85))

    # Cosmetics: no x axis, and only the left spine kept as the scale bar.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1.0])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=14)
    for s in ax.spines.values():
        s.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)

    ax.set_title(f"{dim.replace('_', ' ').title()}\n({scenario_map[scen]})",
                 fontsize=15, weight="bold", pad=8)
    # Lay out this figure explicitly rather than pyplot's "current" one.
    fig.tight_layout()
    return fig
128
 
129
def get_plots(scen):
    """Render the three score gauges for ``scen`` as PIL images.

    Args:
        scen: Scenario key forwarded to ``make_plot``.

    Returns:
        A list of three PIL images, in the order consistency,
        overall representativity, integrity.
    """
    imgs = []
    for dim in ("consistency_score",
                "overall_representativity_score",
                "integrity_score"):
        fig = make_plot(dim, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close this exact figure, even if rendering fails. A bare
            # plt.close() targets pyplot's "current" figure, which is not
            # guaranteed to be this one when callbacks run concurrently.
            plt.close(fig)
        buf.seek(0)
        imgs.append(Image.open(buf))
    return imgs
140
+
141
+ # ───────────────────────────────────────────
142
+ # 4. CSV-table filter helper
143
+ # ───────────────────────────────────────────
144
CSV_FILE = "table_1_2.csv"  # change if your file has a different name


def filter_table(col, val, csv_path=None):
    """Load the issues CSV and optionally keep rows whose ``col`` contains ``val``.

    Args:
        col: Column name to filter on. Empty or ``None`` disables filtering.
        val: Text to search for, matched case-insensitively as a literal
            substring. Empty or ``None`` disables filtering.
        csv_path: CSV file to read. Falls back to ``CSV_FILE`` when omitted.

    Returns:
        The matching rows when both ``col`` and ``val`` are given and ``col``
        is a column of the table; otherwise the full, unfiltered table.
    """
    df = pd.read_csv(csv_path or CSV_FILE)
    if col and val and col in df.columns:
        # regex=False: the value comes straight from a textbox, so characters
        # such as "(" or "." must be matched literally instead of being
        # compiled as a regular expression (which can raise or over-match).
        hits = df[col].astype(str).str.contains(
            str(val), case=False, na=False, regex=False
        )
        return df[hits]
    return df
151
 
152
+ # ───────────────────────────────────────────
153
+ # 5. Gradio UI
154
+ # ───────────────────────────────────────────
155
+ with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
 
 
 
 
156
  gr.Markdown("## Data Quality Scenario Explorer")
157
 
158
+ # Scenario selector
159
+ scenario = gr.Dropdown(
160
+ label="Select Scenario",
161
+ choices=[("No cleansing", "0"),
162
+ ("Urgent cleansing", "A"),
163
+ ("Urgent + Low-urgency cleansing", "B")],
164
+ value="0",
165
+ )
166
+
167
+ # Three traffic-light plots
168
+ img1 = gr.Image(label="Consistency")
169
+ img2 = gr.Image(label="Representativity")
170
+ img3 = gr.Image(label="Integrity")
171
+ scenario.change(get_plots, scenario, [img1, img2, img3])
172
+ demo.load(lambda: get_plots("0"), outputs=[img1, img2, img3])
173
+
174
+ # Button → show full analysis text
175
+ show_btn = gr.Button("Show Overall Data Quality Analysis")
176
+ analysis_md = gr.Markdown(visible=False)
177
+
178
+ def show_analysis(scen):
179
+ return gr.update(
180
+ value=QUALITY_TEXT if scen == "A" else
181
+ "Please select **Urgent cleansing (Scenario A)** to view the detailed analysis.",
182
+ visible=True
183
  )
184
+ show_btn.click(show_analysis, scenario, analysis_md)
185
 
186
+ # ───────── table section ─────────
187
+ gr.Markdown("### Data Consistency Issues – Table 1.2")
 
 
 
188
 
189
  with gr.Row():
190
+ col_in = gr.Textbox(label="Column (optional)", placeholder="e.g. question")
191
+ val_in = gr.Textbox(label="Value (optional)", placeholder="e.g. income")
192
+ apply_btn = gr.Button("Apply / Refresh")
193
 
194
+ table_out = gr.Dataframe(label="Filtered table_1_2.csv")
 
 
 
 
 
195
 
196
+ apply_btn.click(filter_table, [col_in, val_in], table_out)
 
197
  demo.load(lambda: filter_table("", ""), outputs=table_out)
198
 
199
  if __name__ == "__main__":
200
  demo.launch()