fortuala committed on
Commit
f71a2f6
·
verified ·
1 Parent(s): 31279dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +195 -80
app.py CHANGED
@@ -1,87 +1,202 @@
1
  import gradio as gr
2
  import pandas as pd
3
- import os
4
-
5
- import functions as f
6
-
7
- # Paths to the permanent files
8
- survey_path = 'Copy of AGT.MHVL.0A.202505.0001 4.xlsx'
9
- indicator_path = 'Indicators_indicators_Default view 18.xlsx'
10
- questions_path = 'Indicators_questions_Default View 18.xlsx'
11
- choice_path = 'Indicators_choices_Default View 17.xlsx'
12
- parameters_path = 'Indicators_surveys_Survey validation 1.xlsx'
13
- uuid = 'AGT.MHVL.0A.202505.0001'
14
-
15
- survey = pd.read_excel(survey_path)
16
- indicators = pd.read_excel(indicator_path)
17
- choices = pd.read_excel(choice_path)
18
- questions = pd.read_excel(questions_path)
19
-
20
- def run_validation():
21
-
22
- # Pass all inputs to your function (update name/args as needed)
23
- # parameters file
24
- indicator_df, questions_df, choice_df, data_all, raw_data, column_strategy_df = f.load_dataframes(
25
- indicator_path,
26
- questions_path,
27
- choice_path,
28
- survey_path)
29
-
30
- # consistency
31
- table_1_1, table_1_2, table_1_3 = f.consistency_score_report(
32
- raw_data=raw_data,
33
- indicator_df=indicator_df,
34
- questions_df=questions_df,
35
- column_strategy_df=column_strategy_df,
36
- data_all=data_all,
37
- theme_list=theme_list
38
- )
39
- # integrity
40
- table_2_1, table_2_2, table_2_3,table_2_4,table_2_5 = f.integrity_report(raw_data, questions_df, column_strategy_df, survey_type,table_1_2)
41
-
42
- # representativity
43
- if segmentation == 'yes':
44
- table_3_1, table_3_2, table_3_3, table_3_4 = f.representativity_report(segmentation, raw_data, table_2_4, segmentation_columns, mapping_segmentation_quotas,
45
- table_2_3, N, table_1_3)
46
- else:
47
- table_3_3, table_3_4 = f.representativity_report(segmentation, raw_data, table_2_4, segmentation_columns, mapping_segmentation_quotas,
48
- table_2_3, N, table_1_3)
49
-
50
- # enumerator bias
51
- if 'enumerator_name' in raw_data.columns:
52
- table_4_1, table_4_2 = f.enumerator_urgent_issues_report(raw_data, table_2_5)
53
- else:
54
- table_4_1 = []
55
- table_4_2 = []
56
-
57
- report = f.generate_data_quality_report(
58
- segmentation='no',
59
- table_1_1=table_1_1,
60
- table_2_1=table_2_1,
61
- table_2_3=table_2_3,
62
- table_3_1=None,
63
- table_3_2=None,
64
- table_3_3=table_3_3,
65
- table_3_4=table_3_4, # Replace with actual data
66
- table_4_1=table_4_1 # Replace with actual data
67
- )
68
-
69
- print(report)
70
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- with gr.Blocks() as app:
73
- gr.Markdown("## Survey Validation App")
74
 
75
- survey_file = gr.File(label="Upload your survey (Excel or CSV)")
76
- uuid_box = gr.Textbox(label="UUID", value="AGT.MHVL.0A.202505.0001")
77
- run_btn = gr.Button("Run Validation")
78
- output = gr.Dataframe(label="Validation Output")
79
 
80
- run_btn.click(
81
- run_validation,
82
- inputs=[survey_file, uuid_box],
83
- outputs=[]
84
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  if __name__ == "__main__":
87
- app.launch()
 
 
1
  import gradio as gr
2
  import pandas as pd
3
+ import matplotlib.pyplot as plt
4
+ import io
5
+
6
+ # --- Data (hardcoded as requested) ---
7
+ data = pd.DataFrame({
8
+ "scenario": ["0", "A", "B"],
9
+ "consistency_score": [0.954451, 0.979592, 1.0],
10
+ "overall_representativity_score": [0.79486, 0.79486, 0.75695],
11
+ "integrity_score": [0.983921, 0.983921, 0.983921],
12
+ "data_quality_score": [0.911077, 0.919457, 0.913624]
13
+ })
14
+
15
+ scenario_map = {"0": "No cleansing", "A": "Urgent cleansing", "B": "Urgent+Low urgency cleansing"}
16
+
17
+ # --- Traffic light plotting utility ---
18
+ def plot_dimension(dim_col, scenario):
19
+ value = data.set_index("scenario").loc[scenario, dim_col]
20
+ fig, ax = plt.subplots(figsize=(5, 1.4))
21
+ # Traffic light backgrounds
22
+ ax.axhspan(0, 0.6, color="#FF4D4F", alpha=0.4, label='Red (0-0.6)')
23
+ ax.axhspan(0.6, 0.8, color="#FFE58F", alpha=0.4, label='Yellow (0.6-0.8)')
24
+ ax.axhspan(0.8, 1, color="#52C41A", alpha=0.4, label='Green (0.8-1.0)')
25
+ # Value marker
26
+ ax.axhline(value, color='black', lw=4, xmin=0.2, xmax=0.8)
27
+ ax.text(0.5, value, f"{value:.3f}", ha='center', va='bottom', fontsize=13, color='black', weight='bold')
28
+ # Aesthetics
29
+ ax.set_xlim(0, 1)
30
+ ax.set_ylim(0, 1)
31
+ ax.set_xticks([])
32
+ ax.set_yticks([0, 0.6, 0.8, 1])
33
+ ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"])
34
+ ax.set_title(f"{dim_col.replace('_', ' ').title()} ({scenario_map[scenario]})")
35
+ ax.spines[['right','top','bottom','left']].set_visible(False)
36
+ return fig
37
+
38
+ # --- Scenario selector callback ---
39
+ def show_plots(scenario):
40
+ fig1 = plot_dimension("consistency_score", scenario)
41
+ fig2 = plot_dimension("overall_representativity_score", scenario)
42
+ fig3 = plot_dimension("integrity_score", scenario)
43
+ # Convert figs to images for Gradio
44
+ img_list = []
45
+ for fig in [fig1, fig2, fig3]:
46
+ buf = io.BytesIO()
47
+ fig.savefig(buf, format='png', bbox_inches='tight')
48
+ buf.seek(0)
49
+ img_list.append(buf.read())
50
+ plt.close(fig)
51
+ return img_list
52
+
53
+ # --- Button for quality text (scenario A only) ---
54
+ QUALITY_TEXT = """
55
+ ### Overall Data Quality Analysis
56
+
57
+ After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
58
+
59
+ - **Consistency Score** : 0.980
60
+ - **Overall Representativity Score** : 0.795
61
+ - **Integrity Score** : 0.984
62
+ - **Overall Data Quality Score** : 0.919
63
+
64
+ #### Summary
65
+ The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
66
+
67
+ ---
68
+
69
+ ### Consistency Action Suggestions
70
+
71
+ *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
72
+
73
+ The following dimensions are evaluated for consistency:
74
+ - Completeness check: An answer was expected but not provided.
75
+ - Dist outlier check: A value outside the range of reasonable values.
76
+ - Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
77
+ - Model-based outlier: An inconsistent or extreme value compared to typical responses.
78
+
79
+ Question: 'enumerator_name' has 98 issues.
80
+ - The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.
81
+
82
+ Question: 'household_average_income_female_members' has 81 issues.
83
+ - The dimension(s) with the most issues: model based outlier with 41 issues.
84
+ - The second dimension with issues: completeness check with 40 issues.
85
+
86
+ Question: 'household_average_income' has 72 issues.
87
+ - The dimension(s) with the most issues: model based outlier with 39 issues.
88
+ - The second dimension with issues: completeness check with 33 issues.
89
+
90
+ Question: 'household_average_income_male_members' has 39 issues.
91
+ - The dimension with the most issues: completeness check with 39 issues.
92
+
93
+ Question: 'household_average_expenses_education' has 29 issues.
94
+ - The dimension(s) with the most issues: model based outlier with 23 issues.
95
+ - The second dimension with issues: completeness check with 6 issues.
96
+
97
+ Question: 'impact_contributions_other_factors' has 23 issues.
98
+ - The dimension with the most issues: completeness check with 23 issues.
99
 
100
+ Question: 'monthly_spend_on_healthcare' has 21 issues.
101
+ - The dimension with the most issues: completeness check with 21 issues.
102
 
103
+ For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.
 
 
 
104
 
105
+ ---
106
+
107
+ ### Integrity Action Suggestions
108
+ The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.
109
+
110
+ **Respondent with _index: 1**
111
+
112
+ The following checks scored below the maximum value:
113
+ Payment For Survey (score: 0/1)
114
+ Respondent Influenced (score: 0/1)
115
+ Response Time Integrity (score: 0.0/1)
116
+ Questions Which Were Difficult (score: 0.0/2)
117
+ Respondent Suspicious (score: 0/2)
118
+ Phone Number Check (score: 0.0/1)
119
+ Name Check (score: 0.0/1)
120
+ Location Check (score: 0/1)
121
+
122
+ The following checks are evaluated for integrity:
123
+ - **Payment for Survey:** Less integrity if the respondent was paid to do it.
124
+ - **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
125
+ - **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
126
+ - **Audio Verification:** More integrity if audio verification is in place.
127
+ - **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
128
+ - **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
129
+ - **Phone Number Check:** More integrity if a realistic phone number is provided.
130
+ - **Response Uniqueness:** More integrity if the response is truly unique.
131
+ - **Name Check:** More integrity if the name is realistic.
132
+ - **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
133
+ - **Enumerator Bias:** Less integrity if enumerator responses are biased.
134
+ - **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.
135
+
136
+ For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.
137
+
138
+ ---
139
+
140
+ ### Representativity Action Suggestions
141
+
142
+ Baseline (no cleansing) overall representativity score: 0.795
143
+ After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
144
+ After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).
145
+
146
+ ---
147
+
148
+ ### Enumerator Action Suggestions
149
+ No enumerator bias has been found.
150
+ """
151
+
152
+ # --- Table 1.2 loader & filter ---
153
+ def load_and_filter_table(col=None, val=None):
154
+ df = pd.read_csv("table_1_2.csv") # Your table_1_2 file
155
+ if col and val:
156
+ df = df[df[col].astype(str).str.contains(str(val), case=False, na=False)]
157
+ return df
158
+
159
+ # --- Gradio UI ---
160
+ with gr.Blocks() as demo:
161
+ gr.Markdown("## Data Quality Scenario Explorer")
162
+
163
+ with gr.Row():
164
+ scenario = gr.Dropdown(
165
+ choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
166
+ value="0",
167
+ label="Select Scenario"
168
+ )
169
+
170
+ with gr.Row():
171
+ out1 = gr.Image(label="Consistency Score Traffic Light")
172
+ out2 = gr.Image(label="Overall Representativity Score Traffic Light")
173
+ out3 = gr.Image(label="Integrity Score Traffic Light")
174
+ scenario.change(show_plots, scenario, [out1, out2, out3])
175
+
176
+ # Button for analysis (scenario A)
177
+ with gr.Row():
178
+ analysis_btn = gr.Button("Show Overall Data Quality Analysis (Scenario A Only)")
179
+ analysis_text = gr.Markdown(visible=False)
180
+ def show_analysis(selected_scenario):
181
+ if selected_scenario == "A":
182
+ return gr.update(value=QUALITY_TEXT, visible=True)
183
+ else:
184
+ return gr.update(value="Select scenario 'A' (Urgent cleansing) to view the analysis.", visible=True)
185
+ analysis_btn.click(show_analysis, scenario, analysis_text)
186
+
187
+ # Table with filter
188
+ with gr.Row():
189
+ gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
190
+ with gr.Row():
191
+ filter_col = gr.Textbox(label="Column to Filter (optional)", value="")
192
+ filter_val = gr.Textbox(label="Value to Filter (optional)", value="")
193
+ table_out = gr.Dataframe(label="table_1_2.csv Filtered Results")
194
+ filter_col.change(lambda col, val: load_and_filter_table(col, val), [filter_col, filter_val], table_out)
195
+ filter_val.change(lambda col, val: load_and_filter_table(col, val), [filter_col, filter_val], table_out)
196
+
197
+ # Default show plots for initial scenario
198
+ demo.load(lambda: show_plots("0"), outputs=[out1, out2, out3])
199
 
200
  if __name__ == "__main__":
201
+ demo.launch()
202
+