import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image
# --- Data (hardcoded as requested) ---
# One row per cleansing scenario: the scenario code followed by its
# per-dimension scores and the overall data quality score.
_SCORE_COLUMNS = [
    "scenario",
    "consistency_score",
    "overall_representativity_score",
    "integrity_score",
    "data_quality_score",
]
_SCORE_ROWS = [
    ("0", 0.954451, 0.79486, 0.983921, 0.911077),
    ("A", 0.979592, 0.79486, 0.983921, 0.919457),
    ("B", 1.0, 0.75695, 0.983921, 0.913624),
]
data = pd.DataFrame(_SCORE_ROWS, columns=_SCORE_COLUMNS)

# Human-readable label for each scenario code found in data["scenario"].
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent+Low urgency cleansing",
}
# Pre-written Markdown report for scenario "A" (urgent cleansing only).
# get_quality_text() returns it verbatim when that scenario is selected.
# The scores quoted in the text are hard-coded here, not computed from `data`,
# so they have to be updated by hand if the values in `data` change.
QUALITY_TEXT = """
### Overall Data Quality Analysis
After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919
#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
---
### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
The following dimensions are evaluated for consistency:
- Completeness check: An answer was expected but not provided.
- Dist outlier check: A value outside the range of reasonable values.
- Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
- Model-based outlier: An inconsistent or extreme value compared to typical responses.
Question: 'enumerator_name' has 98 issues.
- The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.
Question: 'household_average_income_female_members' has 81 issues.
- The dimension(s) with the most issues: model based outlier with 41 issues.
- The second dimension with issues: completeness check with 40 issues.
Question: 'household_average_income' has 72 issues.
- The dimension(s) with the most issues: model based outlier with 39 issues.
- The second dimension with issues: completeness check with 33 issues.
Question: 'household_average_income_male_members' has 39 issues.
- The dimension with the most issues: completeness check with 39 issues.
Question: 'household_average_expenses_education' has 29 issues.
- The dimension(s) with the most issues: model based outlier with 23 issues.
- The second dimension with issues: completeness check with 6 issues.
Question: 'impact_contributions_other_factors' has 23 issues.
- The dimension with the most issues: completeness check with 23 issues.
Question: 'monthly_spend_on_healthcare' has 21 issues.
- The dimension with the most issues: completeness check with 21 issues.
For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.
---
### Integrity Action Suggestions
The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.
**Respondent with _index: 1**
The following checks scored below the maximum value:
Payment For Survey (score: 0/1)
Respondent Influenced (score: 0/1)
Response Time Integrity (score: 0.0/1)
Questions Which Were Difficult (score: 0.0/2)
Respondent Suspicious (score: 0/2)
Phone Number Check (score: 0.0/1)
Name Check (score: 0.0/1)
Location Check (score: 0/1)
The following checks are evaluated for integrity:
- **Payment for Survey:** Less integrity if the respondent was paid to do it.
- **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
- **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
- **Audio Verification:** More integrity if audio verification is in place.
- **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
- **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
- **Phone Number Check:** More integrity if a realistic phone number is provided.
- **Response Uniqueness:** More integrity if the response is truly unique.
- **Name Check:** More integrity if the name is realistic.
- **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
- **Enumerator Bias:** Less integrity if enumerator responses are biased.
- **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.
For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.
---
### Representativity Action Suggestions
Baseline (no cleansing) overall representativity score: 0.795
After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).
---
### Enumerator Action Suggestions
No enumerator bias has been found.
"""
def plot_dimension(dim_col, scenario):
    """Draw a "traffic light" gauge for one score of one scenario.

    The gauge is a single vertical axis running from 0 to 1 with three
    coloured bands (red below 0.6, amber from 0.6 to 0.8, green from 0.8 to
    1.0). The score is marked with a short black line and printed beside it.

    Args:
        dim_col: Name of a score column in the module-level ``data`` frame,
            e.g. ``"consistency_score"``.
        scenario: Scenario code as used in ``data["scenario"]`` and as a key
            of ``scenario_map`` (``"0"``, ``"A"`` or ``"B"``).

    Returns:
        The matplotlib ``Figure``. It is not closed here; the caller must
        close it once it has been rendered.

    Raises:
        KeyError: If ``scenario`` or ``dim_col`` is not present in ``data``.
    """
    value = data.set_index("scenario").loc[scenario, dim_col]

    fig, ax = plt.subplots(figsize=(4, 7))

    # The scale has to reach 1.0. It used to stop at 0.95 while the top tick
    # was labelled "1.0", which pushed every score above 0.95 off the axes.
    ax.axhspan(0, 0.6, color="#FF4D4F", alpha=0.30)
    ax.axhspan(0.6, 0.8, color="#FFE58F", alpha=0.30)
    ax.axhspan(0.8, 1.0, color="#52C41A", alpha=0.30)

    # clip_on=False keeps the marker fully drawn when the score is exactly 1.0
    # and therefore sits on the upper edge of the axes.
    ax.axhline(value, color='black', lw=2, xmin=0.35, xmax=0.65, clip_on=False)

    # Near the top of the scale the label goes below the marker so that it
    # stays inside the axes instead of running into the title.
    label_va = 'bottom' if value < 0.9 else 'top'
    ax.annotate(
        f"{value:.3f}",
        xy=(0.5, value),
        xycoords=('axes fraction', 'data'),
        ha='center', va=label_va,
        fontsize=22, weight='bold',
        color='black',
        bbox=dict(facecolor='white', edgecolor='none', alpha=0.8, boxstyle='round,pad=0.2')
    )

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1.0)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1.0])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=15)

    # Only the left spine is kept, as the gauge's scale.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['left'].set_linewidth(2)

    ax.set_title(
        f"{dim_col.replace('_', ' ').title()}\n({scenario_map[scenario]})",
        fontsize=15, weight='bold', pad=10
    )

    # Lay out this figure explicitly rather than pyplot's "current" figure.
    # A manual subplots_adjust() before this call would be overridden by it.
    fig.tight_layout()
    return fig
def show_plots(scenario):
    """Render the three score gauges for ``scenario`` as PIL images.

    Args:
        scenario: Scenario code, passed through to ``plot_dimension``.

    Returns:
        A list of three PIL images in the order consistency, overall
        representativity, integrity.
    """
    score_columns = (
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
    )
    # All figures are built first, then rasterised and closed one at a time.
    figures = [plot_dimension(column, scenario) for column in score_columns]

    def _to_png_image(figure):
        # Round-trip through an in-memory PNG, then release the figure.
        png = io.BytesIO()
        figure.savefig(png, format='png', bbox_inches='tight')
        png.seek(0)
        picture = Image.open(png)
        plt.close(figure)
        return picture

    return [_to_png_image(figure) for figure in figures]
# --- Simple table filter function ---
def filter_table(col, val, csv_path="issues_log.csv"):
    """Load the issues log and optionally keep rows matching a substring.

    Args:
        col: Name of the column to filter on. If empty, no filter is applied.
        val: Text to look for in ``col``; matched as a literal,
            case-insensitive substring. If empty, no filter is applied.
        csv_path: Path of the CSV file to read. Defaults to the
            ``issues_log.csv`` file used by the app.

    Returns:
        The full table when ``col`` or ``val`` is empty, otherwise the rows
        whose ``col`` value contains ``val``. If the file cannot be loaded or
        ``col`` is not a column of the table, a one-row frame with a single
        ``"error"`` column describing the problem is returned instead.
    """
    # The file is re-read on every call so edits to it show up immediately.
    try:
        df = pd.read_csv(csv_path)
    except (FileNotFoundError, pd.errors.EmptyDataError) as exc:
        # Report the problem in the table rather than failing the callback.
        return pd.DataFrame({"error": [f"Could not load '{csv_path}': {exc}"]})

    if not (col and val):
        return df
    if col not in df.columns:
        return pd.DataFrame({"error": [f"Column '{col}' not in table."]})

    # regex=False: the value is typed by the user, so characters such as "("
    # or "[" must be matched literally instead of being parsed as a pattern.
    mask = df[col].astype(str).str.contains(
        str(val), case=False, na=False, regex=False
    )
    return df[mask]
def get_quality_text(selected_scenario):
    """Return the Markdown for the analysis panel of the given scenario.

    Args:
        selected_scenario: Scenario code chosen in the dropdown.

    Returns:
        ``QUALITY_TEXT`` for scenario ``"A"``; for any other value, a short
        hint telling the user to select that scenario.
    """
    if selected_scenario != "A":
        return "Select scenario 'Urgent cleansing' to see the detailed data quality analysis."
    return QUALITY_TEXT
# --- Gradio UI ---
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation had been stripped; confirm which components belong in each Row.
with gr.Blocks() as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario selector; the value is the scenario code, the label is the
    # human-readable name.
    with gr.Row():
        scenario = gr.Dropdown(
            choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
            value="0",
            label="Select Scenario"
        )

    # One gauge image per score dimension.
    with gr.Row():
        out1 = gr.Image(label="Consistency Score Traffic Light")
        out2 = gr.Image(label="Overall Representativity Score Traffic Light")
        out3 = gr.Image(label="Integrity Score Traffic Light")
    scenario.change(show_plots, scenario, [out1, out2, out3])
    # Also draw the gauges once on page load. Without this the three images
    # stay empty until the user changes the dropdown.
    demo.load(show_plots, inputs=scenario, outputs=[out1, out2, out3])

    with gr.Row():
        gr.Markdown("### Overall Data Quality Analysis")
    analysis_text = gr.Markdown(value=get_quality_text("0"), visible=True)
    scenario.change(get_quality_text, scenario, analysis_text)

    with gr.Row():
        gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
    with gr.Row():
        filter_col = gr.Textbox(label="Column (optional)")
        filter_val = gr.Textbox(label="Value (optional)")
    table_out = gr.Dataframe(label="Filtered Table 1.2 (issues_log.csv)")
    # Re-filter whenever either textbox changes.
    filter_col.change(filter_table, [filter_col, filter_val], table_out)
    filter_val.change(filter_table, [filter_col, filter_val], table_out)
    # Show the unfiltered table on page load.
    demo.load(lambda: filter_table("", ""), outputs=table_out)

if __name__ == "__main__":
    demo.launch()
|