| | import gradio as gr |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| | import io |
| | from PIL import Image |
| |
|
| | |
# Pre-computed quality-dimension scores for each cleansing scenario.
data = pd.DataFrame.from_records(
    [
        ("0", 0.954451, 0.79486, 0.983921, 0.911077),
        ("A", 0.979592, 0.79486, 0.983921, 0.919457),
        ("B", 1.0, 0.75695, 0.983921, 0.913624),
    ],
    columns=[
        "scenario",
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
        "data_quality_score",
    ],
)
| |
|
| | scenario_map = {"0": "No cleansing", "A": "Urgent cleansing", "B": "Urgent+Low urgency cleansing"} |
| |
|
# Static markdown report rendered by get_quality_text() when scenario "A"
# (urgent cleansing only) is selected. The figures are pre-computed and must
# stay in sync with the `data` DataFrame above.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919

#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions

*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:
- Completeness check: An answer was expected but not provided.
- Dist outlier check: A value outside the range of reasonable values.
- Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
- Model-based outlier: An inconsistent or extreme value compared to typical responses.

Question: 'enumerator_name' has 98 issues.
- The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.

Question: 'household_average_income_female_members' has 81 issues.
- The dimension(s) with the most issues: model based outlier with 41 issues.
- The second dimension with issues: completeness check with 40 issues.

Question: 'household_average_income' has 72 issues.
- The dimension(s) with the most issues: model based outlier with 39 issues.
- The second dimension with issues: completeness check with 33 issues.

Question: 'household_average_income_male_members' has 39 issues.
- The dimension with the most issues: completeness check with 39 issues.

Question: 'household_average_expenses_education' has 29 issues.
- The dimension(s) with the most issues: model based outlier with 23 issues.
- The second dimension with issues: completeness check with 6 issues.

Question: 'impact_contributions_other_factors' has 23 issues.
- The dimension with the most issues: completeness check with 23 issues.

Question: 'monthly_spend_on_healthcare' has 21 issues.
- The dimension with the most issues: completeness check with 21 issues.

For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.

---

### Integrity Action Suggestions
The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.

**Respondent with _index: 1**

The following checks scored below the maximum value:
Payment For Survey (score: 0/1)
Respondent Influenced (score: 0/1)
Response Time Integrity (score: 0.0/1)
Questions Which Were Difficult (score: 0.0/2)
Respondent Suspicious (score: 0/2)
Phone Number Check (score: 0.0/1)
Name Check (score: 0.0/1)
Location Check (score: 0/1)

The following checks are evaluated for integrity:
- **Payment for Survey:** Less integrity if the respondent was paid to do it.
- **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
- **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
- **Audio Verification:** More integrity if audio verification is in place.
- **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
- **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
- **Phone Number Check:** More integrity if a realistic phone number is provided.
- **Response Uniqueness:** More integrity if the response is truly unique.
- **Name Check:** More integrity if the name is realistic.
- **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
- **Enumerator Bias:** Less integrity if enumerator responses are biased.
- **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.

For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.

---

### Representativity Action Suggestions

Baseline (no cleansing) overall representativity score: 0.795
After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).

---

### Enumerator Action Suggestions
No enumerator bias has been found.
"""
| |
|
def plot_dimension(dim_col, scenario):
    """Render a vertical traffic-light gauge for one quality dimension.

    Looks up the score for *dim_col* ("consistency_score", etc.) and
    *scenario* ("0"/"A"/"B") in the module-level `data` frame and draws it
    as a horizontal marker over red/amber/green quality bands.

    Returns the matplotlib Figure (caller is responsible for closing it).
    """
    value = data.set_index("scenario").loc[scenario, dim_col]
    fig, ax = plt.subplots(figsize=(4, 7))
    # Traffic-light bands: red < 0.6, amber 0.6-0.8, green >= 0.8.
    # Bug fix: the green band previously stopped at 0.95, leaving a blank
    # strip at the top of the gauge.
    ax.axhspan(0, 0.6, color="#FF4D4F", alpha=0.30)
    ax.axhspan(0.6, 0.8, color="#FFE58F", alpha=0.30)
    ax.axhspan(0.8, 1.0, color="#52C41A", alpha=0.30)
    # Short horizontal marker at the score's height.
    ax.axhline(value, color='black', lw=2, xmin=0.35, xmax=0.65)
    ax.annotate(
        f"{value:.3f}",
        xy=(0.5, value),
        xycoords=('axes fraction', 'data'),
        ha='center', va='bottom',
        fontsize=22, weight='bold',
        color='black',
        bbox=dict(facecolor='white', edgecolor='none', alpha=0.8, boxstyle='round,pad=0.2')
    )
    ax.set_xlim(0, 1)
    # Bug fix: the axis previously ended at 0.95 (with the top tick falsely
    # labelled "1.0"), which clipped perfect scores — e.g. scenario B's
    # consistency_score of 1.0 was invisible.
    ax.set_ylim(0, 1.0)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1.0])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=15)
    # Keep only the left spine as the gauge's axis line.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['left'].set_linewidth(2)
    ax.set_title(
        f"{dim_col.replace('_', ' ').title()}\n({scenario_map[scenario]})",
        fontsize=15, weight='bold', pad=10
    )
    # tight_layout recomputes margins, so the previous manual
    # subplots_adjust(top=0.88) was a no-op and has been dropped.
    plt.tight_layout()
    return fig
| |
|
def show_plots(scenario):
    """Build the three traffic-light gauges for *scenario* and return them
    as a list of PIL images (consistency, representativity, integrity)."""
    figures = [
        plot_dimension("consistency_score", scenario),
        plot_dimension("overall_representativity_score", scenario),
        plot_dimension("integrity_score", scenario),
    ]
    images = []
    for figure in figures:
        # Rasterize each figure to PNG in memory, then hand it to PIL.
        buffer = io.BytesIO()
        figure.savefig(buffer, format='png', bbox_inches='tight')
        buffer.seek(0)
        images.append(Image.open(buffer))
        plt.close(figure)  # release figure memory once rasterized
    return images
| |
|
| | |
def filter_table(col, val):
    """Load issues_log.csv and optionally filter it.

    If both *col* and *val* are non-empty, keep rows where column *col*
    contains *val* (case-insensitive literal substring); otherwise return
    the whole table. Problems are reported as a one-column error DataFrame
    so the Gradio table shows a message instead of a stack trace.
    """
    try:
        df = pd.read_csv("issues_log.csv")
    except FileNotFoundError:
        # Bug fix: a missing file previously crashed the UI callback.
        return pd.DataFrame({"error": ["issues_log.csv not found."]})
    if col and val:
        if col not in df.columns:
            return pd.DataFrame({"error": [f"Column '{col}' not in table."]})
        # regex=False: user-typed text like "(" or "[" must be matched
        # literally, not raise re.error as an invalid pattern.
        mask = df[col].astype(str).str.contains(str(val), case=False, na=False, regex=False)
        return df[mask]
    return df
| |
|
def get_quality_text(selected_scenario):
    """Return the markdown analysis shown under the gauges.

    The detailed report (QUALITY_TEXT) exists only for scenario "A";
    every other scenario gets a short hint to switch.
    """
    if selected_scenario == "A":
        return QUALITY_TEXT
    # Plain literal — the original used an f-string with no placeholders.
    return "Select scenario 'Urgent cleansing' to see the detailed data quality analysis."
| |
|
| | |
# --- Gradio UI wiring ------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario selector: (label, value) pairs — the value ("0"/"A"/"B")
    # is what the change callbacks receive.
    with gr.Row():
        scenario = gr.Dropdown(
            choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
            value="0",
            label="Select Scenario"
        )

    # Three traffic-light gauges, re-rendered on every scenario change.
    with gr.Row():
        out1 = gr.Image(label="Consistency Score Traffic Light")
        out2 = gr.Image(label="Overall Representativity Score Traffic Light")
        out3 = gr.Image(label="Integrity Score Traffic Light")
    scenario.change(show_plots, scenario, [out1, out2, out3])

    # Narrative analysis; the full report only appears for scenario "A"
    # (see get_quality_text). Initialized for the default scenario "0".
    with gr.Row():
        gr.Markdown("### Overall Data Quality Analysis")
        analysis_text = gr.Markdown(value=get_quality_text("0"), visible=True)
    scenario.change(get_quality_text, scenario, analysis_text)

    # Filterable view of the consistency issues log (issues_log.csv).
    with gr.Row():
        gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
    with gr.Row():
        filter_col = gr.Textbox(label="Column (optional)")
        filter_val = gr.Textbox(label="Value (optional)")
    table_out = gr.Dataframe(label="Filtered Table 1.2 (issues_log.csv)")

    # Re-filter whenever either textbox changes; load the full table on start.
    filter_col.change(filter_table, [filter_col, filter_val], table_out)
    filter_val.change(filter_table, [filter_col, filter_val], table_out)
    demo.load(lambda: filter_table("", ""), outputs=table_out)

if __name__ == "__main__":
    demo.launch()
| |
|
| |
|
| |
|