Spaces:

leonardoimpact
/

Data_Validation_Process

Sleeping

File size: 8,784 Bytes

import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image

# --- Data (hardcoded as requested) ---
# One row per cleansing scenario; every score column holds a value in [0, 1].
data = pd.DataFrame(
    [
        ("0", 0.954451, 0.79486, 0.983921, 0.911077),
        ("A", 0.979592, 0.79486, 0.983921, 0.919457),
        ("B", 1.0, 0.75695, 0.983921, 0.913624),
    ],
    columns=[
        "scenario",
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
        "data_quality_score",
    ],
)

# Human-readable label for each scenario key used in `data["scenario"]`.
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent+Low urgency cleansing",
}

# Markdown report returned by get_quality_text() when scenario "A"
# ("Urgent cleansing") is selected. All figures in the text are hardcoded; the
# headline scores are the scenario "A" row of `data` rounded to three decimals,
# so they must be updated by hand if `data` changes.
# NOTE: the two trailing spaces on some lines of the representativity section
# are Markdown hard line breaks -- do not strip them.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score**              : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score**                : 0.984
- **Overall Data Quality Score**     : 0.919

#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions

*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:
- Completeness check: An answer was expected but not provided.
- Dist outlier check: A value outside the range of reasonable values.
- Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
- Model-based outlier: An inconsistent or extreme value compared to typical responses.

Question: 'enumerator_name' has 98 issues.
  - The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.

Question: 'household_average_income_female_members' has 81 issues.
  - The dimension(s) with the most issues: model based outlier with 41 issues.
  - The second dimension with issues: completeness check with 40 issues.

Question: 'household_average_income' has 72 issues.
  - The dimension(s) with the most issues: model based outlier with 39 issues.
  - The second dimension with issues: completeness check with 33 issues.

Question: 'household_average_income_male_members' has 39 issues.
  - The dimension with the most issues: completeness check with 39 issues.

Question: 'household_average_expenses_education' has 29 issues.
  - The dimension(s) with the most issues: model based outlier with 23 issues.
  - The second dimension with issues: completeness check with 6 issues.

Question: 'impact_contributions_other_factors' has 23 issues.
  - The dimension with the most issues: completeness check with 23 issues.

Question: 'monthly_spend_on_healthcare' has 21 issues.
  - The dimension with the most issues: completeness check with 21 issues.

For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.

---

### Integrity Action Suggestions
The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.

**Respondent with _index: 1**

 The following checks scored below the maximum value:
Payment For Survey (score: 0/1)
Respondent Influenced (score: 0/1)
Response Time Integrity (score: 0.0/1)
Questions Which Were Difficult (score: 0.0/2)
Respondent Suspicious (score: 0/2)
Phone Number Check (score: 0.0/1)
Name Check (score: 0.0/1)
Location Check (score: 0/1)

 The following checks are evaluated for integrity:
- **Payment for Survey:** Less integrity if the respondent was paid to do it.
- **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
- **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
- **Audio Verification:** More integrity if audio verification is in place.
- **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
- **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
- **Phone Number Check:** More integrity if a realistic phone number is provided.
- **Response Uniqueness:** More integrity if the response is truly unique.
- **Name Check:** More integrity if the name is realistic.
- **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
- **Enumerator Bias:** Less integrity if enumerator responses are biased.
- **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.

For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.

---

### Representativity Action Suggestions

Baseline (no cleansing) overall representativity score: 0.795  
After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).  
After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).

---

### Enumerator Action Suggestions
No enumerator bias has been found.
"""

def plot_dimension(dim_col, scenario):
    """Draw a vertical "traffic light" gauge for one quality dimension.

    The gauge shows three coloured bands (red 0-0.6, amber 0.6-0.8,
    green 0.8-1.0) with a short black marker line and a numeric label at
    the score of ``dim_col`` for the given ``scenario``.

    Args:
        dim_col: Name of a score column in the module-level ``data`` frame.
        scenario: Scenario key ("0", "A" or "B") as stored in
            ``data["scenario"]`` and ``scenario_map``.

    Returns:
        The matplotlib figure holding the gauge. The caller is responsible
        for closing it.

    Raises:
        KeyError: If ``scenario`` or ``dim_col`` is not present in ``data``,
            or ``scenario`` is missing from ``scenario_map``.
    """
    value = data.set_index("scenario").loc[scenario, dim_col]
    fig, ax = plt.subplots(figsize=(4, 7))

    # The axis spans the real score range 0..1. It was previously capped at
    # 0.95 while the top tick was labelled "1.0", which distorted the scale
    # and pushed every score above 0.95 outside the visible axes.
    ax.axhspan(0, 0.6, color="#FF4D4F", alpha=0.30)
    ax.axhspan(0.6, 0.8, color="#FFE58F", alpha=0.30)
    ax.axhspan(0.8, 1.0, color="#52C41A", alpha=0.30)

    # clip_on=False keeps the full marker width visible when the score sits
    # exactly on the upper axis limit (value == 1.0).
    ax.axhline(value, color='black', lw=2, xmin=0.35, xmax=0.65, clip_on=False)

    # Near the top of the scale, hang the label below the marker so it stays
    # inside the axes instead of colliding with the title.
    label_va = 'top' if value > 0.9 else 'bottom'
    ax.annotate(
        f"{value:.3f}",
        xy=(0.5, value),
        xycoords=('axes fraction', 'data'),
        ha='center', va=label_va,
        fontsize=22, weight='bold',
        color='black',
        bbox=dict(facecolor='white', edgecolor='none', alpha=0.8, boxstyle='round,pad=0.2')
    )

    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1.0)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1.0])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=15)

    # Only the left spine is kept, acting as the gauge's scale bar.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.spines['left'].set_visible(True)
    ax.spines['left'].set_linewidth(2)

    ax.set_title(
        f"{dim_col.replace('_', ' ').title()}\n({scenario_map[scenario]})",
        fontsize=15, weight='bold', pad=10
    )
    # tight_layout recomputes the subplot margins, so no manual
    # subplots_adjust call is needed beforehand.
    fig.tight_layout()
    return fig

def show_plots(scenario):
    """Render the three dimension gauges for ``scenario`` as PIL images.

    Args:
        scenario: Scenario key forwarded unchanged to ``plot_dimension``.

    Returns:
        A list of three PIL images, in the order consistency, overall
        representativity, integrity.
    """
    dimension_columns = (
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
    )
    # All figures are created first, then rasterised one after another.
    figures = [plot_dimension(column, scenario) for column in dimension_columns]

    def _rasterise(figure):
        # Save the figure as PNG into memory, reopen it with PIL, then
        # close the figure so pyplot does not keep it alive.
        png_buffer = io.BytesIO()
        figure.savefig(png_buffer, format='png', bbox_inches='tight')
        png_buffer.seek(0)
        picture = Image.open(png_buffer)
        plt.close(figure)
        return picture

    return [_rasterise(figure) for figure in figures]

# --- Simple table filter function ---
def filter_table(col, val):
    """Load ``issues_log.csv`` and optionally filter it on one column.

    Args:
        col: Name of the column to filter on. Empty/None disables filtering.
        val: Text to look for in that column. Empty/None disables filtering.

    Returns:
        - the full table when ``col`` or ``val`` is empty;
        - the rows whose ``col`` value (as text) contains ``val`` as a
          case-insensitive literal substring;
        - a one-cell ``error`` table when ``col`` is not a column of the
          table or when ``issues_log.csv`` cannot be found.
    """
    try:
        df = pd.read_csv("issues_log.csv")
    except FileNotFoundError:
        # Report the problem in the table itself, matching how an unknown
        # column is reported below.
        return pd.DataFrame({"error": ["File 'issues_log.csv' not found."]})

    if not (col and val):
        return df
    if col not in df.columns:
        return pd.DataFrame({"error": [f"Column '{col}' not in table."]})

    # regex=False: the value comes straight from a UI textbox, so characters
    # such as "(" or "[" must match literally instead of raising re.error.
    mask = df[col].astype(str).str.contains(str(val), case=False, na=False, regex=False)
    return df[mask]

def get_quality_text(selected_scenario):
    """Return the Markdown analysis text for the selected scenario.

    Only scenario "A" has a detailed report (``QUALITY_TEXT``); any other
    value yields a one-line hint telling the user which scenario to select.
    """
    if selected_scenario != "A":
        return "Select scenario 'Urgent cleansing' to see the detailed data quality analysis."
    return QUALITY_TEXT

# --- Gradio UI ---
# Layout: scenario selector, three score gauges, the analysis text, and a
# filterable view of issues_log.csv.
with gr.Blocks() as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    with gr.Row():
        scenario = gr.Dropdown(
            # (label, value) pairs built from scenario_map so the dropdown
            # labels and the plot titles cannot drift apart.
            choices=[(label, key) for key, label in scenario_map.items()],
            value="0",
            label="Select Scenario"
        )

    with gr.Row():
        out1 = gr.Image(label="Consistency Score Traffic Light")
        out2 = gr.Image(label="Overall Representativity Score Traffic Light")
        out3 = gr.Image(label="Integrity Score Traffic Light")
    scenario.change(show_plots, scenario, [out1, out2, out3])
    # The change event only fires once the user picks another scenario, so
    # render the gauges for the default selection when the page loads.
    demo.load(show_plots, scenario, [out1, out2, out3])

    with gr.Row():
        gr.Markdown("### Overall Data Quality Analysis")
        analysis_text = gr.Markdown(value=get_quality_text("0"), visible=True)
    scenario.change(get_quality_text, scenario, analysis_text)

    with gr.Row():
        gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
    with gr.Row():
        filter_col = gr.Textbox(label="Column (optional)")
        filter_val = gr.Textbox(label="Value (optional)")
        table_out = gr.Dataframe(label="Filtered Table 1.2 (issues_log.csv)")

    # Re-filter whenever either textbox changes; show the unfiltered table on load.
    filter_col.change(filter_table, [filter_col, filter_val], table_out)
    filter_val.change(filter_table, [filter_col, filter_val], table_out)
    demo.load(lambda: filter_table("", ""), outputs=table_out)

if __name__ == "__main__":
    demo.launch()