# Source: Hugging Face Space leonardoimpact/Data_Validation_Process
# (Space status when captured: Sleeping; file size: 6,658 bytes)

import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image

# ───────────────────────── 1. scenario scores ──────────────────────────
# Hard-coded quality scores: one row per cleansing scenario, one column per
# quality dimension. Rows are labelled by scenario key via the named index.
scores = pd.DataFrame(
    data={
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    },
    index=pd.Index(["0", "A", "B"], name="scenario"),
)

# Scenario key -> human-readable label used in plot titles.
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent + Low-urgency cleansing",
}

# ─────────────────── 2. long Markdown shown by the button ──────────────
# Static Markdown report revealed by the "Data Validation Summary" button.
# NOTE: the figures below are hard-coded for the urgent-cleansing scenario
# (A); they are not derived from `scores` and do not change with the
# Scenario dropdown. Trailing double spaces are Markdown hard line breaks.
QUALITY_TEXT = """
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score**              : 0.980  
- **Overall Representativity Score** : 0.795  
- **Integrity Score**                : 0.984  
- **Overall Data Quality Score**     : 0.919  

#### Summary  
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

---

### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*

The following dimensions are evaluated for consistency:  
- Completeness check  
- Dist-outlier check  
- Free-text check  
- Model-based outlier  

**Key questions with many issues**  
- `enumerator_name` – 98 issues (free-text)  
- `household_average_income_female_members` – 81 issues (outliers & completeness)  
- `household_average_income` – 72 issues (outliers & completeness)  
- `household_average_income_male_members` – 39 issues (completeness)  
- `household_average_expenses_education` – 29 issues (outliers & completeness)  
- `impact_contributions_other_factors` – 23 issues (completeness)  
- `monthly_spend_on_healthcare` – 21 issues (completeness)  

For full details see the **Data Consistency Issues Deep Dive** tab.

---

### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:

| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |

See **Integrity Issues Deep Dive** for more respondents.

---

### Representativity Action Suggestions
| Scenario | Score | Δ vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | — |
| Urgent cleansing (A) | 0.795 | ±0.000 |
| +Low-urgency cleansing (B) | 0.757 | −0.038 |

---

### Enumerator Action Suggestions
No enumerator bias detected.
"""

# ───────────────────── 3. traffic-light plot helper ────────────────────
def traffic_plot(metric, scen):
    """Draw a vertical traffic-light gauge for one score of one scenario.

    The gauge is a 0-1 axis split into a red (0-0.6), amber (0.6-0.8) and
    green (0.8-1) band, with a short black bar at the score's value.

    Args:
        metric: Column name in ``scores`` (e.g. ``"consistency_score"``).
        scen: Scenario key; a row label of ``scores`` and a key of
            ``scenario_map``.

    Returns:
        The matplotlib ``Figure``. It is not closed here, so the caller
        should close it once it has been rendered.

    Raises:
        KeyError: If ``scen`` or ``metric`` is not present in ``scores``,
            or ``scen`` is not a key of ``scenario_map``.
    """
    # Resolve both lookups before creating the figure so an unknown key
    # cannot leave an orphaned, never-closed figure behind.
    y = scores.loc[scen, metric]
    title = f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})"

    fig, ax = plt.subplots(figsize=(3, 6))

    # coloured bands: (lower bound, upper bound, colour)
    bands = (
        (0.00, 0.60, "#ff4d4f"),
        (0.60, 0.80, "#ffe58f"),
        (0.80, 1.00, "#52c41a"),
    )
    for lower, upper, colour in bands:
        ax.axhspan(lower, upper, color=colour, alpha=0.30)

    # black line marker, spanning the middle 40 % of the axis width
    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

    # axes styling: no x ticks, y ticks only at the band boundaries
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # keep only the left spine, drawn thicker, as the gauge's scale
    for side, spine in ax.spines.items():
        spine.set_visible(side == "left")
    ax.spines["left"].set_linewidth(2)

    ax.set_title(title, fontsize=12, weight="bold", pad=6)

    # Lay out this figure explicitly instead of pyplot's global "current
    # figure", which may be a different one when callbacks run concurrently.
    fig.tight_layout()
    return fig


def make_images(scen):
    """Render the three score gauges for a scenario as PIL images.

    Args:
        scen: Scenario key passed through to ``traffic_plot``.

    Returns:
        A list of three ``PIL.Image`` objects, in the order consistency,
        representativity, integrity.
    """
    imgs = []
    for met in (
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ):
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        try:
            fig.savefig(buf, format="png", bbox_inches="tight")
        finally:
            # Close exactly the figure we created (a bare plt.close() closes
            # whichever figure is "current"), and do so even if saving fails.
            plt.close(fig)
        buf.seek(0)
        # buf is deliberately left open: the image keeps a reference to it
        # and may read the pixel data from it later.
        imgs.append(Image.open(buf))
    return imgs


# ───────────────────── 4. CSV-table simple filter ──────────────────────
# Relative path of the CSV shown in the table viewer.
CSV_FILE = "issues_log.csv"        # rename here if needed
# Read at import time; the same DataFrame object is reused by every
# filter/reset callback.
df_full  = pd.read_csv(CSV_FILE)  # load once

def filter_csv(col, val):
    """Return the rows of ``df_full`` whose ``col`` contains ``val``.

    Cells are compared by their string form, case-insensitively, and ``val``
    is treated as literal text, not as a regular expression. Characters
    such as ``(``, ``.`` or ``*`` typed into the filter box therefore match
    themselves instead of raising or matching unexpectedly.

    Args:
        col: Name of the column to search; falsy means "no filter".
        val: Substring to look for; falsy means "no filter".

    Returns:
        The matching rows, or ``df_full`` itself when either argument is
        empty.

    Raises:
        KeyError: If ``col`` is not a column of ``df_full``.
    """
    if not (col and val):
        return df_full
    cells = df_full[col].astype(str)
    mask = cells.str.contains(val, case=False, na=False, regex=False)
    return df_full[mask]

# ───────────────────── 5. Gradio interface ─────────────────────────────
# Page layout: components are placed in the order they are created below.
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario picker; each choice is a (display label, scenario key) pair.
    scen = gr.Dropdown(
        choices=[
            ("No cleansing", "0"),
            ("Urgent cleansing", "A"),
            ("Urgent + Low-urgency cleansing", "B"),
        ],
        value="0",
        label="Scenario",
    )

    # Three gauge images in one row, redrawn whenever the scenario changes
    # and drawn once for scenario "0" when the page loads.
    with gr.Row():
        im1, im2, im3 = (gr.Image() for _ in range(3))
    scen.change(fn=make_images, inputs=scen, outputs=[im1, im2, im3])
    demo.load(fn=lambda: make_images("0"), outputs=[im1, im2, im3])

    # Summary report: hidden until the button is pressed.
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(
        fn=lambda: gr.update(value=QUALITY_TEXT, visible=True),
        inputs=None,
        outputs=summary_md,
    )

    # Table viewer with a single column/substring filter.
    gr.Markdown("### Table 1-2 Viewer")

    column_names = list(df_full.columns)
    with gr.Row():
        col_dd = gr.Dropdown(
            label="Column",
            choices=column_names,
            value=column_names[0],
        )
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")

    table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")

    apply_b.click(fn=filter_csv, inputs=[col_dd, val_tb], outputs=table_df)
    reset_b.click(fn=lambda: df_full, inputs=None, outputs=table_df)

# Start the web server only when run as a script.
if __name__ == "__main__":
    demo.launch()