# Hugging Face Space header (extraction residue): fortuala — "Update app.py" — commit 9d688cb (verified)
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image
# ───────────────────────── 1. scenario scores ──────────────────────────
# Pre-computed quality metrics for each cleansing scenario, keyed by the
# scenario code ("0", "A", "B") used throughout the UI.
scores = pd.DataFrame(
    {
        "consistency_score": [0.954451, 0.979592, 1.000000],
        "representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
    },
    index=pd.Index(["0", "A", "B"], name="scenario"),
)

# Human-readable label for each scenario code (shown in plot titles).
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent + Low-urgency cleansing",
}
# ─────────────────── 2. long Markdown shown by the button ──────────────
# Static analysis report rendered into the hidden Markdown component when the
# "Data Validation Summary" button is clicked.  The figures here describe
# scenario A (urgent cleansing only) and are hard-coded, not derived from
# `scores` at runtime — keep the two in sync manually if either changes.
QUALITY_TEXT = """
### Overall Data Quality Analysis
After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919
#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
---
### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
The following dimensions are evaluated for consistency:
- Completeness check
- Dist-outlier check
- Free-text check
- Model-based outlier
**Key questions with many issues**
- `enumerator_name` – 98 issues (free-text)
- `household_average_income_female_members` – 81 issues (outliers & completeness)
- `household_average_income` – 72 issues (outliers & completeness)
- `household_average_income_male_members` – 39 issues (completeness)
- `household_average_expenses_education` – 29 issues (outliers & completeness)
- `impact_contributions_other_factors` – 23 issues (completeness)
- `monthly_spend_on_healthcare` – 21 issues (completeness)
For full details see the **Data Consistency Issues Deep Dive** tab.
---
### Integrity Action Suggestions
Respondent `_index: 1` shows low integrity scores:
| Check | Score |
|-------|------:|
| Payment for Survey | 0/1 |
| Respondent Influenced | 0/1 |
| Response Time Integrity | 0.0/1 |
| Questions Were Difficult | 0.0/2 |
| Respondent Suspicious | 0/2 |
| Phone Number Check | 0.0/1 |
| Name Check | 0.0/1 |
| Location Check | 0/1 |
See **Integrity Issues Deep Dive** for more respondents.
---
### Representativity Action Suggestions
| Scenario | Score | Ξ” vs Baseline |
|----------|-------|--------------|
| Baseline (0) | 0.795 | β€” |
| Urgent cleansing (A) | 0.795 | Β±0.000 |
| +Low-urgency cleansing (B) | 0.757 | βˆ’0.038 |
---
### Enumerator Action Suggestions
No enumerator bias detected.
"""
# ───────────────────── 3. traffic-light plot helper ────────────────────
def traffic_plot(metric, scen):
    """Draw one vertical traffic-light gauge.

    Looks up ``scores.loc[scen, metric]`` and renders it as a black marker
    over red / amber / green bands (0–0.6 / 0.6–0.8 / 0.8–1.0).
    Returns the matplotlib Figure (caller is responsible for closing it).
    """
    value = scores.loc[scen, metric]
    fig, ax = plt.subplots(figsize=(3, 6))

    # coloured bands: (low, high, colour) for red / amber / green zones
    for lo, hi, colour in (
        (0.00, 0.60, "#ff4d4f"),
        (0.60, 0.80, "#ffe58f"),
        (0.80, 1.00, "#52c41a"),
    ):
        ax.axhspan(lo, hi, color=colour, alpha=0.30)

    # black line marker at the metric's value
    ax.axhline(value, color="black", lw=3, xmin=0.30, xmax=0.70)

    # axes styling: no x axis, ticks only at the band boundaries
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, .6, .8, 1])
    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

    # hide every spine, then restore a thickened left axis
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.spines["left"].set_visible(True)
    ax.spines["left"].set_linewidth(2)

    ax.set_title(
        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
        fontsize=12,
        weight="bold",
        pad=6,
    )
    plt.tight_layout()
    return fig
def make_images(scen):
    """Render the three traffic-light gauges for scenario *scen*.

    Returns a list of three PIL images (consistency, representativity,
    integrity) suitable for the three ``gr.Image`` outputs.
    """
    imgs = []
    for met in [
        "consistency_score",
        "representativity_score",
        "integrity_score",
    ]:
        fig = traffic_plot(met, scen)
        buf = io.BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight")
        # Close the exact figure we created.  A bare ``plt.close()`` only
        # closes the *current* figure, which may not be ours if any other
        # code touched pyplot state in the meantime — figures would then
        # accumulate and leak memory.
        plt.close(fig)
        buf.seek(0)
        img = Image.open(buf)
        img.load()  # decode pixels now so the BytesIO buffer can be released
        imgs.append(img)
    return imgs
# ───────────────────── 4. CSV-table simple filter ──────────────────────
# NOTE: the CSV is read once at import time; the app will fail to start if
# the file is missing from the working directory.
CSV_FILE = "issues_log.csv" # rename here if needed
df_full = pd.read_csv(CSV_FILE) # load once
def filter_csv(col, val, df=None):
    """Return the rows of *df* whose column *col* contains substring *val*.

    Parameters
    ----------
    col : str or None
        Column name to filter on; falsy values disable filtering.
    val : str or None
        Substring to look for (case-insensitive); falsy values disable
        filtering.
    df : pandas.DataFrame, optional
        Table to filter; defaults to the module-level ``df_full`` so the
        existing Gradio wiring (two positional args) keeps working.
    """
    if df is None:
        df = df_full
    if col and val:
        # regex=False: treat the user's input as a literal substring.
        # The default regex=True would raise re.error on input such as "(".
        mask = df[col].astype(str).str.contains(
            val, case=False, na=False, regex=False
        )
        return df[mask]
    return df
# ───────────────────── 5. Gradio interface ─────────────────────────────
# Layout: a scenario dropdown driving three gauge images, a button that
# reveals the static validation summary, and a filterable CSV table viewer.
with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
    gr.Markdown("## Data Quality Scenario Explorer")
    # Dropdown choices are (label, value) pairs; the value ("0"/"A"/"B")
    # is what event handlers receive.
    scen = gr.Dropdown(
        label="Scenario",
        choices=[
            ("No cleansing", "0"),
            ("Urgent cleansing", "A"),
            ("Urgent + Low-urgency cleansing", "B"),
        ],
        value="0",
    )
    # plots side-by-side: one image per metric, refreshed on every
    # dropdown change and pre-rendered for scenario "0" on page load
    with gr.Row():
        im1, im2, im3 = gr.Image(), gr.Image(), gr.Image()
    scen.change(make_images, scen, [im1, im2, im3])
    demo.load(lambda: make_images("0"), outputs=[im1, im2, im3])
    # summary text button: the Markdown starts hidden and is both filled
    # and revealed by a single gr.update on click
    summary_btn = gr.Button("Data Validation Summary")
    summary_md = gr.Markdown(visible=False)
    summary_btn.click(lambda: gr.update(value=QUALITY_TEXT, visible=True),
                      inputs=None, outputs=summary_md)
    # ──────── table viewer with universal filter ─────────
    gr.Markdown("### Table 1-2 Viewer")
    with gr.Row():
        col_dd = gr.Dropdown(label="Column",
                             choices=list(df_full.columns),
                             value=list(df_full.columns)[0])
        val_tb = gr.Textbox(label="Filter value (optional)")
        apply_b = gr.Button("Apply Filter")
        reset_b = gr.Button("Show All")
    table_df = gr.Dataframe(value=df_full, label="table_1_2.csv")
    apply_b.click(filter_csv, [col_dd, val_tb], table_df)
    # "Show All" simply re-emits the unfiltered table
    reset_b.click(lambda: df_full, None, table_df)
if __name__ == "__main__":
    demo.launch()