Spaces:

leonardoimpact
/

Data_Validation_Process

Sleeping

App Files Files Community

fortuala committed on Aug 5, 2025

Commit

c973292

verified ·

1 Parent(s): e83e816

Update app.py

Browse files

Files changed (1) hide show

app.py +105 -114

app.py CHANGED Viewed

@@ -4,25 +4,24 @@ import matplotlib.pyplot as plt
 import io
 from PIL import Image
-# ───────────────────────────────────────────
-# 1. Static data for the three scenarios
-# ───────────────────────────────────────────
-data = pd.DataFrame({
-    "scenario": ["0", "A", "B"],
-    "consistency_score": [0.954451, 0.979592, 1.000000],
-    "overall_representativity_score": [0.79486, 0.79486, 0.75695],
-    "integrity_score": [0.983921, 0.983921, 0.983921],
-    "data_quality_score": [0.911077, 0.919457, 0.913624]
-}).set_index("scenario")
-scenario_map = {"0": "No cleansing",
-                "A": "Urgent cleansing",
-                "B": "Urgent + Low-urgency cleansing"}
-# ───────────────────────────────────────────
-# 2. Full analysis text (shown for Scenario A)
-# ───────────────────────────────────────────
-QUALITY_TEXT = """\
 ### Overall Data Quality Analysis
 After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
@@ -41,29 +40,29 @@ The overall data quality score is satisfactory, but the following dimensions req
 *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
 The following dimensions are evaluated for consistency:
-- Completeness check (missing answers)
-- Dist-outlier check (extreme values)
-- Free-text check (short answers)
-- Model-based outlier (inconsistent values)
-**Key questions with many issues:**
-- `enumerator_name` — 98 issues (mainly free-text)
-- `household_average_income_female_members` — 81 issues (outliers & completeness)
-- `household_average_income` — 72 issues (outliers & completeness)
-- `household_average_income_male_members` — 39 issues (completeness)
-- `household_average_expenses_education` — 29 issues (outliers & completeness)
-- `impact_contributions_other_factors` — 23 issues (completeness)
-- `monthly_spend_on_healthcare` — 21 issues (completeness)
-For a detailed view of consistency issues, see the **Data Consistency Issues Deep Dive** tab.
 ---
 ### Integrity Action Suggestions
 Respondent `_index: 1` shows low integrity scores:
-| Check | Score | |
-|-------|-------|---|
 | Payment for Survey | 0/1 |
 | Respondent Influenced | 0/1 |
 | Response Time Integrity | 0.0/1 |
@@ -73,15 +72,15 @@ Respondent `_index: 1` shows low integrity scores:
 | Name Check | 0.0/1 |
 | Location Check | 0/1 |
-For definitions and more respondents, open the **Integrity Issues Deep Dive** tab.
 ---
 ### Representativity Action Suggestions
-| Scenario | Score | Δ vs. Baseline |
-|----------|-------|----------------|
 | Baseline (0) | 0.795 | — |
-| Urgent cleansing (A) | 0.795 |  ±0.000 |
 | +Low-urgency cleansing (B) | 0.757 | −0.038 |
 ---
@@ -90,111 +89,103 @@ For definitions and more respondents, open the **Integrity Issues Deep Dive** ta
 No enumerator bias detected.
 """
-# ───────────────────────────────────────────
-# 3. Traffic-light plot helper
-# ───────────────────────────────────────────
-def make_plot(dim, scen):
-    val = data.loc[scen, dim]
-    fig, ax = plt.subplots(figsize=(4, 7))
     # coloured bands
-    ax.axhspan(0,   0.60, color="#FF4D4F", alpha=0.3)  # red
-    ax.axhspan(0.60, 0.80, color="#FFE58F", alpha=0.3)  # yellow
-    ax.axhspan(0.80, 0.95, color="#52C41A", alpha=0.3)  # green
-    # black marker
-    ax.axhline(val, color="black", lw=2, xmin=0.35, xmax=0.65)
-    ax.annotate(f"{val:.3f}", (0.5, val),
-                xycoords=("axes fraction", "data"),
-                ha="center", va="bottom",
-                fontsize=22, weight="bold",
-                bbox=dict(boxstyle="round,pad=0.2", fc="white", ec="none", alpha=0.85))
-    # cosmetics
     ax.set_xlim(0, 1)
-    ax.set_ylim(0, 0.95)
     ax.set_xticks([])
-    ax.set_yticks([0, 0.6, 0.8, 0.95])
-    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"], fontsize=14)
     for s in ax.spines.values():
         s.set_visible(False)
     ax.spines["left"].set_visible(True)
     ax.spines["left"].set_linewidth(2)
-    ax.set_title(f"{dim.replace('_', ' ').title()}\n({scenario_map[scen]})",
-                 fontsize=15, weight="bold", pad=8)
     plt.tight_layout()
     return fig
-def get_plots(scen):
     imgs = []
-    for dim in ["consistency_score",
-                "overall_representativity_score",
-                "integrity_score"]:
         buf = io.BytesIO()
-        make_plot(dim, scen).savefig(buf, format="png", bbox_inches="tight")
         buf.seek(0)
         imgs.append(Image.open(buf))
         plt.close()
     return imgs
-# ───────────────────────────────────────────
-# 4. CSV-table filter helper
-# ───────────────────────────────────────────
-CSV_FILE = "table_1_2.csv"   # change if your file has a different name
-def filter_table(col, val):
     df = pd.read_csv(CSV_FILE)
     if col and val and col in df.columns:
-        return df[df[col].astype(str).str.contains(str(val), case=False, na=False)]
     return df
-# ───────────────────────────────────────────
-# 5. Gradio UI
-# ───────────────────────────────────────────
 with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
     gr.Markdown("## Data Quality Scenario Explorer")
-    # Scenario selector
-    scenario = gr.Dropdown(
-        label="Select Scenario",
-        choices=[("No cleansing", "0"),
-                 ("Urgent cleansing", "A"),
-                 ("Urgent + Low-urgency cleansing", "B")],
         value="0",
     )
-    # Three traffic-light plots
-    img1 = gr.Image(label="Consistency")
-    img2 = gr.Image(label="Representativity")
-    img3 = gr.Image(label="Integrity")
-    scenario.change(get_plots, scenario, [img1, img2, img3])
-    demo.load(lambda: get_plots("0"), outputs=[img1, img2, img3])
-    # Button → show full analysis text
-    show_btn = gr.Button("Show Overall Data Quality Analysis")
-    analysis_md = gr.Markdown(visible=False)
-    def show_analysis(scen):
-        return gr.update(
-            value=QUALITY_TEXT if scen == "A" else
-                  "Please select **Urgent cleansing (Scenario A)** to view the detailed analysis.",
-            visible=True
-        )
-    show_btn.click(show_analysis, scenario, analysis_md)
-    # ───────── table section ─────────
-    gr.Markdown("### Data Consistency Issues – Table 1.2")
     with gr.Row():
-        col_in  = gr.Textbox(label="Column (optional)", placeholder="e.g. question")
-        val_in  = gr.Textbox(label="Value (optional)",  placeholder="e.g. income")
-        apply_btn = gr.Button("Apply / Refresh")
-    table_out = gr.Dataframe(label="Filtered table_1_2.csv")
-    apply_btn.click(filter_table, [col_in, val_in], table_out)
-    demo.load(lambda: filter_table("", ""), outputs=table_out)
 if __name__ == "__main__":
     demo.launch()

 import io
 from PIL import Image
+# ───────────────────────── 1. scenario scores ──────────────────────────
+scores = pd.DataFrame(
+    {
+        "scenario": ["0", "A", "B"],
+        "consistency_score": [0.954451, 0.979592, 1.000000],
+        "overall_representativity_score": [0.79486, 0.79486, 0.75695],
+        "integrity_score": [0.983921, 0.983921, 0.983921],
+    }
+).set_index("scenario")
+scenario_map = {
+    "0": "No cleansing",
+    "A": "Urgent cleansing",
+    "B": "Urgent + Low-urgency cleansing",
+}
+# ─────────────────── 2. long Markdown shown by the button ──────────────
+QUALITY_TEXT = """
 ### Overall Data Quality Analysis
 After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
 *Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
 The following dimensions are evaluated for consistency:
+- Completeness check
+- Dist-outlier check
+- Free-text check
+- Model-based outlier
+**Key questions with many issues**
+- `enumerator_name` – 98 issues (free-text)
+- `household_average_income_female_members` – 81 issues (outliers & completeness)
+- `household_average_income` – 72 issues (outliers & completeness)
+- `household_average_income_male_members` – 39 issues (completeness)
+- `household_average_expenses_education` – 29 issues (outliers & completeness)
+- `impact_contributions_other_factors` – 23 issues (completeness)
+- `monthly_spend_on_healthcare` – 21 issues (completeness)
+For full details see the **Data Consistency Issues Deep Dive** tab.
 ---
 ### Integrity Action Suggestions
 Respondent `_index: 1` shows low integrity scores:
+| Check | Score |
+|-------|------:|
 | Payment for Survey | 0/1 |
 | Respondent Influenced | 0/1 |
 | Response Time Integrity | 0.0/1 |
 | Name Check | 0.0/1 |
 | Location Check | 0/1 |
+See **Integrity Issues Deep Dive** for more respondents.
 ---
 ### Representativity Action Suggestions
+| Scenario | Score | Δ vs Baseline |
+|----------|-------|--------------|
 | Baseline (0) | 0.795 | — |
+| Urgent cleansing (A) | 0.795 | ±0.000 |
 | +Low-urgency cleansing (B) | 0.757 | −0.038 |
 ---
 No enumerator bias detected.
 """
+# ───────────────────── 3. traffic-light plot helper ────────────────────
+def traffic_plot(metric, scen):
+    y = scores.loc[scen, metric]
+    fig, ax = plt.subplots(figsize=(3, 6))
     # coloured bands
+    ax.axhspan(0.00, 0.60, color="#ff4d4f", alpha=0.30)
+    ax.axhspan(0.60, 0.80, color="#ffe58f", alpha=0.30)
+    ax.axhspan(0.80, 1.00, color="#52c41a", alpha=0.30)
+    # black line marker
+    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)
+    # axes styling
     ax.set_xlim(0, 1)
+    ax.set_ylim(0, 1)
     ax.set_xticks([])
+    ax.set_yticks([0, .6, .8, 1])
+    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)
     for s in ax.spines.values():
         s.set_visible(False)
     ax.spines["left"].set_visible(True)
     ax.spines["left"].set_linewidth(2)
+    ax.set_title(
+        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
+        fontsize=12,
+        weight="bold",
+        pad=6,
+    )
     plt.tight_layout()
     return fig
+def make_images(scen):
     imgs = []
+    for met in [
+        "consistency_score",
+        "overall_representativity_score",
+        "integrity_score",
+    ]:
         buf = io.BytesIO()
+        traffic_plot(met, scen).savefig(buf, format="png", bbox_inches="tight")
         buf.seek(0)
         imgs.append(Image.open(buf))
         plt.close()
     return imgs
+# ───────────────────── 4. CSV-table simple filter ──────────────────────
+CSV_FILE = "table_1_2.csv"  # rename if needed
+def filter_csv(col, val):
     df = pd.read_csv(CSV_FILE)
     if col and val and col in df.columns:
+        return df[df[col].astype(str).str.contains(val, case=False, na=False)]
     return df
+# ───────────────────── 5. Gradio interface ─────────────────────────────
 with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
     gr.Markdown("## Data Quality Scenario Explorer")
+    scen = gr.Dropdown(
+        label="Scenario",
+        choices=[
+            ("No cleansing", "0"),
+            ("Urgent cleansing", "A"),
+            ("Urgent + Low-urgency cleansing", "B"),
+        ],
         value="0",
     )
+    # three images side-by-side
     with gr.Row():
+        im1 = gr.Image()
+        im2 = gr.Image()
+        im3 = gr.Image()
+    scen.change(make_images, scen, [im1, im2, im3])
+    demo.load(lambda: make_images("0"), outputs=[im1, im2, im3])
+    # button -> full markdown
+    show_btn = gr.Button("Data Validation Summary")
+    summary_md = gr.Markdown(visible=False)
+    show_btn.click(lambda: gr.update(value=QUALITY_TEXT, visible=True),
+                   inputs=None, outputs=summary_md)
+    # table viewer
+    gr.Markdown("### Table 1-2 Viewer")
+    col_in = gr.Textbox(label="Column (optional)")
+    val_in = gr.Textbox(label="Value (optional)")
+    tbl_btn = gr.Button("Show / Filter Table")
+    tbl_out = gr.Dataframe()
+    tbl_btn.click(filter_csv, [col_in, val_in], tbl_out)
+    demo.load(lambda: filter_csv("", ""), outputs=tbl_out)
 if __name__ == "__main__":
     demo.launch()