Spaces:

leonardoimpact
/

Data_Validation_Process

Sleeping

App Files Files Community

Data_Validation_Process / app.py

fortuala

Update app.py

9d688cb verified 9 months ago

raw

history blame

6.66 kB

	import gradio as gr
	import pandas as pd
	import matplotlib.pyplot as plt
	import io
	from PIL import Image

	# ───────────────────────── 1. scenario scores ──────────────────────────
	# Quality scores per cleansing scenario. Rows are keyed by the scenario
	# code ("0", "A", "B"); each column is one quality dimension.
	scores = pd.DataFrame(
	    {
	        "consistency_score": [0.954451, 0.979592, 1.000000],
	        "representativity_score": [0.79486, 0.79486, 0.75695],
	        "integrity_score": [0.983921, 0.983921, 0.983921],
	    },
	    index=pd.Index(["0", "A", "B"], name="scenario"),
	)

	# Display label for each scenario code used as the index of `scores`.
	scenario_map = {
	    "0": "No cleansing",
	    "A": "Urgent cleansing",
	    "B": "Urgent + Low-urgency cleansing",
	}

	# ─────────────────── 2. long Markdown shown by the button ──────────────
	# Markdown report revealed by the "Data Validation Summary" button.
	# Table rows use plain `|` delimiters: a backslash-escaped pipe is an
	# invalid Python escape sequence and stops Markdown from seeing a table.
	QUALITY_TEXT = """
	### Overall Data Quality Analysis

	After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

	- Consistency Score : 0.980
	- Overall Representativity Score : 0.795
	- Integrity Score : 0.984
	- Overall Data Quality Score : 0.919

	#### Summary
	The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.

	---

	### Consistency Action Suggestions
	Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:

	The following dimensions are evaluated for consistency:
	- Completeness check
	- Dist-outlier check
	- Free-text check
	- Model-based outlier

	Key questions with many issues
	- `enumerator_name` – 98 issues (free-text)
	- `household_average_income_female_members` – 81 issues (outliers & completeness)
	- `household_average_income` – 72 issues (outliers & completeness)
	- `household_average_income_male_members` – 39 issues (completeness)
	- `household_average_expenses_education` – 29 issues (outliers & completeness)
	- `impact_contributions_other_factors` – 23 issues (completeness)
	- `monthly_spend_on_healthcare` – 21 issues (completeness)

	For full details see the Data Consistency Issues Deep Dive tab.

	---

	### Integrity Action Suggestions
	Respondent `_index: 1` shows low integrity scores:

	| Check | Score |
	|-------|------:|
	| Payment for Survey | 0/1 |
	| Respondent Influenced | 0/1 |
	| Response Time Integrity | 0.0/1 |
	| Questions Were Difficult | 0.0/2 |
	| Respondent Suspicious | 0/2 |
	| Phone Number Check | 0.0/1 |
	| Name Check | 0.0/1 |
	| Location Check | 0/1 |

	See Integrity Issues Deep Dive for more respondents.

	---

	### Representativity Action Suggestions
	| Scenario | Score | Δ vs Baseline |
	|----------|-------|--------------|
	| Baseline (0) | 0.795 | — |
	| Urgent cleansing (A) | 0.795 | ±0.000 |
	| +Low-urgency cleansing (B) | 0.757 | −0.038 |

	---

	### Enumerator Action Suggestions
	No enumerator bias detected.
	"""

	# ───────────────────── 3. traffic-light plot helper ────────────────────
	def traffic_plot(metric: str, scen: str):
	    """Draw a vertical traffic-light gauge for one score of one scenario.

	    The gauge spans 0..1 with a red band (0-0.6), a yellow band (0.6-0.8)
	    and a green band (0.8-1); a short black bar marks the score itself.

	    Args:
	        metric: Column name in ``scores`` (e.g. ``"consistency_score"``).
	        scen: Scenario key; must exist in ``scores`` and ``scenario_map``.

	    Returns:
	        The Matplotlib figure holding the gauge. The figure is left open;
	        the caller is responsible for closing it.

	    Raises:
	        KeyError: If ``scen`` or ``metric`` is not found in ``scores``, or
	            ``scen`` is not a key of ``scenario_map``.
	    """
	    y = scores.loc[scen, metric]
	    fig, ax = plt.subplots(figsize=(3, 6))

	    # coloured bands
	    ax.axhspan(0.00, 0.60, color="#ff4d4f", alpha=0.30)
	    ax.axhspan(0.60, 0.80, color="#ffe58f", alpha=0.30)
	    ax.axhspan(0.80, 1.00, color="#52c41a", alpha=0.30)

	    # black line marker, drawn across the middle 40 % of the axes width
	    ax.axhline(y, color="black", lw=3, xmin=0.30, xmax=0.70)

	    # axes styling: no x axis, ticks only at the band boundaries
	    ax.set_xlim(0, 1)
	    ax.set_ylim(0, 1)
	    ax.set_xticks([])
	    ax.set_yticks([0, .6, .8, 1])
	    ax.set_yticklabels(["0", ".6", ".8", "1"], fontsize=12)

	    # hide every spine, then bring back a thick left one as the scale
	    for s in ax.spines.values():
	        s.set_visible(False)
	    ax.spines["left"].set_visible(True)
	    ax.spines["left"].set_linewidth(2)

	    ax.set_title(
	        f"{metric.replace('_', ' ').title()}\n({scenario_map[scen]})",
	        fontsize=12,
	        weight="bold",
	        pad=6,
	    )
	    # Lay out this figure explicitly: plt.tight_layout() targets pyplot's
	    # global "current figure", which may not be `fig` when several
	    # requests are rendered concurrently.
	    fig.tight_layout()
	    return fig


	def make_images(scen):
	    """Render the three score gauges for ``scen`` as PIL images.

	    Args:
	        scen: Scenario key passed through to ``traffic_plot``.

	    Returns:
	        A list of three ``PIL.Image`` objects, in the order consistency,
	        representativity, integrity.
	    """
	    imgs = []
	    for met in (
	        "consistency_score",
	        "representativity_score",
	        "integrity_score",
	    ):
	        fig = traffic_plot(met, scen)
	        buf = io.BytesIO()
	        try:
	            fig.savefig(buf, format="png", bbox_inches="tight")
	        finally:
	            # Close this exact figure (not pyplot's "current" one) so every
	            # gauge is released even if saving fails, and no other
	            # request's figure is closed by mistake.
	            plt.close(fig)
	        buf.seek(0)
	        # Image.open reads lazily; the image keeps `buf` alive until used.
	        imgs.append(Image.open(buf))
	    return imgs


	# ───────────────────── 4. CSV-table simple filter ──────────────────────
	# Path of the CSV shown in the table viewer; a relative path is resolved
	# against the process's working directory.
	CSV_FILE = "issues_log.csv" # rename here if needed
	# Read a single time at import; filter_csv and the table widgets below all
	# work from this one in-memory frame.
	df_full = pd.read_csv(CSV_FILE) # load once

	def filter_csv(col, val):
	    """Return the rows of ``df_full`` whose ``col`` contains ``val``.

	    Matching is a case-insensitive, literal substring test against the
	    string form of each cell. ``val`` comes straight from a text box, so
	    it is deliberately NOT interpreted as a regular expression: characters
	    such as ``(``, ``.`` or ``+`` match themselves instead of raising
	    ``re.error`` or matching unintended rows.

	    Args:
	        col: Name of the column to search.
	        val: Text to look for; empty or ``None`` disables filtering.

	    Returns:
	        The matching rows, or the whole of ``df_full`` when either
	        ``col`` or ``val`` is empty.

	    Raises:
	        KeyError: If ``col`` is non-empty but not a column of ``df_full``.
	    """
	    if not (col and val):
	        return df_full
	    mask = df_full[col].astype(str).str.contains(
	        val, case=False, na=False, regex=False
	    )
	    return df_full[mask]

	# ───────────────────── 5. Gradio interface ─────────────────────────────
	with gr.Blocks(title="Data Quality Scenario Explorer") as demo:
	    gr.Markdown("## Data Quality Scenario Explorer")

	    # Choices are (label, value) pairs derived from scenario_map so the
	    # dropdown and the plot titles can never drift apart.
	    scen = gr.Dropdown(
	        label="Scenario",
	        choices=[(label, key) for key, label in scenario_map.items()],
	        value="0",
	    )

	    # plots side-by-side
	    with gr.Row():
	        im1, im2, im3 = gr.Image(), gr.Image(), gr.Image()

	    # Re-render on every selection change, and once on page load using
	    # whatever the dropdown currently holds (no second hard-coded default).
	    scen.change(make_images, scen, [im1, im2, im3])
	    demo.load(make_images, scen, [im1, im2, im3])

	    # summary text button: the Markdown starts hidden and is filled and
	    # revealed on click
	    summary_btn = gr.Button("Data Validation Summary")
	    summary_md = gr.Markdown(visible=False)
	    summary_btn.click(
	        lambda: gr.update(value=QUALITY_TEXT, visible=True),
	        inputs=None,
	        outputs=summary_md,
	    )

	    # ──────── table viewer with universal filter ─────────
	    gr.Markdown("### Table 1-2 Viewer")

	    columns = list(df_full.columns)
	    with gr.Row():
	        col_dd = gr.Dropdown(
	            label="Column",
	            choices=columns,
	            # An empty CSV has no columns; leave the dropdown unselected
	            # rather than failing with IndexError at start-up.
	            value=columns[0] if columns else None,
	        )
	        val_tb = gr.Textbox(label="Filter value (optional)")
	        apply_b = gr.Button("Apply Filter")
	        reset_b = gr.Button("Show All")

	    # Label the table with the file that was actually loaded.
	    table_df = gr.Dataframe(value=df_full, label=CSV_FILE)

	    apply_b.click(filter_csv, [col_dd, val_tb], table_df)
	    reset_b.click(lambda: df_full, None, table_df)

	# Start the web server only when run as a script, not when imported.
	if __name__ == "__main__":
	    demo.launch()