# Hugging Face Space file header (scrape residue, preserved as comments):
# fortuala — "Update app.py" — commit 363a986 (verified)
# raw / history / blame — 8.85 kB
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
import io
from PIL import Image
# --- Hard-coded scenario scores (as requested) ------------------------------
# One row per cleansing scenario; plot_dimension() looks rows up by the
# "scenario" column after set_index("scenario").
data = pd.DataFrame(
    {
        "scenario": ["0", "A", "B"],
        "consistency_score": [0.954451, 0.979592, 1.0],
        "overall_representativity_score": [0.79486, 0.79486, 0.75695],
        "integrity_score": [0.983921, 0.983921, 0.983921],
        "data_quality_score": [0.911077, 0.919457, 0.913624],
    }
)

# Human-readable labels for the scenario codes, used in plot titles.
scenario_map = {
    "0": "No cleansing",
    "A": "Urgent cleansing",
    "B": "Urgent+Low urgency cleansing",
}
# --- Traffic light plotting utility ---
def plot_dimension(dim_col, scenario):
    """Render one quality dimension as a horizontal traffic-light gauge.

    Looks up the score for *scenario* in the module-level ``data`` frame and
    draws it as a bold black line over red/yellow/green bands. Returns the
    matplotlib Figure; the caller is responsible for closing it.
    """
    score = data.set_index("scenario").loc[scenario, dim_col]
    fig, ax = plt.subplots(figsize=(5, 1.4))

    # Traffic-light background bands: (lower, upper, color, label).
    bands = (
        (0, 0.6, "#FF4D4F", 'Red (0-0.6)'),
        (0.6, 0.8, "#FFE58F", 'Yellow (0.6-0.8)'),
        (0.8, 1, "#52C41A", 'Green (0.8-1.0)'),
    )
    for lower, upper, color, label in bands:
        ax.axhspan(lower, upper, color=color, alpha=0.4, label=label)

    # Score marker plus its numeric readout just above the line.
    ax.axhline(score, color='black', lw=4, xmin=0.2, xmax=0.8)
    ax.text(0.5, score, f"{score:.3f}", ha='center', va='bottom',
            fontsize=13, color='black', weight='bold')

    # Cosmetics: no x axis, y ticks only at the band thresholds, no frame.
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set_xticks([])
    ax.set_yticks([0, 0.6, 0.8, 1])
    ax.set_yticklabels(["0.0", "0.6", "0.8", "1.0"])
    ax.set_title(f"{dim_col.replace('_', ' ').title()} ({scenario_map[scenario]})")
    ax.spines[['right', 'top', 'bottom', 'left']].set_visible(False)
    return fig
# --- Scenario selector callback ---
def show_plots(scenario):
    """Build the three traffic-light gauges for *scenario* as PIL images.

    Gradio Image components want image objects rather than matplotlib
    figures, so each figure is rendered to an in-memory PNG buffer and
    reopened with PIL, then the figure is closed to free memory.
    """
    dimensions = (
        "consistency_score",
        "overall_representativity_score",
        "integrity_score",
    )
    images = []
    for dim in dimensions:
        fig = plot_dimension(dim, scenario)
        buf = io.BytesIO()
        fig.savefig(buf, format='png', bbox_inches='tight')
        buf.seek(0)
        images.append(Image.open(buf))
        plt.close(fig)  # release the figure once its pixels are captured
    return images
# --- Button for quality text (scenario A only) ---
# Pre-rendered Markdown shown by show_analysis() when scenario "A" is
# selected. The scores quoted below are hardcoded to match the scenario "A"
# row of the `data` frame above — keep them in sync if `data` changes.
QUALITY_TEXT = """
### Overall Data Quality Analysis
After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
- **Consistency Score** : 0.980
- **Overall Representativity Score** : 0.795
- **Integrity Score** : 0.984
- **Overall Data Quality Score** : 0.919
#### Summary
The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
---
### Consistency Action Suggestions
*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
The following dimensions are evaluated for consistency:
- Completeness check: An answer was expected but not provided.
- Dist outlier check: A value outside the range of reasonable values.
- Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
- Model-based outlier: An inconsistent or extreme value compared to typical responses.
Question: 'enumerator_name' has 98 issues.
- The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.
Question: 'household_average_income_female_members' has 81 issues.
- The dimension(s) with the most issues: model based outlier with 41 issues.
- The second dimension with issues: completeness check with 40 issues.
Question: 'household_average_income' has 72 issues.
- The dimension(s) with the most issues: model based outlier with 39 issues.
- The second dimension with issues: completeness check with 33 issues.
Question: 'household_average_income_male_members' has 39 issues.
- The dimension with the most issues: completeness check with 39 issues.
Question: 'household_average_expenses_education' has 29 issues.
- The dimension(s) with the most issues: model based outlier with 23 issues.
- The second dimension with issues: completeness check with 6 issues.
Question: 'impact_contributions_other_factors' has 23 issues.
- The dimension with the most issues: completeness check with 23 issues.
Question: 'monthly_spend_on_healthcare' has 21 issues.
- The dimension with the most issues: completeness check with 21 issues.
For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.
---
### Integrity Action Suggestions
The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.
**Respondent with _index: 1**
The following checks scored below the maximum value:
Payment For Survey (score: 0/1)
Respondent Influenced (score: 0/1)
Response Time Integrity (score: 0.0/1)
Questions Which Were Difficult (score: 0.0/2)
Respondent Suspicious (score: 0/2)
Phone Number Check (score: 0.0/1)
Name Check (score: 0.0/1)
Location Check (score: 0/1)
The following checks are evaluated for integrity:
- **Payment for Survey:** Less integrity if the respondent was paid to do it.
- **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
- **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
- **Audio Verification:** More integrity if audio verification is in place.
- **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
- **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
- **Phone Number Check:** More integrity if a realistic phone number is provided.
- **Response Uniqueness:** More integrity if the response is truly unique.
- **Name Check:** More integrity if the name is realistic.
- **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
- **Enumerator Bias:** Less integrity if enumerator responses are biased.
- **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.
For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.
---
### Representativity Action Suggestions
Baseline (no cleansing) overall representativity score: 0.795
After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).
---
### Enumerator Action Suggestions
No enumerator bias has been found.
"""
# --- Table 1.2 loader & filter ---
def load_and_filter_table(col=None, val=None):
    """Load the issues log and optionally filter its rows.

    Parameters
    ----------
    col : str, optional
        Column name to filter on; ignored when empty/None.
    val : str, optional
        Substring to match, case-insensitive; ignored when empty/None.

    Returns
    -------
    pd.DataFrame
        The full table, or only the rows whose `col` value contains `val`.
        An unknown column name returns the unfiltered table instead of
        raising KeyError, so a typo in the free-text filter box in the UI
        cannot crash the callback.
    """
    df = pd.read_csv("issues_log.csv")  # the table_1_2 source file
    # Both inputs come from optional textboxes; only filter when both are
    # non-empty AND the column actually exists in the CSV.
    if col and val and col in df.columns:
        mask = df[col].astype(str).str.contains(str(val), case=False, na=False)
        df = df[mask]
    return df
# --- Gradio UI ---
with gr.Blocks() as demo:
    gr.Markdown("## Data Quality Scenario Explorer")

    # Scenario picker: the values ("0"/"A"/"B") are the codes used in `data`.
    with gr.Row():
        scenario = gr.Dropdown(
            choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
            value="0",
            label="Select Scenario",
        )

    # One traffic-light gauge per quality dimension, refreshed on selection.
    with gr.Row():
        out1 = gr.Image(label="Consistency Score Traffic Light")
        out2 = gr.Image(label="Overall Representativity Score Traffic Light")
        out3 = gr.Image(label="Integrity Score Traffic Light")
    scenario.change(show_plots, scenario, [out1, out2, out3])

    # Button for analysis (scenario A)
    with gr.Row():
        analysis_btn = gr.Button("Show Overall Data Quality Analysis (Scenario A Only)")
    analysis_text = gr.Markdown(visible=False)

    def show_analysis(selected_scenario):
        """Reveal the canned analysis Markdown — only meaningful for scenario A."""
        if selected_scenario == "A":
            return gr.update(value=QUALITY_TEXT, visible=True)
        return gr.update(value="Select scenario 'A' (Urgent cleansing) to view the analysis.", visible=True)

    analysis_btn.click(show_analysis, scenario, analysis_text)

    # Filterable deep-dive table; either textbox change re-runs the filter.
    with gr.Row():
        gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
    with gr.Row():
        filter_col = gr.Textbox(label="Column to Filter (optional)", value="")
        filter_val = gr.Textbox(label="Value to Filter (optional)", value="")
    table_out = gr.Dataframe(label="table_1_2.csv Filtered Results")
    for trigger in (filter_col, filter_val):
        trigger.change(load_and_filter_table, [filter_col, filter_val], table_out)

    # Populate the gauges for the default scenario when the page loads.
    demo.load(lambda: show_plots("0"), outputs=[out1, out2, out3])

if __name__ == "__main__":
    demo.launch()