Spaces:

leonardoimpact
/

Data_Validation_Process

Sleeping

App Files Files Community

fortuala committed on Aug 5, 2025

Commit

f71a2f6

verified ·

1 Parent(s): 31279dc

Update app.py

Browse files

Files changed (1) hide show

app.py +195 -80

app.py CHANGED Viewed

@@ -1,87 +1,202 @@
 import gradio as gr
 import pandas as pd
-import os
-import functions as f
-# Paths to the permanent files
-survey_path = 'Copy of AGT.MHVL.0A.202505.0001 4.xlsx'
-indicator_path = 'Indicators_indicators_Default view 18.xlsx'
-questions_path = 'Indicators_questions_Default View 18.xlsx'
-choice_path = 'Indicators_choices_Default View 17.xlsx'
-parameters_path = 'Indicators_surveys_Survey validation 1.xlsx'
-uuid = 'AGT.MHVL.0A.202505.0001'
-survey = pd.read_excel(survey_path)
-indicators = pd.read_excel(indicator_path)
-choices = pd.read_excel(choice_path)
-questions = pd.read_excel(questions_path)
-def run_validation():
-    # Pass all inputs to your function (update name/args as needed)
-    # parameters file
-    indicator_df, questions_df, choice_df, data_all, raw_data, column_strategy_df = f.load_dataframes(
-    indicator_path,
-    questions_path,
-     choice_path,
-     survey_path)
-    # consistency
-    table_1_1, table_1_2, table_1_3 = f.consistency_score_report(
-    raw_data=raw_data,
-    indicator_df=indicator_df,
-    questions_df=questions_df,
-    column_strategy_df=column_strategy_df,
-    data_all=data_all,
-    theme_list=theme_list
-)
-    # integrity
-    table_2_1, table_2_2, table_2_3,table_2_4,table_2_5 = f.integrity_report(raw_data, questions_df, column_strategy_df, survey_type,table_1_2)
-    # representativity
-    if segmentation == 'yes':
-        table_3_1, table_3_2, table_3_3, table_3_4 = f.representativity_report(segmentation, raw_data, table_2_4, segmentation_columns, mapping_segmentation_quotas,
-                            table_2_3, N, table_1_3)
-    else:
-        table_3_3, table_3_4 = f.representativity_report(segmentation, raw_data, table_2_4, segmentation_columns, mapping_segmentation_quotas,
-                            table_2_3, N, table_1_3)
-    # enumerator bias
-    if 'enumerator_name' in raw_data.columns:
-        table_4_1, table_4_2 = f.enumerator_urgent_issues_report(raw_data, table_2_5)
-    else:
-        table_4_1 = []
-        table_4_2 = []
-    report = f.generate_data_quality_report(
-        segmentation='no',
-        table_1_1=table_1_1,
-        table_2_1=table_2_1,
-        table_2_3=table_2_3,
-        table_3_1=None,
-        table_3_2=None,
-        table_3_3=table_3_3,
-        table_3_4=table_3_4,  # Replace with actual data
-        table_4_1=table_4_1   # Replace with actual data
-    )
-    print(report)
-with gr.Blocks() as app:
-    gr.Markdown("## Survey Validation App")
-    survey_file = gr.File(label="Upload your survey (Excel or CSV)")
-    uuid_box = gr.Textbox(label="UUID", value="AGT.MHVL.0A.202505.0001")
-    run_btn = gr.Button("Run Validation")
-    output = gr.Dataframe(label="Validation Output")
-    run_btn.click(
-        run_validation,
-        inputs=[survey_file, uuid_box],
-        outputs=[]
-    )
 if __name__ == "__main__":
-    app.launch()

 import gradio as gr
 import pandas as pd
+import matplotlib.pyplot as plt
+import io
+# --- Data (hardcoded as requested) ---
+# One row per cleansing scenario. plot_dimension() looks a score up by the
+# "scenario" key and the name of one of the score columns.
+data = pd.DataFrame({
+    "scenario": ["0", "A", "B"],
+    "consistency_score": [0.954451, 0.979592, 1.0],
+    "overall_representativity_score": [0.79486, 0.79486, 0.75695],
+    "integrity_score": [0.983921, 0.983921, 0.983921],
+    "data_quality_score": [0.911077, 0.919457, 0.913624]
+})
+# Display label for each scenario key; plot_dimension() puts it in the title.
+scenario_map = {"0": "No cleansing", "A": "Urgent cleansing", "B": "Urgent+Low urgency cleansing"}
+# --- Traffic light plotting utility ---
+def plot_dimension(dim_col, scenario):
+    """Draw one score as a marker on a red/yellow/green band chart.
+
+    Args:
+        dim_col: Name of a score column in the module-level ``data`` frame.
+        scenario: Scenario key; it must occur in ``data["scenario"]`` and in
+            ``scenario_map``.
+
+    Returns:
+        The Matplotlib figure. It is not closed here, so the caller owns it.
+    """
+    scores_by_scenario = data.set_index("scenario")
+    score = scores_by_scenario.loc[scenario, dim_col]
+    figure, axes = plt.subplots(figsize=(5, 1.4))
+
+    # Background bands as (lower, upper, colour, label), drawn bottom-up.
+    bands = (
+        (0, 0.6, "#FF4D4F", 'Red (0-0.6)'),
+        (0.6, 0.8, "#FFE58F", 'Yellow (0.6-0.8)'),
+        (0.8, 1, "#52C41A", 'Green (0.8-1.0)'),
+    )
+    for lower, upper, colour, band_label in bands:
+        axes.axhspan(lower, upper, color=colour, alpha=0.4, label=band_label)
+
+    # Horizontal bar at the score, with the rounded value printed above it.
+    axes.axhline(score, color='black', lw=4, xmin=0.2, xmax=0.8)
+    text_style = dict(ha='center', va='bottom', fontsize=13, color='black', weight='bold')
+    axes.text(0.5, score, f"{score:.3f}", **text_style)
+
+    # Fixed 0..1 axes, ticks only at the band boundaries, no frame.
+    axes.set_xlim(0, 1)
+    axes.set_ylim(0, 1)
+    axes.set_xticks([])
+    axes.set_yticks([0, 0.6, 0.8, 1])
+    axes.set_yticklabels(["0.0", "0.6", "0.8", "1.0"])
+    pretty_name = dim_col.replace('_', ' ').title()
+    axes.set_title(f"{pretty_name} ({scenario_map[scenario]})")
+    for side in ('right', 'top', 'bottom', 'left'):
+        axes.spines[side].set_visible(False)
+    return figure
+# --- Scenario selector callback ---
+def show_plots(scenario):
+    """Render the three score charts for ``scenario`` as image arrays.
+
+    A ``gr.Image`` output takes a NumPy array, a PIL image or a file path;
+    raw PNG bytes are rejected. Each figure is therefore written to an
+    in-memory PNG (which keeps the tight bounding box) and decoded back into
+    a uint8 pixel array.
+
+    Args:
+        scenario: Scenario key understood by ``plot_dimension``.
+
+    Returns:
+        A list of three image arrays, in the order consistency, overall
+        representativity, integrity.
+    """
+    score_columns = (
+        "consistency_score",
+        "overall_representativity_score",
+        "integrity_score",
+    )
+    img_list = []
+    for column in score_columns:
+        fig = plot_dimension(column, scenario)
+        try:
+            buf = io.BytesIO()
+            fig.savefig(buf, format='png', bbox_inches='tight')
+            buf.seek(0)
+            # imread returns floats in [0, 1] for PNG data; scale to 8-bit.
+            pixels = plt.imread(buf, format='png')
+            img_list.append((pixels * 255).round().astype("uint8"))
+        finally:
+            # Close even if rendering fails so figures do not accumulate.
+            plt.close(fig)
+    return img_list
+# --- Analysis text shown by the analysis button (scenario A only) ---
+QUALITY_TEXT = """
+### Overall Data Quality Analysis
+After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:
+- **Consistency Score**              : 0.980
+- **Overall Representativity Score** : 0.795
+- **Integrity Score**                : 0.984
+- **Overall Data Quality Score**     : 0.919
+#### Summary
+The overall data quality score is satisfactory, but the following dimensions require further investigation: Overall Representativity. Please refer to the suggestions below for detailed actions.
+---
+### Consistency Action Suggestions
+*Our analysis identified several questions where consistency issues need to be addressed, as detailed below. The following questions require attention:*
+The following dimensions are evaluated for consistency:
+- Completeness check: An answer was expected but not provided.
+- Dist outlier check: A value outside the range of reasonable values.
+- Free-text check (more than 3 characters but less than two words): Ensures minimal content for free-text responses.
+- Model-based outlier: An inconsistent or extreme value compared to typical responses.
+Question: 'enumerator_name' has 98 issues.
+  - The dimension with the most issues: free-text check (more than 3 characters but less than two words) with 98 issues.
+Question: 'household_average_income_female_members' has 81 issues.
+  - The dimension(s) with the most issues: model based outlier with 41 issues.
+  - The second dimension with issues: completeness check with 40 issues.
+Question: 'household_average_income' has 72 issues.
+  - The dimension(s) with the most issues: model based outlier with 39 issues.
+  - The second dimension with issues: completeness check with 33 issues.
+Question: 'household_average_income_male_members' has 39 issues.
+  - The dimension with the most issues: completeness check with 39 issues.
+Question: 'household_average_expenses_education' has 29 issues.
+  - The dimension(s) with the most issues: model based outlier with 23 issues.
+  - The second dimension with issues: completeness check with 6 issues.
+Question: 'impact_contributions_other_factors' has 23 issues.
+  - The dimension with the most issues: completeness check with 23 issues.
+Question: 'monthly_spend_on_healthcare' has 21 issues.
+  - The dimension with the most issues: completeness check with 21 issues.
+For a detailed view of each question's consistency issues, please refer to the 'Data Consistency Issues Deep Dive' tab.
+---
+### Integrity Action Suggestions
+The following respondents exhibit low integrity scores, and we recommend taking a closer look at them.
+**Respondent with _index: 1**
+ The following checks scored below the maximum value:
+Payment For Survey (score: 0/1)
+Respondent Influenced (score: 0/1)
+Response Time Integrity (score: 0.0/1)
+Questions Which Were Difficult (score: 0.0/2)
+Respondent Suspicious (score: 0/2)
+Phone Number Check (score: 0.0/1)
+Name Check (score: 0.0/1)
+Location Check (score: 0/1)
+ The following checks are evaluated for integrity:
+- **Payment for Survey:** Less integrity if the respondent was paid to do it.
+- **Respondent Influenced:** Less integrity score if the respondent seemed influenced.
+- **Response Time Integrity:** Less integrity if the respondent took too long or too short to respond.
+- **Audio Verification:** More integrity if audio verification is in place.
+- **Questions Were Difficult:** Less integrity if more questions were hard to respond to.
+- **Respondent Suspicious:** Less integrity the more suspicious the respondent is.
+- **Phone Number Check:** More integrity if a realistic phone number is provided.
+- **Response Uniqueness:** More integrity if the response is truly unique.
+- **Name Check:** More integrity if the name is realistic.
+- **Impact Feedback Integrity:** More integrity if relevant and well-articulated feedback is provided.
+- **Enumerator Bias:** Less integrity if enumerator responses are biased.
+- **Location Check:** Less integrity if responses' locations are too close to each other in certain contexts.
+For a detailed view of each respondent's integrity issues, please refer to the 'Integrity Issues Deep Dive' tab.
+---
+### Representativity Action Suggestions
+Baseline (no cleansing) overall representativity score: 0.795
+After high urgency cleansing (Scenario A), the score is 0.795 (remained the same, Δ = 0.000).
+After low urgency cleansing (Scenario B), the score is 0.757 (declined, Δ = -0.038).
+---
+### Enumerator Action Suggestions
+No enumerator bias has been found.
+"""
+# --- Table 1.2 loader & filter ---
+def load_and_filter_table(col=None, val=None):
+    """Load ``table_1_2.csv`` and optionally keep the rows matching a filter.
+
+    The function is wired to textbox ``change`` events, so it is called with
+    half-typed input. An unknown column name therefore returns the unfiltered
+    table instead of raising ``KeyError``, and the value is matched as literal
+    text so characters such as ``(`` or ``[`` cannot raise a regex error.
+
+    Args:
+        col: Name of the column to filter on; an empty value disables filtering.
+        val: Text to search for, as a case-insensitive substring; an empty
+            value disables filtering.
+
+    Returns:
+        The filtered DataFrame, or the whole table when the filter is
+        incomplete or ``col`` is not a column of the table.
+
+    Raises:
+        FileNotFoundError: If ``table_1_2.csv`` is not in the working directory.
+    """
+    df = pd.read_csv("table_1_2.csv")  # Your table_1_2 file
+    if not col or not val or col not in df.columns:
+        return df
+    matches = df[col].astype(str).str.contains(
+        str(val), case=False, na=False, regex=False
+    )
+    return df[matches]
+# --- Gradio UI ---
+with gr.Blocks() as demo:
+    gr.Markdown("## Data Quality Scenario Explorer")
+    # Scenario picker: each choice is a (label, value) pair, and the callbacks
+    # below work with the value ("0", "A" or "B").
+    with gr.Row():
+        scenario = gr.Dropdown(
+            choices=[("No cleansing", "0"), ("Urgent cleansing", "A"), ("Urgent+Low urgency cleansing", "B")],
+            value="0",
+            label="Select Scenario"
+        )
+    # One image per score dimension, refreshed whenever the scenario changes.
+    with gr.Row():
+        out1 = gr.Image(label="Consistency Score Traffic Light")
+        out2 = gr.Image(label="Overall Representativity Score Traffic Light")
+        out3 = gr.Image(label="Integrity Score Traffic Light")
+    scenario.change(show_plots, scenario, [out1, out2, out3])
+    # Button for analysis (scenario A)
+    with gr.Row():
+        analysis_btn = gr.Button("Show Overall Data Quality Analysis (Scenario A Only)")
+        # Hidden until the button is clicked; show_analysis makes it visible.
+        analysis_text = gr.Markdown(visible=False)
+    def show_analysis(selected_scenario):
+        """Return a Markdown update: the report for "A", otherwise a hint."""
+        if selected_scenario == "A":
+            return gr.update(value=QUALITY_TEXT, visible=True)
+        else:
+            return gr.update(value="Select scenario 'A' (Urgent cleansing) to view the analysis.", visible=True)
+    analysis_btn.click(show_analysis, scenario, analysis_text)
+    # Table with filter
+    with gr.Row():
+        gr.Markdown("### Data Consistency Issues Deep Dive (Table 1.2)")
+    with gr.Row():
+        filter_col = gr.Textbox(label="Column to Filter (optional)", value="")
+        filter_val = gr.Textbox(label="Value to Filter (optional)", value="")
+        table_out = gr.Dataframe(label="table_1_2.csv Filtered Results")
+    # Either textbox changing reloads the table using both current values.
+    # NOTE(review): nothing fills table_out on page load, so the table looks
+    # empty until one of the boxes is edited - confirm this is intended.
+    filter_col.change(lambda col, val: load_and_filter_table(col, val), [filter_col, filter_val], table_out)
+    filter_val.change(lambda col, val: load_and_filter_table(col, val), [filter_col, filter_val], table_out)
+    # Default show plots for initial scenario
+    demo.load(lambda: show_plots("0"), outputs=[out1, out2, out3])
 if __name__ == "__main__":
+    demo.launch()