Update functions.py
Browse files- functions.py +203 -0
functions.py
CHANGED
|
@@ -3097,3 +3097,206 @@ For a detailed view of each respondent's integrity issues, please refer to the '
|
|
| 3097 |
"""
|
| 3098 |
|
| 3099 |
return final_report
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3097 |
"""
|
| 3098 |
|
| 3099 |
return final_report
|
| 3100 |
+
|
| 3101 |
+
|
| 3102 |
+
|
| 3103 |
+
def representativity_issues_action(segmentation, table_3_1=None, table_3_2=None, table_3_3=None, table_3_4=None):
    """Generate a natural-language report on representativity issues.

    Parameters:
        segmentation (str): Segmentation flag ('yes'/'no'); not used directly
            here but kept for interface consistency with the other actions.
        table_3_1 (pd.DataFrame, optional): Per-segment coverage table with
            'Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage' and
            'raw_data_variant' columns.
        table_3_2 (pd.DataFrame, optional): Accepted for interface symmetry;
            currently unused.
        table_3_3 (pd.DataFrame, optional): Scenario-level scores with
            'scenario' and 'overall_representativity_score' columns.
        table_3_4 (pd.DataFrame, optional): Per-question scores with
            'question' and 'representativity_clean' columns.

    Returns:
        str: Newline-joined report text, or a message that no data is available.
    """
    report = []

    # --- Segment coverage check (only meaningful when segmentation data exists) ---
    if table_3_1 is not None and not table_3_1.empty:
        if 'Weighted_Avg_Coverage' in table_3_1.columns and 'raw_data_variant' in table_3_1.columns:
            low_coverage_segments = table_3_1[
                (table_3_1['Weighted_Avg_Coverage'] < 0.75) & (table_3_1['raw_data_variant'] == 'A')
            ][['Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage']].drop_duplicates()

            if not low_coverage_segments.empty:
                report.append("After urgent cleansing is applied, the following segments have coverage below 0.75:")
                for _, row in low_coverage_segments.iterrows():
                    report.append(
                        f"- {row['Segmentation_Column']} ({row['Segment']}) with coverage {row['Weighted_Avg_Coverage']:.2f}"
                    )
        else:
            report.append("table_3_1 is missing required columns: 'Weighted_Avg_Coverage' or 'raw_data_variant'.")

    # --- Overall representativity for scenario 'A' (urgent cleansing only) ---
    if table_3_3 is not None and not table_3_3.empty:
        # Guard against missing columns so a malformed table yields the
        # "unavailable" message instead of raising KeyError (same guarding
        # style already used for table_3_1 and table_3_4 above/below).
        if 'scenario' in table_3_3.columns and 'overall_representativity_score' in table_3_3.columns:
            scenario_a = table_3_3[table_3_3['scenario'] == 'A']
        else:
            scenario_a = table_3_3.iloc[0:0]  # empty frame -> falls through to the "unavailable" branch
        if not scenario_a.empty:
            overall_score = scenario_a['overall_representativity_score'].iloc[0]
            if overall_score < 0.80:
                # Drill down to the questions dragging the score below target.
                if (table_3_4 is not None and not table_3_4.empty
                        and 'representativity_clean' in table_3_4.columns
                        and 'question' in table_3_4.columns):
                    low_questions = table_3_4[
                        table_3_4['representativity_clean'] < 0.80
                    ]['question'].drop_duplicates()
                    if not low_questions.empty:
                        report.append("\nAdditionally, the following questions have representativity below 0.80:")
                        for question in low_questions:
                            report.append(f"- {question}")
                else:
                    report.append("\nQuestions representativity data is unavailable.")
            else:
                report.append(f"\nThe overall representativity score after urgent cleansing is {overall_score:.2f}.")
                report.append(
                    "The survey is able to assess the target confidence level of 90% with a margin of error of 5%."
                )
        else:
            report.append("\nThe data quality report for the urgent cleansing scenario is unavailable.")

    if not report:
        report.append("No data available for representativity analysis.")

    return "\n".join(report)
|
| 3150 |
+
|
| 3151 |
+
|
| 3152 |
+
|
| 3153 |
+
def enumerator_issue_action(table_4_1):
    """Build a plain-text report flagging enumerators with unusually many urgent issues.

    Parameters:
        table_4_1 (pd.DataFrame): DataFrame containing columns
            'enumerator_name_corrected', 'total_indices', and
            'high_urgency_proportion'.

    Returns:
        str: Recommendations for suspicious enumerators, or a message stating
        that no enumerator bias has been found.
    """
    no_bias_msg = "No enumerator bias has been found."

    # Only enumerators with a meaningful number of flagged indices are considered.
    candidates = table_4_1[table_4_1['total_indices'] > 5]
    if candidates.empty:
        return no_bias_msg

    # An enumerator is suspicious when its high-urgency share is more than
    # twice the average share among the candidates.
    threshold = 2 * candidates['high_urgency_proportion'].mean()
    flagged = candidates[candidates['high_urgency_proportion'] > threshold]
    if flagged.empty:
        return no_bias_msg

    lines = [
        "After analyzing the number of urgent issues per enumerator name, we recommend a deep dive into an analysis of the responses provided by the following enumerators:"
    ]
    lines.extend(
        f"- {rec['enumerator_name_corrected']} (Total Issues: {rec['total_indices']}, High Urgency Proportion: {rec['high_urgency_proportion']:.2f})"
        for _, rec in flagged.iterrows()
    )
    lines.append("\nWe recommend going to the tab 'Enumerator Bias Deep Dive' for further investigation.")
    return "\n".join(lines)
|
| 3192 |
+
|
| 3193 |
+
|
| 3194 |
+
def generate_data_quality_report(segmentation, table_1_1, table_2_1, table_2_3, table_3_1, table_3_2, table_3_3, table_3_4, table_4_1):
    """Assemble the full markdown data-quality report for scenario 'A' (urgent cleansing only).

    Parameters:
        segmentation (str): 'yes' or 'no' segmentation flag, forwarded to the
            representativity action.
        table_1_1, table_2_1, table_2_3 (pd.DataFrame): Inputs for the
            consistency and integrity actions.
        table_3_1, table_3_2, table_3_3, table_3_4 (pd.DataFrame or None):
            Representativity tables; table_3_3 must contain a scenario 'A' row
            with 'consistency_score', 'overall_representativity_score',
            'integrity_score' and 'data_quality_score' columns.
        table_4_1 (pd.DataFrame): Input for the enumerator bias action.

    Returns:
        str: The stripped markdown report.
    """
    # Gather the per-dimension action texts.
    # NOTE(review): `f.` presumably refers to this module imported under an
    # alias (e.g. `import functions as f`) — confirm `f` is in scope here.
    consistency_action = f.consistency_issues_action(table_1_1, table_2_3)
    integrity_action = f.integrity_issues_action(table_2_1)
    # BUGFIX: table_3_1 and table_3_2 were previously hard-coded to None here,
    # silently discarding the segment-coverage analysis even when the caller
    # supplied the tables; forward the actual arguments instead.
    representativity_action = representativity_issues_action(segmentation, table_3_1=table_3_1, table_3_2=table_3_2, table_3_3=table_3_3, table_3_4=table_3_4)
    enumerator_action = enumerator_issue_action(table_4_1)

    # Scores for the scenario with only urgent cleansing applied.
    # NOTE(review): raises IndexError if no scenario 'A' row exists — confirm
    # upstream always produces one.
    scenario_a = table_3_3[table_3_3['scenario'] == 'A'].iloc[0]
    consistency_score_a = scenario_a['consistency_score']
    representativity_score_a = scenario_a['overall_representativity_score']
    integrity_score_a = scenario_a['integrity_score']
    data_quality_score_a = scenario_a['data_quality_score']

    # Evaluate overall quality: "very strong" needs every score above 0.85;
    # "satisfactory" needs the aggregate above 0.80 (dimensions below 0.80 are
    # called out individually); anything else is below acceptable thresholds.
    if data_quality_score_a > 0.85 and all(score > 0.85 for score in [consistency_score_a, representativity_score_a, integrity_score_a]):
        quality_summary = (
            "The overall data quality of the dataset is very strong. All dimensions meet the desired thresholds, "
            "indicating the data is well-suited for analysis."
        )
    elif data_quality_score_a > 0.80:
        underperforming = [
            name for score, name in zip(
                [consistency_score_a, representativity_score_a, integrity_score_a],
                ['Consistency', 'Overall Representativity', 'Integrity']
            ) if score < 0.80
        ]
        quality_summary = (
            "The overall data quality score is satisfactory, but the following dimensions require further investigation: "
            + ", ".join(underperforming) + ". Please refer to the suggestions below for detailed actions."
        )
    else:
        quality_summary = (
            "The overall data quality score is below acceptable thresholds. Please take the suggested actions for the dimensions "
            "with underperforming scores (< 0.80) to improve data quality."
        )

    # Generate the full markdown report.
    report = f"""
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : {consistency_score_a:.3f}
- **Overall Representativity Score** : {representativity_score_a:.3f}
- **Integrity Score** : {integrity_score_a:.3f}
- **Overall Data Quality Score** : {data_quality_score_a:.3f}

#### Summary
{quality_summary}

---

### Consistency Action Suggestions
{consistency_action}

---

### Integrity Action Suggestions
{integrity_action}

---

### Representativity Action Suggestions
{representativity_action}

---

### Enumerator Action Suggestions
{enumerator_action}
"""

    return report.strip()
|
| 3267 |
+
|
| 3268 |
+
|
| 3269 |
+
|
| 3270 |
+
# Build and print the data-quality report. Only the segmented run forwards the
# segment-level coverage tables (table_3_1 / table_3_2); the unsegmented run
# passes None for them. The conditional expressions below are lazily
# evaluated, so table_3_1/table_3_2 are never referenced when segmentation is
# not 'yes' (matching the original if/else branches).
is_segmented = segmentation == 'yes'

report = generate_data_quality_report(
    segmentation='yes' if is_segmented else 'no',
    table_1_1=table_1_1,
    table_2_1=table_2_1,
    table_2_3=table_2_3,
    table_3_1=table_3_1 if is_segmented else None,
    table_3_2=table_3_2 if is_segmented else None,
    table_3_3=table_3_3,
    table_3_4=table_3_4,
    table_4_1=table_4_1,
)

print(report)
|
| 3302 |
+
|