fortuala committed on
Commit
5106d31
·
verified ·
1 Parent(s): 5f9c86a

Update functions.py

Browse files
Files changed (1) hide show
  1. functions.py +203 -0
functions.py CHANGED
@@ -3097,3 +3097,206 @@ For a detailed view of each respondent's integrity issues, please refer to the '
3097
  """
3098
 
3099
  return final_report
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3097
  """
3098
 
3099
  return final_report
3100
+
3101
+
3102
+
3103
def representativity_issues_action(segmentation, table_3_1=None, table_3_2=None, table_3_3=None, table_3_4=None):
    """Build a natural-text report on representativity after urgent cleansing.

    Scenario/variant 'A' denotes the "urgent cleansing only" scenario
    throughout the tables.

    Parameters:
        segmentation: Segmentation flag from the caller; currently unused in
            this function but kept for interface compatibility.
        table_3_1 (pd.DataFrame, optional): Per-segment coverage with columns
            'Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage' and
            'raw_data_variant'.
        table_3_2 (pd.DataFrame, optional): Unused; kept for interface
            compatibility with callers.
        table_3_3 (pd.DataFrame, optional): Scenario-level scores with columns
            'scenario' and 'overall_representativity_score'.
        table_3_4 (pd.DataFrame, optional): Per-question representativity with
            columns 'question' and 'representativity_clean'.

    Returns:
        str: Newline-joined report text, or a fallback message when no data
        is available for the analysis.
    """
    report = []

    # Segment-level coverage check for the urgent-cleansing variant.
    if table_3_1 is not None and not table_3_1.empty:
        if 'Weighted_Avg_Coverage' in table_3_1.columns and 'raw_data_variant' in table_3_1.columns:
            low_coverage_segments = table_3_1[
                (table_3_1['Weighted_Avg_Coverage'] < 0.75) & (table_3_1['raw_data_variant'] == 'A')
            ][['Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage']].drop_duplicates()

            if not low_coverage_segments.empty:
                report.append("After urgent cleansing is applied, the following segments have coverage below 0.75:")
                for _, row in low_coverage_segments.iterrows():
                    report.append(
                        f"- {row['Segmentation_Column']} ({row['Segment']}) with coverage {row['Weighted_Avg_Coverage']:.2f}"
                    )
        else:
            report.append("table_3_1 is missing required columns: 'Weighted_Avg_Coverage' or 'raw_data_variant'.")

    # Overall representativity check for scenario 'A'.
    if table_3_3 is not None and not table_3_3.empty:
        # Fix: guard the 'scenario' column lookup — previously a table_3_3
        # without that column raised KeyError while every other table access
        # in this function was guarded. An empty slice falls through to the
        # "unavailable" message below.
        if 'scenario' in table_3_3.columns:
            scenario_a = table_3_3[table_3_3['scenario'] == 'A']
        else:
            scenario_a = table_3_3.iloc[0:0]

        if not scenario_a.empty and 'overall_representativity_score' in scenario_a.columns:
            overall_score = scenario_a['overall_representativity_score'].iloc[0]
            if overall_score < 0.80:
                # Below target: list the individual questions dragging the
                # score down, when that breakdown is available.
                if table_3_4 is not None and not table_3_4.empty and 'representativity_clean' in table_3_4.columns:
                    low_questions = table_3_4[
                        table_3_4['representativity_clean'] < 0.80
                    ]['question'].drop_duplicates()
                    if not low_questions.empty:
                        report.append("\nAdditionally, the following questions have representativity below 0.80:")
                        for question in low_questions:
                            report.append(f"- {question}")
                else:
                    report.append("\nQuestions representativity data is unavailable.")
            else:
                report.append(f"\nThe overall representativity score after urgent cleansing is {overall_score:.2f}.")
                report.append(
                    "The survey is able to assess the target confidence level of 90% with a margin of error of 5%."
                )
        else:
            report.append("\nThe data quality report for the urgent cleansing scenario is unavailable.")

    if not report:
        report.append("No data available for representativity analysis.")

    return "\n".join(report)
3150
+
3151
+
3152
+
3153
+ def enumerator_issue_action(table_4_1):
3154
+ """
3155
+ Analyzes enumerator issues and generates a natural text report.
3156
+
3157
+ Parameters:
3158
+ table_4_1 (pd.DataFrame): DataFrame containing columns 'enumerator_name_corrected',
3159
+ 'total_indices', and 'high_urgency_proportion'.
3160
+
3161
+ Returns:
3162
+ str: A natural text report with recommendations or a message indicating no bias found.
3163
+ """
3164
+ # Filter enumerators with more than 5 total_indices
3165
+ enumerators_with_issues = table_4_1[table_4_1['total_indices'] > 5]
3166
+
3167
+ if enumerators_with_issues.empty:
3168
+ return "No enumerator bias has been found."
3169
+
3170
+ # Calculate the average high_urgency_proportion for enumerators with >5 total_indices
3171
+ average_high_urgency = enumerators_with_issues['high_urgency_proportion'].mean()
3172
+
3173
+ # Identify enumerators with a high_urgency_proportion more than double the average
3174
+ problematic_enumerators = enumerators_with_issues[
3175
+ enumerators_with_issues['high_urgency_proportion'] > 2 * average_high_urgency
3176
+ ]
3177
+
3178
+ if problematic_enumerators.empty:
3179
+ return "No enumerator bias has been found."
3180
+
3181
+ # Generate the report for problematic enumerators
3182
+ report = [
3183
+ "After analyzing the number of urgent issues per enumerator name, we recommend a deep dive into an analysis of the responses provided by the following enumerators:"
3184
+ ]
3185
+
3186
+ for _, row in problematic_enumerators.iterrows():
3187
+ report.append(f"- {row['enumerator_name_corrected']} (Total Issues: {row['total_indices']}, High Urgency Proportion: {row['high_urgency_proportion']:.2f})")
3188
+
3189
+ report.append("\nWe recommend going to the tab 'Enumerator Bias Deep Dive' for further investigation.")
3190
+
3191
+ return "\n".join(report)
3192
+
3193
+
3194
def generate_data_quality_report(segmentation, table_1_1, table_2_1, table_2_3, table_3_1, table_3_2, table_3_3, table_3_4, table_4_1):
    """Assemble the full markdown data-quality report for scenario 'A'.

    Scenario 'A' is the "urgent cleansing only" scenario. The report combines
    the per-dimension action texts (consistency, integrity, representativity,
    enumerator bias) with a score-based overall summary.

    Parameters:
        segmentation: Segmentation flag ('yes'/'no') forwarded to the
            representativity analysis.
        table_1_1, table_2_1, table_2_3: Inputs to the consistency and
            integrity action helpers.
        table_3_1, table_3_2, table_3_3, table_3_4 (pd.DataFrame or None):
            Representativity tables; table_3_3 must contain a row with
            scenario 'A' and the four score columns read below.
        table_4_1 (pd.DataFrame): Enumerator-issue table.

    Returns:
        str: The assembled markdown report, stripped of surrounding whitespace.
    """
    # Gather action texts.
    # NOTE(review): `f` is assumed to be a module alias in scope that provides
    # consistency_issues_action / integrity_issues_action — confirm.
    consistency_action = f.consistency_issues_action(table_1_1, table_2_3)
    integrity_action = f.integrity_issues_action(table_2_1)
    # Bug fix: table_3_1 and table_3_2 were previously hard-coded to None
    # here, silently discarding the caller-supplied tables and always
    # skipping the segment-coverage section of the representativity report.
    representativity_action = representativity_issues_action(
        segmentation,
        table_3_1=table_3_1,
        table_3_2=table_3_2,
        table_3_3=table_3_3,
        table_3_4=table_3_4,
    )
    enumerator_action = enumerator_issue_action(table_4_1)

    # Analyze overall data quality for the scenario with only urgent cleansing.
    scenario_a = table_3_3[table_3_3['scenario'] == 'A'].iloc[0]
    consistency_score_a = scenario_a['consistency_score']
    representativity_score_a = scenario_a['overall_representativity_score']
    integrity_score_a = scenario_a['integrity_score']
    data_quality_score_a = scenario_a['data_quality_score']

    # Evaluate overall quality: strong (> 0.85 everywhere), satisfactory
    # (overall > 0.80, flag dimensions below 0.80), otherwise poor.
    if data_quality_score_a > 0.85 and all(score > 0.85 for score in [consistency_score_a, representativity_score_a, integrity_score_a]):
        quality_summary = (
            "The overall data quality of the dataset is very strong. All dimensions meet the desired thresholds, "
            "indicating the data is well-suited for analysis."
        )
    elif data_quality_score_a > 0.80:
        underperforming = [
            name for score, name in zip(
                [consistency_score_a, representativity_score_a, integrity_score_a],
                ['Consistency', 'Overall Representativity', 'Integrity']
            ) if score < 0.80
        ]
        quality_summary = (
            "The overall data quality score is satisfactory, but the following dimensions require further investigation: "
            + ", ".join(underperforming) + ". Please refer to the suggestions below for detailed actions."
        )
    else:
        quality_summary = (
            "The overall data quality score is below acceptable thresholds. Please take the suggested actions for the dimensions "
            "with underperforming scores (< 0.80) to improve data quality."
        )

    # Generate the full report.
    report = f"""
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : {consistency_score_a:.3f}
- **Overall Representativity Score** : {representativity_score_a:.3f}
- **Integrity Score** : {integrity_score_a:.3f}
- **Overall Data Quality Score** : {data_quality_score_a:.3f}

#### Summary
{quality_summary}

---

### Consistency Action Suggestions
{consistency_action}

---

### Integrity Action Suggestions
{integrity_action}

---

### Representativity Action Suggestions
{representativity_action}

---

### Enumerator Action Suggestions
{enumerator_action}
"""

    return report.strip()
3267
+
3268
+
3269
+
3270
# Build the report once. The two original branches were near-identical
# duplicates: only the segmentation flag and the segment-level tables
# (table_3_1 / table_3_2, passed as None without segmentation) differed,
# so the duplicated call sites are collapsed into one parameterized call.
use_segments = segmentation == 'yes'
report = generate_data_quality_report(
    segmentation='yes' if use_segments else 'no',
    table_1_1=table_1_1,
    table_2_1=table_2_1,
    table_2_3=table_2_3,
    table_3_1=table_3_1 if use_segments else None,
    table_3_2=table_3_2 if use_segments else None,
    table_3_3=table_3_3,
    table_3_4=table_3_4,
    table_4_1=table_4_1,
)
print(report)
3302
+