Update functions.py
Browse files- functions.py +203 -0
functions.py
CHANGED
|
@@ -3097,3 +3097,206 @@ For a detailed view of each respondent's integrity issues, please refer to the '
|
|
| 3097 |
"""
|
| 3098 |
|
| 3099 |
return final_report
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3097 |
"""
|
| 3098 |
|
| 3099 |
return final_report
|
| 3100 |
+
|
| 3101 |
+
|
| 3102 |
+
|
| 3103 |
+
def representativity_issues_action(segmentation, table_3_1=None, table_3_2=None, table_3_3=None, table_3_4=None):
    """Generate a natural-language report on representativity issues.

    Parameters:
        segmentation (str): Segmentation flag ('yes'/'no'); not used directly
            here but kept for interface consistency with the other actions.
        table_3_1 (pd.DataFrame, optional): Per-segment coverage table with
            'Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage' and
            'raw_data_variant' columns.
        table_3_2 (pd.DataFrame, optional): Accepted for interface symmetry;
            currently unused.
        table_3_3 (pd.DataFrame, optional): Scenario-level scores with
            'scenario' and 'overall_representativity_score' columns.
        table_3_4 (pd.DataFrame, optional): Per-question scores with
            'question' and 'representativity_clean' columns.

    Returns:
        str: Newline-joined report text, or a message that no data is available.
    """
    report = []

    # --- Segment coverage check (only meaningful when segmentation data exists) ---
    if table_3_1 is not None and not table_3_1.empty:
        if 'Weighted_Avg_Coverage' in table_3_1.columns and 'raw_data_variant' in table_3_1.columns:
            low_coverage_segments = table_3_1[
                (table_3_1['Weighted_Avg_Coverage'] < 0.75) & (table_3_1['raw_data_variant'] == 'A')
            ][['Segmentation_Column', 'Segment', 'Weighted_Avg_Coverage']].drop_duplicates()

            if not low_coverage_segments.empty:
                report.append("After urgent cleansing is applied, the following segments have coverage below 0.75:")
                for _, row in low_coverage_segments.iterrows():
                    report.append(
                        f"- {row['Segmentation_Column']} ({row['Segment']}) with coverage {row['Weighted_Avg_Coverage']:.2f}"
                    )
        else:
            report.append("table_3_1 is missing required columns: 'Weighted_Avg_Coverage' or 'raw_data_variant'.")

    # --- Overall representativity for scenario 'A' (urgent cleansing only) ---
    if table_3_3 is not None and not table_3_3.empty:
        # Guard against missing columns so a malformed table yields the
        # "unavailable" message instead of raising KeyError (same guarding
        # style already used for table_3_1 and table_3_4 above/below).
        if 'scenario' in table_3_3.columns and 'overall_representativity_score' in table_3_3.columns:
            scenario_a = table_3_3[table_3_3['scenario'] == 'A']
        else:
            scenario_a = table_3_3.iloc[0:0]  # empty frame -> falls through to the "unavailable" branch
        if not scenario_a.empty:
            overall_score = scenario_a['overall_representativity_score'].iloc[0]
            if overall_score < 0.80:
                # Drill down to the questions dragging the score below target.
                if (table_3_4 is not None and not table_3_4.empty
                        and 'representativity_clean' in table_3_4.columns
                        and 'question' in table_3_4.columns):
                    low_questions = table_3_4[
                        table_3_4['representativity_clean'] < 0.80
                    ]['question'].drop_duplicates()
                    if not low_questions.empty:
                        report.append("\nAdditionally, the following questions have representativity below 0.80:")
                        for question in low_questions:
                            report.append(f"- {question}")
                else:
                    report.append("\nQuestions representativity data is unavailable.")
            else:
                report.append(f"\nThe overall representativity score after urgent cleansing is {overall_score:.2f}.")
                report.append(
                    "The survey is able to assess the target confidence level of 90% with a margin of error of 5%."
                )
        else:
            report.append("\nThe data quality report for the urgent cleansing scenario is unavailable.")

    if not report:
        report.append("No data available for representativity analysis.")

    return "\n".join(report)
|
| 3150 |
+
|
| 3151 |
+
|
| 3152 |
+
|
| 3153 |
+
def enumerator_issue_action(table_4_1):
    """Build a plain-text report flagging enumerators with unusually many urgent issues.

    Parameters:
        table_4_1 (pd.DataFrame): DataFrame containing columns
            'enumerator_name_corrected', 'total_indices', and
            'high_urgency_proportion'.

    Returns:
        str: Recommendations for suspicious enumerators, or a message stating
        that no enumerator bias has been found.
    """
    no_bias_msg = "No enumerator bias has been found."

    # Only enumerators with a meaningful number of flagged indices are considered.
    candidates = table_4_1[table_4_1['total_indices'] > 5]
    if candidates.empty:
        return no_bias_msg

    # An enumerator is suspicious when its high-urgency share is more than
    # twice the average share among the candidates.
    threshold = 2 * candidates['high_urgency_proportion'].mean()
    flagged = candidates[candidates['high_urgency_proportion'] > threshold]
    if flagged.empty:
        return no_bias_msg

    lines = [
        "After analyzing the number of urgent issues per enumerator name, we recommend a deep dive into an analysis of the responses provided by the following enumerators:"
    ]
    lines.extend(
        f"- {rec['enumerator_name_corrected']} (Total Issues: {rec['total_indices']}, High Urgency Proportion: {rec['high_urgency_proportion']:.2f})"
        for _, rec in flagged.iterrows()
    )
    lines.append("\nWe recommend going to the tab 'Enumerator Bias Deep Dive' for further investigation.")
    return "\n".join(lines)
|
| 3192 |
+
|
| 3193 |
+
|
| 3194 |
+
def generate_data_quality_report(segmentation, table_1_1, table_2_1, table_2_3, table_3_1, table_3_2, table_3_3, table_3_4, table_4_1):
    """Assemble the full markdown data-quality report for scenario 'A' (urgent cleansing only).

    Parameters:
        segmentation (str): 'yes' or 'no' segmentation flag, forwarded to the
            representativity action.
        table_1_1, table_2_1, table_2_3 (pd.DataFrame): Inputs for the
            consistency and integrity actions.
        table_3_1, table_3_2, table_3_3, table_3_4 (pd.DataFrame or None):
            Representativity tables; table_3_3 must contain a scenario 'A' row
            with 'consistency_score', 'overall_representativity_score',
            'integrity_score' and 'data_quality_score' columns.
        table_4_1 (pd.DataFrame): Input for the enumerator bias action.

    Returns:
        str: The stripped markdown report.
    """
    # Gather the per-dimension action texts.
    # NOTE(review): `f.` presumably refers to this module imported under an
    # alias (e.g. `import functions as f`) — confirm `f` is in scope here.
    consistency_action = f.consistency_issues_action(table_1_1, table_2_3)
    integrity_action = f.integrity_issues_action(table_2_1)
    # BUGFIX: table_3_1 and table_3_2 were previously hard-coded to None here,
    # silently discarding the segment-coverage analysis even when the caller
    # supplied the tables; forward the actual arguments instead.
    representativity_action = representativity_issues_action(segmentation, table_3_1=table_3_1, table_3_2=table_3_2, table_3_3=table_3_3, table_3_4=table_3_4)
    enumerator_action = enumerator_issue_action(table_4_1)

    # Scores for the scenario with only urgent cleansing applied.
    # NOTE(review): raises IndexError if no scenario 'A' row exists — confirm
    # upstream always produces one.
    scenario_a = table_3_3[table_3_3['scenario'] == 'A'].iloc[0]
    consistency_score_a = scenario_a['consistency_score']
    representativity_score_a = scenario_a['overall_representativity_score']
    integrity_score_a = scenario_a['integrity_score']
    data_quality_score_a = scenario_a['data_quality_score']

    # Evaluate overall quality: "very strong" needs every score above 0.85;
    # "satisfactory" needs the aggregate above 0.80 (dimensions below 0.80 are
    # called out individually); anything else is below acceptable thresholds.
    if data_quality_score_a > 0.85 and all(score > 0.85 for score in [consistency_score_a, representativity_score_a, integrity_score_a]):
        quality_summary = (
            "The overall data quality of the dataset is very strong. All dimensions meet the desired thresholds, "
            "indicating the data is well-suited for analysis."
        )
    elif data_quality_score_a > 0.80:
        underperforming = [
            name for score, name in zip(
                [consistency_score_a, representativity_score_a, integrity_score_a],
                ['Consistency', 'Overall Representativity', 'Integrity']
            ) if score < 0.80
        ]
        quality_summary = (
            "The overall data quality score is satisfactory, but the following dimensions require further investigation: "
            + ", ".join(underperforming) + ". Please refer to the suggestions below for detailed actions."
        )
    else:
        quality_summary = (
            "The overall data quality score is below acceptable thresholds. Please take the suggested actions for the dimensions "
            "with underperforming scores (< 0.80) to improve data quality."
        )

    # Generate the full markdown report.
    report = f"""
### Overall Data Quality Analysis

After analyzing the data quality score breakdown for the scenario where only urgent cleansing has been applied, the following observations are made:

- **Consistency Score** : {consistency_score_a:.3f}
- **Overall Representativity Score** : {representativity_score_a:.3f}
- **Integrity Score** : {integrity_score_a:.3f}
- **Overall Data Quality Score** : {data_quality_score_a:.3f}

#### Summary
{quality_summary}

---

### Consistency Action Suggestions
{consistency_action}

---

### Integrity Action Suggestions
{integrity_action}

---

### Representativity Action Suggestions
{representativity_action}

---

### Enumerator Action Suggestions
{enumerator_action}
"""

    return report.strip()
|
| 3267 |
+
|
| 3268 |
+
|
| 3269 |
+
|
| 3270 |
+
# Build and print the data-quality report. Only the segmented run forwards the
# segment-level coverage tables (table_3_1 / table_3_2); the unsegmented run
# passes None for them. The conditional expressions below are lazily
# evaluated, so table_3_1/table_3_2 are never referenced when segmentation is
# not 'yes' (matching the original if/else branches).
is_segmented = segmentation == 'yes'

report = generate_data_quality_report(
    segmentation='yes' if is_segmented else 'no',
    table_1_1=table_1_1,
    table_2_1=table_2_1,
    table_2_3=table_2_3,
    table_3_1=table_3_1 if is_segmented else None,
    table_3_2=table_3_2 if is_segmented else None,
    table_3_3=table_3_3,
    table_3_4=table_3_4,
    table_4_1=table_4_1,
)

print(report)
|
| 3302 |
+
|