Spaces:
Sleeping
Sleeping
Commit ·
99cc145
1
Parent(s): a8443d7
Updating models
Browse files
- src/utils/data_processor.py +44 -0
src/utils/data_processor.py
CHANGED
|
@@ -1081,8 +1081,49 @@ class DataProcessor:
|
|
| 1081 |
if task_id and self.task_manager:
|
| 1082 |
self.task_manager.update_task(task_id, stage='completed', progress=100)
|
| 1083 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1084 |
return {
|
| 1085 |
'processed_data': df_processed,
|
|
|
|
|
|
|
| 1086 |
'absa_details': absa_results,
|
| 1087 |
'areas_of_improvement': areas_of_improvement,
|
| 1088 |
'strength_anchors': strength_anchors,
|
|
@@ -1092,6 +1133,9 @@ class DataProcessor:
|
|
| 1092 |
'micro_summaries': micro_summaries,
|
| 1093 |
'summary': {
|
| 1094 |
'total_reviews': len(df_processed),
|
|
|
|
|
|
|
|
|
|
| 1095 |
'languages_detected': list(set(detected_languages)),
|
| 1096 |
'intents_distribution': pd.Series([r['intent'] for r in intent_results]).value_counts().to_dict(),
|
| 1097 |
'sentiment_distribution': pd.Series(overall_sentiment).value_counts().to_dict(),
|
|
|
|
| 1081 |
if task_id and self.task_manager:
|
| 1082 |
self.task_manager.update_task(task_id, stage='completed', progress=100)
|
| 1083 |
|
| 1084 |
+
# ========== NEW: ASPECT-LEVEL DATA TRANSFORMATION ==========
|
| 1085 |
+
aspect_level_data = []
|
| 1086 |
+
mixed_sentiment_reviews = []
|
| 1087 |
+
|
| 1088 |
+
for idx, row in df_processed.iterrows():
|
| 1089 |
+
aspects = row['aspects'] if isinstance(row['aspects'], list) else []
|
| 1090 |
+
aspect_sentiments = row['aspect_sentiments'] if isinstance(row['aspect_sentiments'], list) else []
|
| 1091 |
+
|
| 1092 |
+
# Check for mixed sentiments (conflicting aspect sentiments)
|
| 1093 |
+
unique_sentiments = set(aspect_sentiments)
|
| 1094 |
+
is_mixed = ('Positive' in unique_sentiments and 'Negative' in unique_sentiments)
|
| 1095 |
+
|
| 1096 |
+
if is_mixed:
|
| 1097 |
+
mixed_sentiment_reviews.append({
|
| 1098 |
+
'review_id': row['id'],
|
| 1099 |
+
'review': row['review'],
|
| 1100 |
+
'aspects': aspects,
|
| 1101 |
+
'aspect_sentiments': aspect_sentiments,
|
| 1102 |
+
'intent': row['intent'],
|
| 1103 |
+
'date': row['date']
|
| 1104 |
+
})
|
| 1105 |
+
|
| 1106 |
+
# Create aspect-level records
|
| 1107 |
+
for aspect, sentiment in zip(aspects, aspect_sentiments):
|
| 1108 |
+
aspect_level_data.append({
|
| 1109 |
+
'review_id': row['id'],
|
| 1110 |
+
'review': row['review'],
|
| 1111 |
+
'aspect': aspect,
|
| 1112 |
+
'aspect_sentiment': sentiment,
|
| 1113 |
+
'overall_sentiment': row['overall_sentiment'],
|
| 1114 |
+
'intent': row['intent'],
|
| 1115 |
+
'intent_severity': row['intent_severity'],
|
| 1116 |
+
'date': row['date'],
|
| 1117 |
+
'language': row['detected_language']
|
| 1118 |
+
})
|
| 1119 |
+
|
| 1120 |
+
aspect_level_df = pd.DataFrame(aspect_level_data) if aspect_level_data else pd.DataFrame()
|
| 1121 |
+
mixed_sentiment_df = pd.DataFrame(mixed_sentiment_reviews) if mixed_sentiment_reviews else pd.DataFrame()
|
| 1122 |
+
|
| 1123 |
return {
|
| 1124 |
'processed_data': df_processed,
|
| 1125 |
+
'aspect_level_data': aspect_level_df, # NEW: Aspect-level granular data
|
| 1126 |
+
'mixed_sentiment_reviews': mixed_sentiment_df, # NEW: Mixed sentiment detection
|
| 1127 |
'absa_details': absa_results,
|
| 1128 |
'areas_of_improvement': areas_of_improvement,
|
| 1129 |
'strength_anchors': strength_anchors,
|
|
|
|
| 1133 |
'micro_summaries': micro_summaries,
|
| 1134 |
'summary': {
|
| 1135 |
'total_reviews': len(df_processed),
|
| 1136 |
+
'total_aspects': len(aspect_level_df),
|
| 1137 |
+
'mixed_sentiment_count': len(mixed_sentiment_df),
|
| 1138 |
+
'mixed_sentiment_pct': round(len(mixed_sentiment_df) / len(df_processed) * 100, 1) if len(df_processed) > 0 else 0,
|
| 1139 |
'languages_detected': list(set(detected_languages)),
|
| 1140 |
'intents_distribution': pd.Series([r['intent'] for r in intent_results]).value_counts().to_dict(),
|
| 1141 |
'sentiment_distribution': pd.Series(overall_sentiment).value_counts().to_dict(),
|