Spaces:

minhvtt
/

Aus_F

Sleeping

App Files Files Community

minhvtt committed on Nov 24, 2025

Commit

ea06065

verified ·

1 Parent(s): c221a89

Upload 19 files

Browse files

Files changed (9) hide show

config.py +1 -3
database.py +8 -8
models/event_models.py +23 -10
models/segmentation_models.py +23 -11
models/sentiment_models.py +23 -10
scripts/create_indexes.py +47 -22
services/data_aggregation.py +60 -59
services/genai_service.py +21 -7
services/sentiment_service.py +66 -18

config.py CHANGED Viewed

@@ -16,12 +16,10 @@ class Settings(BaseSettings):
     # Hugging Face Token (optional)
     HF_TOKEN: str = os.getenv("HF_TOKEN", "")
-    # Collection Names
     COLLECTION_USERS: str = "User"
     COLLECTION_PAYMENTS: str = "Payment"
     COLLECTION_EVENT_VERSIONS: str = "EventVersion"
-    COLLECTION_USER_FOLLOWS: str = "UserFollow"
-    COLLECTION_USER_COMMENT_POST: str = "UserCommentPost"
     COLLECTION_POST_SOCIAL_MEDIA: str = "PostSocialMedia"
     # AI Result Collections

     # Hugging Face Token (optional)
     HF_TOKEN: str = os.getenv("HF_TOKEN", "")
+    # Collection Names (ACTUAL MongoDB collections)
     COLLECTION_USERS: str = "User"
     COLLECTION_PAYMENTS: str = "Payment"
     COLLECTION_EVENT_VERSIONS: str = "EventVersion"
     COLLECTION_POST_SOCIAL_MEDIA: str = "PostSocialMedia"
     # AI Result Collections

database.py CHANGED Viewed

@@ -53,28 +53,28 @@ class DatabaseManager:
             self._client.close()
             print("✓ MongoDB connection closed")
-    # Collection accessors
     @property
     def users(self) -> Collection:
         return self.get_collection(settings.COLLECTION_USERS)
     @property
     def payments(self) -> Collection:
         return self.get_collection(settings.COLLECTION_PAYMENTS)
     @property
     def event_versions(self) -> Collection:
         return self.get_collection(settings.COLLECTION_EVENT_VERSIONS)
     @property
-    def user_follows(self) -> Collection:
-        return self.get_collection(settings.COLLECTION_USER_FOLLOWS)
-    @property
-    def user_comment_post(self) -> Collection:
-        return self.get_collection(settings.COLLECTION_USER_COMMENT_POST)
-    # AI Result Collections (DEPRECATED - use event-centric versions)
     @property
     def audience_segments(self) -> Collection:
         """AudienceSegment collection (DEPRECATED - use event_audience_segments)"""

             self._client.close()
             print("✓ MongoDB connection closed")
+    # ACTUAL Collections (matching models.txt)
     @property
     def users(self) -> Collection:
+        """User collection (contains embedded UserFollows array)"""
         return self.get_collection(settings.COLLECTION_USERS)
     @property
     def payments(self) -> Collection:
+        """Payment collection"""
         return self.get_collection(settings.COLLECTION_PAYMENTS)
     @property
     def event_versions(self) -> Collection:
+        """EventVersion collection"""
         return self.get_collection(settings.COLLECTION_EVENT_VERSIONS)
     @property
+    def post_social_media(self) -> Collection:
+        """PostSocialMedia collection (contains nested Images.UserCommentPosts)"""
+        return self.get_collection(settings.COLLECTION_POST_SOCIAL_MEDIA)
+    # AI Result Collections
     @property
     def audience_segments(self) -> Collection:
         """AudienceSegment collection (DEPRECATED - use event_audience_segments)"""

models/event_models.py CHANGED Viewed

@@ -12,20 +12,33 @@ from bson import ObjectId
 class PyObjectId(ObjectId):
-    """Custom ObjectId type for Pydantic"""
-    @classmethod
-    def __get_validators__(cls):
-        yield cls.validate
     @classmethod
-    def validate(cls, v):
-        if not ObjectId.is_valid(v):
-            raise ValueError("Invalid ObjectId")
-        return ObjectId(v)
     @classmethod
-    def __modify_schema__(cls, field_schema):
-        field_schema.update(type="string")
 class MarketingContent(BaseModel):

 class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic v2"""
     @classmethod
+    def __get_pydantic_core_schema__(cls, source_type, handler):
+        from pydantic_core import core_schema
+        return core_schema.union_schema([
+            core_schema.is_instance_schema(ObjectId),
+            core_schema.chain_schema([
+                core_schema.str_schema(),
+                core_schema.no_info_plain_validator_function(cls.validate),
+            ])
+        ],
+        serialization=core_schema.plain_serializer_function_ser_schema(
+            lambda x: str(x)
+        ))
     @classmethod
+    def validate(cls, v):
+        if isinstance(v, ObjectId):
+            return v
+        if isinstance(v, str):
+            if not ObjectId.is_valid(v):
+                raise ValueError(f"Invalid ObjectId: {v}")
+            return ObjectId(v)
+        raise ValueError(f"Expected ObjectId or string, got {type(v)}")
 class MarketingContent(BaseModel):

models/segmentation_models.py CHANGED Viewed

@@ -12,20 +12,32 @@ from bson import ObjectId
 class PyObjectId(ObjectId):
-    """Custom ObjectId type for Pydantic"""
     @classmethod
-    def __get_validators__(cls):
-        yield cls.validate
     @classmethod
     def validate(cls, v):
-        if not ObjectId.is_valid(v):
-            raise ValueError("Invalid ObjectId")
-        return ObjectId(v)
-    @classmethod
-    def __modify_schema__(cls, field_schema):
-        field_schema.update(type="string")
 class AudienceSegment(BaseModel):

 class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic v2"""
     @classmethod
+    def __get_pydantic_core_schema__(cls, source_type, handler):
+        from pydantic_core import core_schema
+        return core_schema.union_schema([
+            core_schema.is_instance_schema(ObjectId),
+            core_schema.chain_schema([
+                core_schema.str_schema(),
+                core_schema.no_info_plain_validator_function(cls.validate),
+            ])
+        ],
+        serialization=core_schema.plain_serializer_function_ser_schema(
+            lambda x: str(x)
+        ))
     @classmethod
     def validate(cls, v):
+        if isinstance(v, ObjectId):
+            return v
+        if isinstance(v, str):
+            if not ObjectId.is_valid(v):
+                raise ValueError(f"Invalid ObjectId: {v}")
+            return ObjectId(v)
+        raise ValueError(f"Expected ObjectId or string, got {type(v)}")
 class AudienceSegment(BaseModel):

models/sentiment_models.py CHANGED Viewed

@@ -12,20 +12,33 @@ from bson import ObjectId
 class PyObjectId(ObjectId):
-    """Custom ObjectId type for Pydantic"""
-    @classmethod
-    def __get_validators__(cls):
-        yield cls.validate
     @classmethod
-    def validate(cls, v):
-        if not ObjectId.is_valid(v):
-            raise ValueError("Invalid ObjectId")
-        return ObjectId(v)
     @classmethod
-    def __modify_schema__(cls, field_schema):
-        field_schema.update(type="string")
 class SentimentAnalysisResult(BaseModel):

 class PyObjectId(ObjectId):
+    """Custom ObjectId type for Pydantic v2"""
     @classmethod
+    def __get_pydantic_core_schema__(cls, source_type, handler):
+        from pydantic_core import core_schema
+        return core_schema.union_schema([
+            core_schema.is_instance_schema(ObjectId),
+            core_schema.chain_schema([
+                core_schema.str_schema(),
+                core_schema.no_info_plain_validator_function(cls.validate),
+            ])
+        ],
+        serialization=core_schema.plain_serializer_function_ser_schema(
+            lambda x: str(x)
+        ))
     @classmethod
+    def validate(cls, v):
+        if isinstance(v, ObjectId):
+            return v
+        if isinstance(v, str):
+            if not ObjectId.is_valid(v):
+                raise ValueError(f"Invalid ObjectId: {v}")
+            return ObjectId(v)
+        raise ValueError(f"Expected ObjectId or string, got {type(v)}")
 class SentimentAnalysisResult(BaseModel):

scripts/create_indexes.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
 MongoDB Index Creation Script
 Author: AI Generated
-Created: 2025-11-24
-Purpose: Create performance indexes for event-centric queries
 """
 from database import db
@@ -12,11 +12,11 @@ from config import settings
 def create_all_indexes():
     """
     Create all necessary indexes for optimal performance.
-    Run this once during deployment or when setting up a new environment.
     """
     print("=" * 60)
-    print("🔧 Creating MongoDB Indexes")
     print("=" * 60)
     # Payment Collection Indexes
@@ -24,37 +24,55 @@ def create_all_indexes():
     # Index for event-specific ticket purchases
     db.payments.create_index(
-        [("EventCode", 1), ("Status", 1), ("UserId", 1)],
         name="idx_payment_event_status_user"
     )
     print("  ✓ Created: idx_payment_event_status_user")
     # Index for user RFM calculation
     db.payments.create_index(
-        [("UserId", 1), ("TransactionDate", -1)],
         name="idx_payment_user_date"
     )
     print("  ✓ Created: idx_payment_user_date")
-    # UserFollow Collection Indexes
-    print("\n👥 UserFollow Collection:")
-    # Index for event followers
-    db.user_follows.create_index(
-        [("EventCode", 1), ("userId", 1)],
-        name="idx_follow_event_user"
     )
-    print("  ✓ Created: idx_follow_event_user")
-    # UserCommentPost Collection Indexes
-    print("\n💬 UserCommentPost Collection:")
-    # Index for event comments
-    db.user_comment_post.create_index(
-        [("EventCode", 1), ("CreatedDate", -1)],
-        name="idx_comment_event_date"
     )
-    print("  ✓ Created: idx_comment_event_date")
     # EventAudienceSegment Collection Indexes
     print("\n🎯 EventAudienceSegment Collection:")
@@ -93,6 +111,13 @@ def create_all_indexes():
     )
     print("  ✓ Created: idx_sentiment_result_event_date")
     print("\n" + "=" * 60)
     print("✅ All Indexes Created Successfully!")
     print("=" * 60)
@@ -100,8 +125,8 @@ def create_all_indexes():
     # List all indexes for verification
     print("\n📋 Index Summary:")
     print(f"  Payment: {len(list(db.payments.list_indexes()))} indexes")
-    print(f"  UserFollow: {len(list(db.user_follows.list_indexes()))} indexes")
-    print(f"  UserCommentPost: {len(list(db.user_comment_post.list_indexes()))} indexes")
     print(f"  EventAudienceSegment: {len(list(db.event_audience_segments.list_indexes()))} indexes")
     print(f"  EventSentimentSummary: {len(list(db.event_sentiment_summary.list_indexes()))} indexes")
     print(f"  SentimentAnalysisResult: {len(list(db.sentiment_results.list_indexes()))} indexes")

 """
 MongoDB Index Creation Script
 Author: AI Generated
+Created: 2025-11-24 (Fixed for actual schema)
+Purpose: Create performance indexes matching actual MongoDB structure
 """
 from database import db
 def create_all_indexes():
     """
     Create all necessary indexes for optimal performance.
+    Based on ACTUAL MongoDB structure from models.txt
     """
     print("=" * 60)
+    print("🔧 Creating MongoDB Indexes (Corrected Schema)")
     print("=" * 60)
     # Payment Collection Indexes
     # Index for event-specific ticket purchases
     db.payments.create_index(
+        [("eventCode", 1), ("status", 1), ("userId", 1)],
         name="idx_payment_event_status_user"
     )
     print("  ✓ Created: idx_payment_event_status_user")
     # Index for user RFM calculation
     db.payments.create_index(
+        [("userId", 1), ("transactionDate", -1)],
         name="idx_payment_user_date"
     )
     print("  ✓ Created: idx_payment_user_date")
+    # User Collection Indexes (UserFollows is EMBEDDED)
+    print("\n👥 User Collection:")
+    # Index for finding users who follow a specific event
+    # UserFollows is an embedded array, so we use dot notation
+    db.users.create_index(
+        [("UserFollows.eventCode", 1)],
+        name="idx_user_follows_event",
+        sparse=True  # Skip documents without UserFollows
     )
+    print("  ✓ Created: idx_user_follows_event (embedded array)")
+    # Index for user status (to filter Active users)
+    db.users.create_index(
+        [("status", 1)],
+        name="idx_user_status"
+    )
+    print("  ✓ Created: idx_user_status")
+    # PostSocialMedia Collection Indexes
+    print("\n💬 PostSocialMedia Collection:")
+    # Index for event-specific posts
+    db.post_social_media.create_index(
+        [("eventCode", 1), ("createdAt", -1)],
+        name="idx_post_event_date"
+    )
+    print("  ✓ Created: idx_post_event_date")
+    # Index for searching comments (nested in Images.UserCommentPosts)
+    # Dot notation reaches into the nested arrays (a multikey index, not a `$**` wildcard index)
+    db.post_social_media.create_index(
+        [("eventCode", 1), ("images.userCommentPosts.commentedAt", -1)],
+        name="idx_post_comments",
+        sparse=True
     )
+    print("  ✓ Created: idx_post_comments (nested array)")
     # EventAudienceSegment Collection Indexes
     print("\n🎯 EventAudienceSegment Collection:")
     )
     print("  ✓ Created: idx_sentiment_result_event_date")
+    # Index for sentiment label filtering
+    db.sentiment_results.create_index(
+        [("event_code", 1), ("sentiment_label", 1)],
+        name="idx_sentiment_event_label"
+    )
+    print("  ✓ Created: idx_sentiment_event_label")
     print("\n" + "=" * 60)
     print("✅ All Indexes Created Successfully!")
     print("=" * 60)
     # List all indexes for verification
     print("\n📋 Index Summary:")
     print(f"  Payment: {len(list(db.payments.list_indexes()))} indexes")
+    print(f"  User: {len(list(db.users.list_indexes()))} indexes")
+    print(f"  PostSocialMedia: {len(list(db.post_social_media.list_indexes()))} indexes")
     print(f"  EventAudienceSegment: {len(list(db.event_audience_segments.list_indexes()))} indexes")
     print(f"  EventSentimentSummary: {len(list(db.event_sentiment_summary.list_indexes()))} indexes")
     print(f"  SentimentAnalysisResult: {len(list(db.sentiment_results.list_indexes()))} indexes")

services/data_aggregation.py CHANGED Viewed

@@ -1,12 +1,13 @@
 """
 Data Aggregation Pipeline for Event-Centric User Segmentation
 Author: AI Generated
-Created: 2025-11-24 (Refactored for event-centric analysis)
-Purpose: Aggregate user features for a specific event using MongoDB pipelines
 """
 from typing import List, Dict
 from datetime import datetime
 from database import db
 from config import settings
@@ -14,7 +15,9 @@ from config import settings
 class UserDataAggregator:
     """
     Aggregates user behavioral data for segmentation per event.
-    Uses MongoDB Aggregation Framework to minimize data transfer.
     """
     def __init__(self, event_code: str):
@@ -22,7 +25,7 @@ class UserDataAggregator:
         Initialize aggregator for a specific event.
         Args:
-            event_code: Event identifier to filter users
         """
         self.event_code = event_code
         self.db = db
@@ -31,23 +34,23 @@ class UserDataAggregator:
         """
         Aggregate user features for the specified event.
-        Returns users who:
-        1. Bought tickets for this event
-        2. Follow this event
-        3. Commented on this event
-        Returns: List of user feature vectors with event-specific context
         """
         pipeline = [
-            # Stage 1: Start with users who interacted with THIS event
             {
                 "$match": {
-                    "Status": "Active"
                 }
             },
-            # Stage 2: Lookup tickets bought for THIS EVENT
             {
                 "$lookup": {
                     "from": settings.COLLECTION_PAYMENTS,
@@ -57,9 +60,9 @@ class UserDataAggregator:
                             "$match": {
                                 "$expr": {
                                     "$and": [
-                                        {"$eq": ["$UserId", "$$user_id"]},
-                                        {"$eq": ["$EventCode", self.event_code]},
-                                        {"$eq": ["$Status", "Completed"]}
                                     ]
                                 }
                             }
@@ -69,10 +72,34 @@ class UserDataAggregator:
                 }
             },
-            # Stage 3: Lookup follows for THIS EVENT
             {
                 "$lookup": {
-                    "from": settings.COLLECTION_USER_FOLLOWS,
                     "let": {"user_id": "$_id"},
                     "pipeline": [
                         {
@@ -80,39 +107,22 @@ class UserDataAggregator:
                                 "$expr": {
                                     "$and": [
                                         {"$eq": ["$userId", "$$user_id"]},
-                                        {"$eq": ["$EventCode", self.event_code]}
                                     ]
                                 }
                             }
                         }
                     ],
-                    "as": "event_follows"
                 }
             },
-            # Stage 4: Lookup all payments for global RFM (user lifetime value)
-            {
-                "$lookup": {
-                    "from": settings.COLLECTION_PAYMENTS,
-                    "localField": "_id",
-                    "foreignField": "UserId",
-                    "as": "all_payments",
-                    "pipeline": [
-                        {
-                            "$match": {
-                                "Status": "Completed"
-                            }
-                        }
-                    ]
-                }
-            },
-            # Stage 5: Filter users who interacted with this event
             {
                 "$match": {
                     "$or": [
-                        {"event_tickets": {"$ne": []}},
-                        {"event_follows": {"$ne": []}}
                     ]
                 }
             },
@@ -120,32 +130,23 @@ class UserDataAggregator:
             # Stage 6: Calculate event-specific metrics
             {
                 "$addFields": {
-                    # Event-specific: tickets bought for THIS event
                     "event_ticket_count": {"$size": "$event_tickets"},
-                    "event_total_spend": {"$sum": "$event_tickets.Amount"},
-                    # Event-specific: follow status
-                    "is_follower": {
-                        "$cond": [
-                            {"$gt": [{"$size": "$event_follows"}, 0]},
-                            1,
-                            0
-                        ]
-                    },
-                    # Global RFM: user's overall purchasing power
-                    "global_total_spend": {"$sum": "$all_payments.Amount"},
                     "global_transaction_count": {"$size": "$all_payments"},
-                    "global_last_transaction": {"$max": "$all_payments.TransactionDate"}
                 }
             },
-            # Stage 7: Calculate global recency
             {
                 "$addFields": {
                     "global_recency_days": {
                         "$cond": {
-                            "if": {"$gt": ["$global_last_transaction", None]},
                             "then": {
                                 "$dateDiff": {
                                     "startDate": "$global_last_transaction",
@@ -165,15 +166,15 @@ class UserDataAggregator:
                     "_id": 1,
                     "user_id": "$_id",
                     "email": 1,
-                    "firstName": "$FirstName",
-                    "lastName": "$LastName",
                     # Event-specific features
                     "event_ticket_count": 1,
                     "event_total_spend": 1,
-                    "is_follower": 1,
-                    # Global features (user power)
                     "global_recency": "$global_recency_days",
                     "global_frequency": "$global_transaction_count",
                     "global_monetary": "$global_total_spend"

 """
 Data Aggregation Pipeline for Event-Centric User Segmentation
 Author: AI Generated
+Created: 2025-11-24 (Fixed for actual MongoDB schema)
+Purpose: Aggregate user features based on EMBEDDED UserFollows and nested comments
 """
 from typing import List, Dict
 from datetime import datetime
+from bson import ObjectId
 from database import db
 from config import settings
 class UserDataAggregator:
     """
     Aggregates user behavioral data for segmentation per event.
+    CORRECTED to use:
+    - User.UserFollows (embedded array)
+    - PostSocialMedia.Images.UserCommentPosts (nested)
     """
     def __init__(self, event_code: str):
         Initialize aggregator for a specific event.
         Args:
+            event_code: Event identifier (ObjectId string)
         """
         self.event_code = event_code
         self.db = db
         """
         Aggregate user features for the specified event.
+        Users are considered "interacted" if they:
+        1. Bought tickets (Payment.eventCode)
+        2. Follow event (User.UserFollows.eventCode)
+        3. Commented on posts (PostSocialMedia.Images.UserCommentPosts of posts whose eventCode matches) - NOTE: the Stage 5 filter does not check this yet
+        Returns: List of user feature vectors
         """
         pipeline = [
+            # Stage 1: Start with Active users only
             {
                 "$match": {
+                    "status": "Active"
                 }
             },
+            # Stage 2: Lookup ticket purchases for THIS EVENT
             {
                 "$lookup": {
                     "from": settings.COLLECTION_PAYMENTS,
                             "$match": {
                                 "$expr": {
                                     "$and": [
+                                        {"$eq": ["$userId", "$$user_id"]},
+                                        {"$eq": ["$eventCode", ObjectId(self.event_code)]},
+                                        {"$eq": ["$status", "Completed"]}
                                     ]
                                 }
                             }
                 }
             },
+            # Stage 3: Check if user follows THIS EVENT (embedded UserFollows)
+            {
+                "$addFields": {
+                    "is_following_event": {
+                        "$cond": {
+                            "if": {
+                                "$in": [
+                                    ObjectId(self.event_code),
+                                    {
+                                        "$map": {
+                                            "input": {"$ifNull": ["$UserFollows", []]},
+                                            "as": "follow",
+                                            "in": "$$follow.eventCode"
+                                        }
+                                    }
+                                ]
+                            },
+                            "then": 1,
+                            "else": 0
+                        }
+                    }
+                }
+            },
+            # Stage 4: Lookup ALL payments for global RFM
             {
                 "$lookup": {
+                    "from": settings.COLLECTION_PAYMENTS,
                     "let": {"user_id": "$_id"},
                     "pipeline": [
                         {
                                 "$expr": {
                                     "$and": [
                                         {"$eq": ["$userId", "$$user_id"]},
+                                        {"$eq": ["$status", "Completed"]}
                                     ]
                                 }
                             }
                         }
                     ],
+                    "as": "all_payments"
                 }
             },
+            # Stage 5: Filter users who interacted with THIS EVENT
             {
                 "$match": {
                     "$or": [
+                        {"event_tickets": {"$ne": []}},  # Bought tickets
+                        {"is_following_event": 1}  # Following event
                     ]
                 }
             },
             # Stage 6: Calculate event-specific metrics
             {
                 "$addFields": {
+                    # Event-specific features
                     "event_ticket_count": {"$size": "$event_tickets"},
+                    "event_total_spend": {"$sum": "$event_tickets.amount"},
+                    # Global RFM
+                    "global_total_spend": {"$sum": "$all_payments.amount"},
                     "global_transaction_count": {"$size": "$all_payments"},
+                    "global_last_transaction": {"$max": "$all_payments.transactionDate"}
                 }
             },
+            # Stage 7: Calculate recency
             {
                 "$addFields": {
                     "global_recency_days": {
                         "$cond": {
+                            "if": {"$ne": ["$global_last_transaction", None]},
                             "then": {
                                 "$dateDiff": {
                                     "startDate": "$global_last_transaction",
                     "_id": 1,
                     "user_id": "$_id",
                     "email": 1,
+                    "firstName": "$firstName",
+                    "lastName": "$lastName",
                     # Event-specific features
                     "event_ticket_count": 1,
                     "event_total_spend": 1,
+                    "is_follower": "$is_following_event",
+                    # Global features
                     "global_recency": "$global_recency_days",
                     "global_frequency": "$global_transaction_count",
                     "global_monetary": "$global_total_spend"

services/genai_service.py CHANGED Viewed

@@ -258,7 +258,7 @@ BODY:
         """
         Generate AI insights from negative comments.
         """
-        # Get negative comments
         negative_results = list(db.sentiment_results.find({
             "event_code": self.event_code,
             "sentiment_label": "Negative"
@@ -272,13 +272,27 @@ BODY:
                 predicted_nps=70.0
             )
-        # Get comment texts
-        comment_ids = [r['source_id'] for r in negative_results]
-        comments = list(db.user_comment_post.find({
-            "_id": {"$in": comment_ids}
-        }))
-        negative_texts = [c.get('CommentText', '') for c in comments if c.get('CommentText')]
         # Build prompt
         comments_sample = "\n".join([f"- {text[:100]}" for text in negative_texts[:15]])

         """
         Generate AI insights from negative comments.
         """
+        # Get negative sentiment results (already analyzed and saved)
         negative_results = list(db.sentiment_results.find({
             "event_code": self.event_code,
             "sentiment_label": "Negative"
                 predicted_nps=70.0
             )
+        # Get original comment texts from PostSocialMedia
+        comment_ids = [ObjectId(r['source_id']) for r in negative_results]
+        # Extract comments from nested structure
+        pipeline = [
+            {"$unwind": "$images"},
+            {"$unwind": "$images.userCommentPosts"},
+            {
+                "$match": {
+                    "images.userCommentPosts.commentId": {"$in": comment_ids}
+                }
+            },
+            {
+                "$project": {
+                    "comment_text": "$images.userCommentPosts.commentText"
+                }
+            }
+        ]
+        comments = list(db.post_social_media.aggregate(pipeline))
+        negative_texts = [c.get('comment_text', '') for c in comments if c.get('comment_text')]
         # Build prompt
         comments_sample = "\n".join([f"- {text[:100]}" for text in negative_texts[:15]])

services/sentiment_service.py CHANGED Viewed

@@ -1,8 +1,8 @@
 """
 Event-Centric Sentiment Analysis Service
 Author: AI Generated
-Created: 2025-11-24 (Refactored)
-Purpose: Analyze sentiment for event comments and generate summary
 """
 import torch
@@ -22,6 +22,7 @@ from services.monitoring import monitor
 class SentimentAnalysisService:
     """
     Event-centric sentiment analysis using PhoBERT.
     """
     def __init__(self, event_code: str):
@@ -29,7 +30,7 @@ class SentimentAnalysisService:
         Initialize for a specific event.
         Args:
-            event_code: Event identifier
         """
         self.event_code = event_code
         self.model_name = settings.SENTIMENT_MODEL
@@ -58,7 +59,7 @@ class SentimentAnalysisService:
         print(f"✓ Model loaded on {self.device}")
     def analyze_text(self, text: str) -> Tuple[str, float]:
-        """Analyze single text with preprocessing."""
         if not self.model:
             self.load_model()
@@ -87,11 +88,60 @@ class SentimentAnalysisService:
         sentiment_label = self.label_map.get(predicted_class, "Neutral")
         return sentiment_label, confidence
     def analyze_event_comments(self) -> Dict:
         """
         Analyze all comments for this event.
-        Returns summary statistics.
         """
         import time
         start_time = time.time()
@@ -104,10 +154,8 @@ class SentimentAnalysisService:
             if not self.model:
                 self.load_model()
-            # Fetch comments for THIS EVENT only
-            comments = list(db.user_comment_post.find({
-                "EventCode": self.event_code
-            }).limit(1000))
             print(f"✓ Found {len(comments)} comments for this event")
@@ -122,7 +170,7 @@ class SentimentAnalysisService:
             all_keywords = []
             for comment in comments:
-                text = comment.get('CommentText', '')
                 if not text:
                     continue
@@ -131,9 +179,9 @@ class SentimentAnalysisService:
                 # Save individual result
                 result = SentimentAnalysisResult(
-                    source_id=comment['_id'],
                     source_type="UserCommentPost",
-                    event_code=self.event_code,  # NEW: link to event
                     sentiment_label=sentiment,
                     confidence_score=confidence,
                     key_phrases=keywords,
@@ -147,7 +195,7 @@ class SentimentAnalysisService:
                 total_confidence += confidence
                 all_keywords.extend(keywords)
-            # Bulk insert results
             if results_to_save:
                 db.sentiment_results.insert_many(results_to_save)
                 print(f"✓ Saved {len(results_to_save)} sentiment results")
@@ -155,7 +203,7 @@ class SentimentAnalysisService:
             # Calculate summary
             avg_confidence = total_confidence / len(results_to_save) if results_to_save else 0
-            # Get top keywords
             keyword_freq = {}
             for kw in all_keywords:
                 keyword_freq[kw] = keyword_freq.get(kw, 0) + 1
@@ -173,7 +221,7 @@ class SentimentAnalysisService:
                 sentiment_distribution=sentiment_counts,
                 avg_confidence=avg_confidence,
                 top_keywords=top_keywords,
-                ai_insights=None,  # Will be filled by GenAI
                 last_updated=datetime.utcnow()
             )
@@ -186,10 +234,10 @@ class SentimentAnalysisService:
             # Print summary
             print("\n📊 Sentiment Distribution:")
             for label, count in sentiment_counts.items():
-                pct = (count / len(results_to_save) *100) if results_to_save else 0
                 print(f"  {label}: {count} ({pct:.1f}%)")
-            # Log metrics
             execution_time = time.time() - start_time
             metrics = {
                 "event_code": self.event_code,

 """
 Event-Centric Sentiment Analysis Service
 Author: AI Generated
+Created: 2025-11-24 (Fixed for actual MongoDB schema)
+Purpose: Analyze sentiment for comments nested in PostSocialMedia.Images
 """
 import torch
 class SentimentAnalysisService:
     """
     Event-centric sentiment analysis using PhoBERT.
+    Comments are nested: PostSocialMedia.Images.UserCommentPosts
     """
     def __init__(self, event_code: str):
         Initialize for a specific event.
         Args:
+            event_code: Event identifier (ObjectId string)
         """
         self.event_code = event_code
         self.model_name = settings.SENTIMENT_MODEL
         print(f"✓ Model loaded on {self.device}")
     def analyze_text(self, text: str) -> Tuple[str, float]:
+        """Analyze single text"""
         if not self.model:
             self.load_model()
         sentiment_label = self.label_map.get(predicted_class, "Neutral")
         return sentiment_label, confidence
+    def extract_comments_from_posts(self) -> List[Dict]:
+        """
+        Extract all comments from PostSocialMedia for this event.
+        Structure: PostSocialMedia → Images[] → UserCommentPosts[]
+        """
+        pipeline = [
+            # Match posts for this event
+            {
+                "$match": {
+                    "eventCode": ObjectId(self.event_code)
+                }
+            },
+            # Unwind images array
+            {
+                "$unwind": {
+                    "path": "$images",
+                    "preserveNullAndEmptyArrays": False
+                }
+            },
+            # Unwind UserCommentPosts within each image
+            {
+                "$unwind": {
+                    "path": "$images.userCommentPosts",
+                    "preserveNullAndEmptyArrays": False
+                }
+            },
+            # Project the fields we need
+            {
+                "$project": {
+                    "post_id": "$_id",
+                    "image_id": "$images.imageInPostId",
+                    "comment_id": "$images.userCommentPosts.commentId",
+                    "user_id": "$images.userCommentPosts.userId",
+                    "comment_text": "$images.userCommentPosts.commentText",
+                    "commented_at": "$images.userCommentPosts.commentedAt"
+                }
+            },
+            # Limit for performance
+            {
+                "$limit": 1000
+            }
+        ]
+        comments = list(db.post_social_media.aggregate(pipeline))
+        return comments
     def analyze_event_comments(self) -> Dict:
         """
         Analyze all comments for this event.
         """
         import time
         start_time = time.time()
             if not self.model:
                 self.load_model()
+            # Extract comments
+            comments = self.extract_comments_from_posts()
             print(f"✓ Found {len(comments)} comments for this event")
             all_keywords = []
             for comment in comments:
+                text = comment.get('comment_text', '')
                 if not text:
                     continue
                 # Save individual result
                 result = SentimentAnalysisResult(
+                    source_id=ObjectId(comment['comment_id']),
                     source_type="UserCommentPost",
+                    event_code=self.event_code,
                     sentiment_label=sentiment,
                     confidence_score=confidence,
                     key_phrases=keywords,
                 total_confidence += confidence
                 all_keywords.extend(keywords)
+            # Bulk insert
             if results_to_save:
                 db.sentiment_results.insert_many(results_to_save)
                 print(f"✓ Saved {len(results_to_save)} sentiment results")
             # Calculate summary
             avg_confidence = total_confidence / len(results_to_save) if results_to_save else 0
+            # Top keywords
             keyword_freq = {}
             for kw in all_keywords:
                 keyword_freq[kw] = keyword_freq.get(kw, 0) + 1
                 sentiment_distribution=sentiment_counts,
                 avg_confidence=avg_confidence,
                 top_keywords=top_keywords,
+                ai_insights=None,
                 last_updated=datetime.utcnow()
             )
             # Print summary
             print("\n📊 Sentiment Distribution:")
             for label, count in sentiment_counts.items():
+                pct = (count / len(results_to_save) * 100) if results_to_save else 0
                 print(f"  {label}: {count} ({pct:.1f}%)")
+            # Monitoring
             execution_time = time.time() - start_time
             metrics = {
                 "event_code": self.event_code,