File size: 7,835 Bytes
4f6e61d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import heapq
import json
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession

from backend.db.models.comment import Comment
from backend.services.ml_model import toxicity_detector

async def find_similar_comments(
    db: AsyncSession,
    text: str,
    limit: int = 10,
    classification: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Find stored comments whose vector is closest to the embedding of ``text``.

    Every matching comment is loaded and compared in Python using the L2
    (Euclidean) distance between its stored vector and the embedding returned
    by ``toxicity_detector.get_embeddings``. Comments without a usable vector
    are skipped rather than failing the whole request.

    Args:
        db: Database session
        text: Text to find similar comments for
        limit: Maximum number of similar comments to return; a value of zero
            or less yields an empty list
        classification: Filter by classification (optional)

    Returns:
        List of similar comments, closest first, each with a
        ``similarity_score`` between 0 and 100
    """
    # Nothing can be returned for a non-positive limit, so skip the embedding
    # call and the table scan entirely.
    if limit <= 0:
        return []

    # NOTE(review): assumed to be array-like and of the same dimensionality as
    # the stored vectors; confirm against toxicity_detector.
    embedding = await toxicity_detector.get_embeddings(text)

    query = select(Comment)
    if classification is not None:
        query = query.filter(Comment.classification_result == classification)

    result = await db.execute(query)
    comments = result.scalars().all()

    scored = []
    for comment in comments:
        try:
            # Prefer the JSON-encoded vector; fall back to the native column.
            raw_json = getattr(comment, "content_vector_json", None)
            if raw_json:
                raw_vector = json.loads(raw_json)
            else:
                raw_vector = getattr(comment, "content_vector", None)
                # Compare against None / length instead of truth-testing:
                # bool() of a multi-element NumPy array raises ValueError,
                # which the handler below would swallow and silently drop
                # the comment.
                if raw_vector is None or len(raw_vector) == 0:
                    continue

            comment_vector = np.asarray(raw_vector, dtype=float)
            distance = float(np.linalg.norm(embedding - comment_vector))
        except (TypeError, ValueError, json.JSONDecodeError):
            # Malformed JSON, non-numeric data or incompatible shapes.
            continue

        # A NaN/inf distance would break the ordering and, through the
        # clamping below, be reported as a perfect match.
        if not np.isfinite(distance):
            continue

        scored.append((comment, distance))

    # Same result as sorting by distance and slicing the first `limit` items,
    # without ordering the whole list.
    closest = heapq.nsmallest(limit, scored, key=lambda pair: pair[1])

    similar_comments = []
    for comment, distance in closest:
        # Linear mapping: distance 0 -> 100, distance >= 10 -> 0.
        similarity = max(0.0, min(100.0, 100.0 * (1.0 - distance / 10.0)))

        similar_comments.append({
            "id": comment.id,
            "content": comment.content,
            "classification": comment.classification_result,
            "platform": comment.source_platform,
            "similarity_score": round(similarity, 2),
            "created_at": comment.created_at.isoformat() if comment.created_at else None,
        })

    return similar_comments

def _cluster_member_payload(entry: Dict[str, Any]) -> Dict[str, Any]:
    """Return the serialisable view of one clustered comment (vector omitted)."""
    created_at = entry["created_at"]
    return {
        "id": entry["id"],
        "content": entry["content"],
        "classification": entry["classification"],
        "platform": entry["platform"],
        "created_at": created_at.isoformat() if created_at else None,
    }


def _cluster_payload(cluster_id: int, members: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Return one cluster record in the shape reported by ``cluster_comments``."""
    return {
        "cluster_id": cluster_id,
        "size": len(members),
        "comments": [_cluster_member_payload(member) for member in members],
    }


async def cluster_comments(
    db: AsyncSession,
    limit: int = 100,
    classification: Optional[int] = None,
) -> List[Dict[str, Any]]:
    """
    Perform basic clustering on comments to find groups of similar content

    Comments are visited in query order. Each comment not yet placed in a
    cluster becomes a seed and absorbs every later unplaced comment whose
    similarity to the seed exceeds 70 (an L2 distance below 3). Seeds that
    attract no other comment are gathered into one final group, so every
    comment with a usable vector appears exactly once in the result.

    Args:
        db: Database session
        limit: Maximum number of comments to analyze
        classification: Filter by classification (optional)

    Returns:
        List of comment clusters (``cluster_id``, ``size``, ``comments``).
        When fewer than 5 comments have a usable vector they are returned
        together as a single cluster with id 0.
    """
    query = select(Comment)
    if classification is not None:
        query = query.filter(Comment.classification_result == classification)
    query = query.limit(limit)

    result = await db.execute(query)
    comments = result.scalars().all()

    # Collect the comments that carry a usable numeric vector.
    entries = []
    for comment in comments:
        try:
            # Prefer the JSON-encoded vector; fall back to the native column.
            raw_json = getattr(comment, "content_vector_json", None)
            if raw_json:
                raw_vector = json.loads(raw_json)
            else:
                raw_vector = getattr(comment, "content_vector", None)
                # Compare against None / length instead of truth-testing:
                # bool() of a multi-element NumPy array raises ValueError,
                # which the handler below would swallow and silently drop
                # the comment.
                if raw_vector is None or len(raw_vector) == 0:
                    continue

            # Coerce here so non-numeric or ragged data is rejected now
            # rather than crashing the distance computation further down.
            vector = np.asarray(raw_vector, dtype=float)
        except (TypeError, ValueError, json.JSONDecodeError):
            # Skip comments with invalid vectors
            continue

        if vector.size == 0:
            continue

        entries.append({
            "id": comment.id,
            "content": comment.content,
            "classification": comment.classification_result,
            "platform": comment.source_platform,
            "vector": vector,
            "created_at": comment.created_at,
        })

    # If too few comments, return them all as one cluster
    if len(entries) < 5:
        return [_cluster_payload(0, entries)]

    # Simple greedy clustering based on vector similarity.
    # This is a basic implementation - in a real system, you'd use a more
    # sophisticated algorithm.
    clusters = []
    unclustered = []
    # Tracked by position rather than by comment id so the bookkeeping does
    # not depend on ids being unique.
    assigned = [False] * len(entries)

    for seed_index, seed in enumerate(entries):
        if assigned[seed_index]:
            continue

        assigned[seed_index] = True
        members = [seed]
        seed_vector = seed["vector"]

        # Every entry before the seed is already placed, so only later
        # entries can still join.
        for other_index, other in enumerate(entries[seed_index + 1:], start=seed_index + 1):
            if assigned[other_index]:
                continue

            other_vector = other["vector"]
            # Vectors of different shape cannot be compared; treat them as
            # dissimilar instead of letting the subtraction raise.
            if other_vector.shape != seed_vector.shape:
                continue

            distance = float(np.linalg.norm(seed_vector - other_vector))
            # A NaN distance would be clamped to similarity 100 below.
            if not np.isfinite(distance):
                continue

            similarity = max(0.0, min(100.0, 100.0 * (1.0 - distance / 10.0)))
            if similarity > 70:  # Threshold for similarity
                members.append(other)
                assigned[other_index] = True

        if len(members) > 1:
            clusters.append(_cluster_payload(len(clusters), members))
        else:
            # The seed matched nothing; keep it so it is still reported
            # instead of disappearing from the output.
            unclustered.append(seed)

    # Report the comments that matched nothing as one trailing group.
    if unclustered:
        clusters.append(_cluster_payload(len(clusters), unclustered))

    return clusters