import heapq
import json
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession

from backend.db.models.comment import Comment
from backend.services.ml_model import toxicity_detector
|
|
async def find_similar_comments(
    db: AsyncSession,
    text: str,
    limit: int = 10,
    classification: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Find comments similar to the given text using vector similarity.

    NOTE(review): this does a brute-force scan — it loads every matching
    Comment row and computes distances in Python. Fine for small tables;
    a pgvector-style DB-side nearest-neighbour query would be needed at scale.

    Args:
        db: Async database session
        text: Text to find similar comments for
        limit: Maximum number of similar comments to return
        classification: Filter by classification_result (optional)

    Returns:
        List of dicts (id, content, classification, platform,
        similarity_score 0-100, created_at ISO string or None),
        ordered most-similar first.
    """
    # Embed the query text with the same model that produced the stored vectors.
    embedding = await toxicity_detector.get_embeddings(text)

    query = select(Comment)
    if classification is not None:
        query = query.filter(Comment.classification_result == classification)

    result = await db.execute(query)
    comments = result.scalars().all()

    # Pair each comment with its Euclidean distance to the query embedding.
    comments_with_distances = []
    for comment in comments:
        try:
            # Prefer the JSON-encoded vector column; fall back to a raw
            # array column. Skip rows with no stored vector at all.
            raw_json = getattr(comment, "content_vector_json", None)
            raw_vector = getattr(comment, "content_vector", None)
            if raw_json:
                comment_vector = np.array(json.loads(raw_json))
            elif raw_vector:
                comment_vector = np.array(raw_vector)
            else:
                continue

            distance = np.linalg.norm(embedding - comment_vector)
            comments_with_distances.append((comment, distance))
        except (TypeError, ValueError, json.JSONDecodeError):
            # Malformed or dimension-mismatched vector — skip the row
            # rather than failing the whole request.
            continue

    # O(n log limit) partial selection instead of sorting the full list.
    nearest = heapq.nsmallest(
        limit, comments_with_distances, key=lambda pair: pair[1]
    )

    similar_comments = []
    for comment, distance in nearest:
        # Map distance to a 0-100 score; the /10 scale is a heuristic
        # tuned to the embedding space — TODO confirm against the model.
        similarity = max(0, min(100, 100 * (1 - distance / 10)))

        similar_comments.append({
            "id": comment.id,
            "content": comment.content,
            "classification": comment.classification_result,
            "platform": comment.source_platform,
            "similarity_score": round(similarity, 2),
            "created_at": comment.created_at.isoformat() if comment.created_at else None
        })

    return similar_comments
|
|
async def cluster_comments(
    db: AsyncSession,
    limit: int = 100,
    classification: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Perform basic greedy clustering on comments to find groups of similar content.

    Each unassigned comment seeds a cluster and absorbs every later
    unassigned comment whose similarity score exceeds 70. Singleton seeds
    are not emitted as clusters; all leftovers are collected into one
    final "remaining" cluster.

    Args:
        db: Async database session
        limit: Maximum number of comments to analyze
        classification: Filter by classification_result (optional)

    Returns:
        List of cluster dicts: cluster_id, size, and the serialized comments.
    """

    def _serialize(entry: Dict[str, Any]) -> Dict[str, Any]:
        # Drop the vector and render created_at as an ISO string (or None).
        return {
            "id": entry["id"],
            "content": entry["content"],
            "classification": entry["classification"],
            "platform": entry["platform"],
            "created_at": entry["created_at"].isoformat() if entry["created_at"] else None
        }

    query = select(Comment).limit(limit)
    if classification is not None:
        query = query.filter(Comment.classification_result == classification)

    result = await db.execute(query)
    comments = result.scalars().all()

    # Build (metadata + vector) records, skipping rows with no usable vector.
    comment_vectors = []
    for comment in comments:
        try:
            raw_json = getattr(comment, "content_vector_json", None)
            raw_vector = getattr(comment, "content_vector", None)
            if raw_json:
                vector = np.array(json.loads(raw_json))
            elif raw_vector:
                vector = np.array(raw_vector)
            else:
                continue

            comment_vectors.append({
                "id": comment.id,
                "content": comment.content,
                "classification": comment.classification_result,
                "platform": comment.source_platform,
                "vector": vector,
                "created_at": comment.created_at
            })
        except (TypeError, ValueError, json.JSONDecodeError):
            # Malformed vector — skip the row rather than aborting.
            continue

    # Too few vectors to cluster meaningfully: return everything as one group.
    if len(comment_vectors) < 5:
        return [{
            "cluster_id": 0,
            "size": len(comment_vectors),
            "comments": [_serialize(c) for c in comment_vectors]
        }]

    clusters = []
    assigned_comments = set()

    # Greedy single-pass clustering: each unassigned comment seeds a cluster.
    for seed in comment_vectors:
        if seed["id"] in assigned_comments:
            continue

        members = [seed]
        assigned_comments.add(seed["id"])

        for candidate in comment_vectors:
            if candidate["id"] in assigned_comments:
                continue

            distance = np.linalg.norm(seed["vector"] - candidate["vector"])
            # Same distance→score mapping as find_similar_comments;
            # score > 70 is the cluster membership threshold.
            similarity = max(0, min(100, 100 * (1 - distance / 10)))

            if similarity > 70:
                members.append(candidate)
                assigned_comments.add(candidate["id"])

        # Only multi-member groups count as clusters; lone seeds fall
        # through to the "remaining" bucket below.
        if len(members) > 1:
            clusters.append({
                "cluster_id": len(clusters),
                "size": len(members),
                "comments": [_serialize(c) for c in members]
            })

    # Everything never absorbed into a multi-member cluster goes into one
    # catch-all cluster. NOTE(review): seeds added to assigned_comments but
    # whose singleton cluster was discarded are intentionally excluded here,
    # matching the original behavior — confirm this is desired.
    remaining = [c for c in comment_vectors if c["id"] not in assigned_comments]
    if remaining:
        clusters.append({
            "cluster_id": len(clusters),
            "size": len(remaining),
            "comments": [_serialize(c) for c in remaining]
        })

    return clusters