File size: 4,015 Bytes
c09e844
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
"""

Visualization Service

WordCloud generation and data visualization utilities

"""
import os
from collections import Counter
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from wordcloud import WordCloud
import matplotlib
matplotlib.use('Agg')  # Use non-GUI backend; must run before pyplot is imported
import matplotlib.pyplot as plt

from app.config import WORDCLOUD_DIR


class VisualizationService:
    """Service for generating visualizations from Vietnamese review text.

    Provides word-cloud image generation, star-rating distribution counts,
    and top-word frequency extraction. A shared Vietnamese stopword set is
    applied to both the word cloud and the frequency counts.
    """

    def __init__(self):
        # Vietnamese stopwords: common function words excluded from word
        # clouds and frequency counts. (Duplicates removed from the original
        # literal; a set deduplicates anyway, so behavior is unchanged.)
        self.stopwords = {
            'và', 'của', 'có', 'cho', 'với', 'từ', 'này', 'được',
            'là', 'để', 'một', 'các', 'trong', 'không', 'đã', 'rất',
            'cũng', 'nhưng', 'thì', 'bị', 'khi', 'nếu', 'như', 'về',
            'tôi', 'bạn', 'mình', 'nó', 'họ', 'em', 'anh', 'chị',
            'vì', 'nên', 'đến', 'lại', 'ra', 'đang', 'sẽ', 'đều',
            'hay', 'thế', 'làm', 'rồi', 'đó', 'ở',
        }

    def generate_wordcloud(self, texts: List[str], filename: Optional[str] = None) -> str:
        """Generate a word-cloud PNG from a list of texts.

        Args:
            texts: List of Vietnamese comments.
            filename: Optional custom filename; when omitted, a
                timestamped name ("wordcloud_YYYYmmdd_HHMMSS.png") is used.

        Returns:
            str: URL path of the generated word-cloud image, suitable for
            serving from the static-files mount.
        """
        combined_text = ' '.join(texts)

        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"wordcloud_{timestamp}.png"

        filepath = WORDCLOUD_DIR / filename

        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color='white',
            stopwords=self.stopwords,
            colormap='viridis',
            max_words=100,
            relative_scaling=0.5,
            min_font_size=10
        ).generate(combined_text)

        # Render via matplotlib (Agg backend set at import time, so this is
        # safe in a headless server process).
        plt.figure(figsize=(10, 5))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.tight_layout(pad=0)
        plt.savefig(filepath, dpi=150, bbox_inches='tight')
        plt.close()  # release the figure so repeated calls don't leak memory

        # BUG FIX: previously returned a hard-coded "(unknown)" path;
        # interpolate the actual generated filename so callers get a live URL.
        return f"/static/uploads/wordclouds/{filename}"

    def calculate_rating_distribution(self, ratings: List[int]) -> Dict[int, int]:
        """Calculate the distribution of star ratings.

        Args:
            ratings: List of ratings (expected 1-5).

        Returns:
            dict: {rating: count} with every rating 1-5 present
            (zero-filled) and keys in ascending order.
        """
        distribution = Counter(ratings)
        # Zero-fill so all five star levels always appear in the result,
        # even when no review carries that rating.
        for rating in range(1, 6):
            distribution.setdefault(rating, 0)
        return dict(sorted(distribution.items()))

    def get_top_words(self, texts: List[str], top_n: int = 20) -> List[tuple]:
        """Return the most frequent meaningful words across the texts.

        Args:
            texts: List of comments.
            top_n: Number of top words to return.

        Returns:
            list: [(word, count), ...] in descending-count order.
        """
        words = (word for text in texts for word in text.lower().split())
        # Drop stopwords and very short tokens (<= 2 chars) as noise.
        filtered_words = [w for w in words if w not in self.stopwords and len(w) > 2]
        return Counter(filtered_words).most_common(top_n)


# Singleton instance
# Module-level singleton so every caller shares one service (and one
# stopword set); retrieved via get_viz_service() below.
viz_service = VisualizationService()


def get_viz_service() -> VisualizationService:
    """Dependency provider: return the module-level visualization service."""
    return viz_service