File size: 18,202 Bytes
196c707
 
 
 
8056e83
 
196c707
 
8056e83
 
 
 
196c707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db39ccf
196c707
 
 
db39ccf
196c707
db39ccf
 
 
 
 
 
196c707
db39ccf
 
 
 
 
 
 
196c707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db39ccf
 
 
196c707
 
 
 
db39ccf
196c707
db39ccf
 
 
 
 
 
 
196c707
 
db39ccf
 
196c707
db39ccf
196c707
db39ccf
 
 
196c707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db39ccf
 
 
196c707
 
 
 
db39ccf
 
 
 
 
 
 
196c707
db39ccf
196c707
db39ccf
196c707
 
 
 
 
db39ccf
196c707
db39ccf
 
 
196c707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
db39ccf
 
 
 
 
 
 
 
 
 
 
 
 
 
196c707
db39ccf
 
196c707
db39ccf
196c707
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
"""

Data Analysis Module - AI-assisted analysis of survey responses

"""
import json
import sys
import os
from typing import Dict, List, Optional
from collections import Counter

# Add parent directory to path for imports
sys.path.insert(0, os.path.dirname(__file__))

from llm_backend import LLMBackend


class DataAnalyzer:
    """

    Analyzes survey responses to uncover key findings, trends, and patterns.

    Provides AI-assisted summaries for qualitative research data.

    """

    def __init__(self, llm_backend: LLMBackend):
        """Store the LLM backend used by the analysis methods.

        Args:
            llm_backend: Backend object; the analysis methods call its
                ``generate(messages, max_tokens=..., temperature=...)`` method.
        """
        self.llm = llm_backend

    def analyze_responses(self, responses: List[Dict], questions: List[Dict] = None) -> Dict:
        """

        Comprehensive analysis of survey responses.



        Args:

            responses: List of response dictionaries

            questions: Optional list of questions for context



        Returns:

            Analysis results including themes, sentiment, and insights

        """
        if not responses:
            return {"error": "No responses to analyze"}

        analysis = {
            "summary": {},
            "themes": [],
            "sentiment": {},
            "key_insights": [],
            "response_count": len(responses)
        }

        # Generate overall summary
        analysis["summary"] = self._generate_summary(responses, questions)

        # Extract themes
        analysis["themes"] = self._extract_themes(responses)

        # Analyze sentiment
        analysis["sentiment"] = self._analyze_sentiment(responses)

        # Generate key insights
        analysis["key_insights"] = self._generate_insights(responses, questions)

        # Add quantitative stats if applicable
        analysis["statistics"] = self._compute_statistics(responses, questions)

        return analysis

    def _generate_summary(self, responses: List[Dict], questions: List[Dict] = None) -> Dict:
        """Generate an executive summary of responses"""
        # Prepare context
        response_texts = self._extract_text_responses(responses)
        sample_size = min(50, len(response_texts))  # Use sample for large datasets
        sample_responses = response_texts[:sample_size]

        context = f"Total responses: {len(responses)}\n\n"
        if questions:
            context += "Questions asked:\n"
            for i, q in enumerate(questions[:10], 1):  # Limit to first 10 questions
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        context += "Sample responses:\n"
        for i, resp in enumerate(sample_responses, 1):
            context += f"{i}. {resp[:200]}...\n"  # Truncate long responses

        prompt = f"""Task: Analyze survey responses and generate an executive summary



{context}



**Your Analysis Should Include:**



1. **Overview:** A clear, concise high-level summary of what the data reveals (2-3 sentences)

2. **Key Patterns:** Main patterns, trends, or recurring themes observed across responses

3. **Notable Findings:** Interesting, surprising, or unexpected discoveries in the data

4. **Response Quality:** Assessment of how thoughtful, engaged, and detailed the responses are



**Output Format:** Respond ONLY with valid JSON:

{{

  "overview": "Clear summary of overall findings",

  "key_patterns": ["pattern 1", "pattern 2", "pattern 3"],

  "notable_findings": ["surprising finding 1", "unexpected discovery"],

  "response_quality": "Assessment of engagement level"

}}



**Important:** Ensure your response is valid JSON that can be parsed. Do not include any text outside the JSON object."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.5)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Summary generation failed: {str(e)}"}

    def _extract_themes(self, responses: List[Dict], num_themes: int = 5) -> List[Dict]:
        """Extract main themes from responses using AI"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return []

        # Sample for large datasets
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Extract and analyze themes from survey responses



**Data:** Analyzing {len(sample_responses)} survey responses



Responses:

{self._format_responses_for_prompt(sample_responses)}



**Your Task:** Identify the top {num_themes} distinct themes that emerge from these responses.



**For Each Theme, Provide:**

1. **Theme Name:** A short, memorable, and descriptive label

2. **Description:** Clear explanation of what this theme represents and its significance

3. **Prevalence:** Estimated percentage of responses that mention or relate to this theme

4. **Example Quotes:** 2-3 actual, representative quotes from responses that illustrate this theme



**Output Format:** Respond ONLY with a valid JSON array:

[

  {{

    "theme_name": "Clear, concise theme label",

    "description": "What this theme means and why it matters",

    "prevalence": "XX%",

    "example_quotes": ["exact quote from responses", "another quote"]

  }}

]



**Important:** Ensure all responses are valid JSON. Do not include text outside the array."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1500, temperature=0.6)
            themes = self._parse_json_response(response)
            if isinstance(themes, list):
                return themes
            return []
        except Exception as e:
            return [{"error": f"Theme extraction failed: {str(e)}"}]

    def _analyze_sentiment(self, responses: List[Dict]) -> Dict:
        """Analyze overall sentiment of responses"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return {}

        # Sample for analysis
        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        prompt = f"""Task: Analyze sentiment across survey responses



**Data:** Analyzing sentiment in {len(sample_responses)} survey responses



Responses:

{self._format_responses_for_prompt(sample_responses)}



**Your Task:** Conduct a comprehensive sentiment analysis of these responses.



**Analysis Should Include:**

1. **Overall Sentiment:** The dominant sentiment tone (positive, negative, neutral, or mixed)

2. **Sentiment Distribution:** Estimated percentage breakdown across sentiment categories

3. **Emotional Tone:** Key emotions or emotional themes detected in responses

4. **Intensity:** The strength of the sentiments (low, moderate, or high)



**Output Format:** Respond ONLY with valid JSON:

{{

  "overall_sentiment": "positive|negative|neutral|mixed",

  "distribution": {{

    "positive": "XX%",

    "neutral": "XX%",

    "negative": "XX%"

  }},

  "emotions": ["emotion1", "emotion2", "emotion3"],

  "intensity": "low|moderate|high"

}}



**Important:** Return only valid JSON. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=500, temperature=0.4)
            return self._parse_json_response(response)
        except Exception as e:
            return {"error": f"Sentiment analysis failed: {str(e)}"}

    def _generate_insights(self, responses: List[Dict], questions: List[Dict] = None) -> List[str]:
        """Generate actionable insights from the data"""
        response_texts = self._extract_text_responses(responses)

        if not response_texts:
            return []

        sample_size = min(100, len(response_texts))
        sample_responses = response_texts[:sample_size]

        context = f"Analyzing {len(responses)} survey responses.\n\n"
        if questions:
            context += "Research questions:\n"
            for i, q in enumerate(questions[:5], 1):
                context += f"{i}. {q.get('question_text', '')}\n"
            context += "\n"

        prompt = f"""{context}



Sample responses:

{self._format_responses_for_prompt(sample_responses)}



**Task:** Extract key insights from this survey data



**Generate 5-7 actionable insights** that address:

- Understanding the target audience and their needs

- Identifying opportunities for growth or improvement

- Recognizing challenges or pain points

- Understanding patterns, trends, and correlations

- Informing strategic or product decisions



**Insight Quality Criteria:**

- **Specific:** Clear, concrete statements based on actual data patterns

- **Actionable:** Can be used to inform decisions or actions

- **Evidence-based:** Grounded in what respondents actually said

- **Concise:** Clear and to the point (1-2 sentences each)



**Output Format:** Respond ONLY with a valid JSON array of insight strings:

["Clear, actionable insight from the data", "Another specific insight", ...]



**Important:** Return only JSON array. Do not include explanatory text."""

        messages = [
            {"role": "system", "content": self._get_analyst_system_prompt()},
            {"role": "user", "content": prompt}
        ]

        try:
            response = self.llm.generate(messages, max_tokens=1000, temperature=0.6)
            insights = self._parse_json_response(response)
            if isinstance(insights, list):
                return insights
            return []
        except Exception as e:
            return [f"Insight generation failed: {str(e)}"]

    def _compute_statistics(self, responses: List[Dict], questions: List[Dict] = None) -> Dict:
        """Compute basic statistics from responses"""
        stats = {
            "total_responses": len(responses),
            "response_lengths": {},
            "completion_rate": "N/A"
        }

        # Calculate average response length
        response_texts = self._extract_text_responses(responses)
        if response_texts:
            lengths = [len(r.split()) for r in response_texts]
            stats["response_lengths"] = {
                "avg_words": sum(lengths) / len(lengths),
                "min_words": min(lengths),
                "max_words": max(lengths)
            }

        # Calculate completion rate if questions are provided
        if questions:
            total_questions = len(questions)
            completed_questions = 0
            for response in responses:
                if isinstance(response, dict):
                    completed_questions += len([v for v in response.values() if v])

            if total_questions > 0:
                completion_rate = (completed_questions / (total_questions * len(responses))) * 100
                stats["completion_rate"] = f"{completion_rate:.1f}%"

        return stats

    def generate_report(self, analysis_results: Dict, format: str = "markdown") -> str:
        """

        Generate a formatted report from analysis results.



        Args:

            analysis_results: Results from analyze_responses()

            format: Output format (markdown, text, html)



        Returns:

            Formatted report string

        """
        if format == "markdown":
            return self._generate_markdown_report(analysis_results)
        elif format == "html":
            return self._generate_html_report(analysis_results)
        else:
            return self._generate_text_report(analysis_results)

    def _generate_markdown_report(self, results: Dict) -> str:
        """Generate markdown formatted report"""
        report = "# Survey Analysis Report\n\n"

        # Summary section
        if "summary" in results and results["summary"]:
            report += "## Executive Summary\n\n"
            summary = results["summary"]
            if "overview" in summary:
                report += f"{summary['overview']}\n\n"
            if "key_patterns" in summary:
                report += "### Key Patterns\n"
                for pattern in summary["key_patterns"]:
                    report += f"- {pattern}\n"
                report += "\n"

        # Statistics
        if "statistics" in results:
            report += "## Response Statistics\n\n"
            stats = results["statistics"]
            report += f"- Total Responses: {stats.get('total_responses', 'N/A')}\n"
            if "response_lengths" in stats:
                rl = stats["response_lengths"]
                report += f"- Average Response Length: {rl.get('avg_words', 0):.1f} words\n"
            report += f"- Completion Rate: {stats.get('completion_rate', 'N/A')}\n\n"

        # Themes
        if "themes" in results and results["themes"]:
            report += "## Main Themes\n\n"
            for i, theme in enumerate(results["themes"], 1):
                if isinstance(theme, dict) and "theme_name" in theme:
                    report += f"### {i}. {theme['theme_name']}\n"
                    report += f"{theme.get('description', '')}\n\n"
                    report += f"**Prevalence:** {theme.get('prevalence', 'N/A')}\n\n"
                    if "example_quotes" in theme:
                        report += "**Example quotes:**\n"
                        for quote in theme["example_quotes"]:
                            report += f"> {quote}\n"
                        report += "\n"

        # Sentiment
        if "sentiment" in results and results["sentiment"]:
            report += "## Sentiment Analysis\n\n"
            sent = results["sentiment"]
            report += f"**Overall Sentiment:** {sent.get('overall_sentiment', 'N/A')}\n\n"
            if "distribution" in sent:
                report += "**Distribution:**\n"
                for key, value in sent["distribution"].items():
                    report += f"- {key.title()}: {value}\n"
                report += "\n"

        # Key Insights
        if "key_insights" in results and results["key_insights"]:
            report += "## Key Insights\n\n"
            for i, insight in enumerate(results["key_insights"], 1):
                report += f"{i}. {insight}\n"
            report += "\n"

        return report

    def _generate_text_report(self, results: Dict) -> str:
        """Generate plain text report"""
        # Similar to markdown but without formatting
        return self._generate_markdown_report(results).replace("#", "").replace("**", "").replace(">", "")

    def _generate_html_report(self, results: Dict) -> str:
        """Generate HTML report"""
        # Convert markdown to basic HTML
        md_report = self._generate_markdown_report(results)
        # Basic conversion (for production, use a proper markdown-to-html library)
        html = md_report.replace("# ", "<h1>").replace("\n\n", "</p>\n<p>")
        return f"<html><body>{html}</body></html>"

    def _get_analyst_system_prompt(self) -> str:
        """System prompt for analysis tasks"""
        return """You are an expert qualitative research analyst with deep expertise in:

- Thematic analysis and coding

- Sentiment analysis and emotional intelligence

- Pattern recognition in qualitative data

- Insight generation and strategic thinking

- Survey research methodology



Your analyses should be:

- Objective and evidence-based

- Nuanced and comprehensive

- Actionable and clear

- Grounded in the actual data provided



Always respond with valid JSON when requested."""

    def _extract_text_responses(self, responses: List[Dict]) -> List[str]:
        """Extract text from response objects"""
        texts = []
        for response in responses:
            if isinstance(response, dict):
                # Extract all string values
                for value in response.values():
                    if isinstance(value, str) and value.strip():
                        texts.append(value.strip())
            elif isinstance(response, str):
                texts.append(response.strip())
        return texts

    def _format_responses_for_prompt(self, responses: List[str], max_responses: int = 50) -> str:
        """Format responses for inclusion in prompt"""
        formatted = []
        for i, resp in enumerate(responses[:max_responses], 1):
            # Truncate very long responses
            truncated = resp[:300] + "..." if len(resp) > 300 else resp
            formatted.append(f"{i}. {truncated}")
        return "\n".join(formatted)

    def _parse_json_response(self, response: str):
        """Parse JSON from LLM response"""
        response = response.strip()

        # Handle code blocks
        if "```json" in response:
            start = response.find("```json") + 7
            end = response.find("```", start)
            response = response[start:end].strip()
        elif "```" in response:
            start = response.find("```") + 3
            end = response.find("```", start)
            response = response[start:end].strip()

        try:
            return json.loads(response)
        except json.JSONDecodeError:
            # Try to find JSON object or array
            if "{" in response:
                start = response.find("{")
                end = response.rfind("}") + 1
                return json.loads(response[start:end])
            elif "[" in response:
                start = response.find("[")
                end = response.rfind("]") + 1
                return json.loads(response[start:end])
            raise