Spaces:
Sleeping
Sleeping
File size: 16,359 Bytes
196c707 8056e83 196c707 8056e83 196c707 4858e1f 196c707 4858e1f 196c707 db39ccf 196c707 db39ccf fbc9719 db39ccf fbc9719 db39ccf fbc9719 db39ccf 28613b6 db39ccf 28613b6 db39ccf 28613b6 db39ccf 4858e1f 56fed0f 196c707 4858e1f 196c707 4858e1f 56fed0f 28613b6 56fed0f fbc9719 56fed0f 4858e1f 56fed0f 28613b6 56fed0f 28613b6 56fed0f 28613b6 56fed0f 4858e1f 56fed0f 4858e1f 28613b6 4858e1f fbc9719 4858e1f fbc9719 4858e1f fbc9719 196c707 28613b6 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf 196c707 db39ccf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 |
"""
Survey Generation Module - Generate AI-powered surveys from outlines
"""
import json
import os
import re
import sys
from typing import Dict, List, Optional

# Add parent directory to path for imports (must run before the local import below)
sys.path.insert(0, os.path.dirname(__file__))

from llm_backend import LLMBackend
class SurveyGenerator:
    """
    Generates professional surveys from user outlines using AI.
    Follows industry best practices for qualitative research.
    """

    # Hard cap on how many questions a generated survey may contain.
    _MAX_QUESTIONS = 20

    # Keyword heuristics used to infer a question's type from its wording.
    _RATING_KEYWORDS = ('rate', 'scale', 'rating', 'score')
    _YES_NO_KEYWORDS = ('do you', 'have you', 'would you', 'can you', 'should', 'is it', 'are you')
    _SATISFACTION_KEYWORDS = ('satisfy', 'satisfaction', 'satisfied')

    # Answer-option presets for closed question types. Copied (never aliased)
    # into each question dict so callers may mutate a question's options freely.
    _RATING_OPTIONS = ["1 - Poor", "2 - Fair", "3 - Good", "4 - Very Good", "5 - Excellent"]
    _LIKERT_OPTIONS = ["Very Satisfied", "Satisfied", "Neutral", "Dissatisfied", "Very Dissatisfied"]

    def __init__(self, llm_backend: "LLMBackend"):
        # Forward-reference annotation: behaviorally identical, but the class
        # body no longer evaluates the LLMBackend name at definition time.
        self.llm = llm_backend

    def generate_survey(self,
                        outline: str,
                        survey_type: str = "qualitative",
                        num_questions: int = 10,
                        target_audience: str = "general") -> Dict:
        """
        Generate a complete survey from an outline.

        Args:
            outline: User's outline or topic description
            survey_type: Type of survey (qualitative, quantitative, mixed)
            num_questions: Target number of questions
            target_audience: Description of target respondents

        Returns:
            Dict containing survey metadata and questions

        Raises:
            RuntimeError: If the LLM call or response parsing fails. The
                original exception is chained as ``__cause__``.
        """
        prompt = self._build_generation_prompt(outline, survey_type, num_questions, target_audience)
        messages = [
            {"role": "system", "content": self._get_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=2000, temperature=0.7)
            survey_data = self._parse_survey_response(response)
            # Replace the generic parsed title with one derived from the outline
            survey_data["title"] = self._generate_title(outline, survey_type)
            # Attach generation metadata for downstream consumers
            survey_data["metadata"] = {
                "outline": outline,
                "survey_type": survey_type,
                "target_audience": target_audience,
                "generated_question_count": len(survey_data.get("questions", []))
            }
            return survey_data
        except Exception as e:
            # Chain the cause (PEP 3134) instead of discarding the traceback.
            # RuntimeError subclasses Exception, so existing callers that
            # catch Exception keep working.
            raise RuntimeError(f"Survey generation failed: {e}") from e

    def _generate_title(self, outline: str, survey_type: str) -> str:
        """Generate a survey title from the outline."""
        # Extract key topic from outline (first sentence, truncated to ~60 chars)
        first_sentence = outline.split('.')[0].strip()
        if len(first_sentence) > 60:
            first_sentence = first_sentence[:60] + "..."
        # Capitalize first letter; fall back to a generic topic for empty outlines
        topic = first_sentence[0].upper() + first_sentence[1:] if first_sentence else "Research"
        # Create title based on survey type
        if survey_type.lower() == "qualitative":
            return f"{topic} - Qualitative Survey"
        elif survey_type.lower() == "quantitative":
            return f"{topic} - Quantitative Survey"
        else:
            return f"{topic} Survey"

    def _get_system_prompt(self) -> str:
        """System prompt for survey generation - optimized for Mistral/Mixtral."""
        return """You are an expert survey designer specializing in qualitative research. Your role is to create clear, professionally-written, and contextually relevant survey questions that elicit detailed responses from respondents."""

    def _build_generation_prompt(self, outline, survey_type, num_questions, target_audience) -> str:
        """Build the user prompt for survey generation - optimized for Mistral/Mixtral."""
        return f"""You are creating a {survey_type.lower()} research survey.
**Research Focus:** {outline}
**Target Participants:** {target_audience}
**Your Task:** Generate exactly {num_questions} high-quality survey questions.
**Quality Requirements:**
- Each question must be directly relevant to the research focus
- Questions should be specific enough to guide responses but open enough to capture diverse perspectives
- For {survey_type.lower()} surveys: Use open-ended questions that encourage detailed, thoughtful responses
- Avoid leading questions, double questions, or jargon that may confuse respondents
- Ensure questions are appropriate for the target audience's knowledge and context
- Progress from general to specific topics when possible
**Format:** Output as a numbered list (1. Question text 2. Question text, etc.)
**Output {num_questions} Survey Questions:**
1."""

    def _parse_survey_response(self, response: str) -> Dict:
        """Parse LLM response into survey structure."""
        # Parse numbered list format (not JSON)
        return self._parse_numbered_list(response)

    def _classify_question(self, question_text: str) -> "tuple[str, Optional[List[str]]]":
        """Heuristically classify a question, returning (question_type, options).

        ``options`` is None for open-ended questions. This logic was previously
        duplicated (and had drifted apart) between the numbered-list and
        line-based parsers; both now share this single implementation.
        """
        lowered = question_text.lower()
        # Rating/scale questions
        if any(word in lowered for word in self._RATING_KEYWORDS):
            return "rating", list(self._RATING_OPTIONS)
        # Yes/no questions -- but quantity questions stay open-ended
        if question_text.endswith('?') and any(word in lowered for word in self._YES_NO_KEYWORDS):
            if 'how much' in lowered or 'how many' in lowered:
                return "open_ended", None
            return "yes_no", ["Yes", "No"]
        # Satisfaction questions get a Likert scale
        if any(word in lowered for word in self._SATISFACTION_KEYWORDS):
            return "likert_scale", list(self._LIKERT_OPTIONS)
        return "open_ended", None

    def _make_question(self, question_id: int, question_text: str) -> Dict:
        """Build a single question dict with a heuristically assigned type."""
        question_type, options = self._classify_question(question_text)
        question = {
            "id": question_id,
            "question_text": question_text,
            "question_type": question_type,
            "required": True
        }
        if options:
            question["options"] = options
        return question

    @staticmethod
    def _default_questions() -> List[Dict]:
        """Generic fallback questions used when parsing finds nothing usable."""
        return [
            {"id": 1, "question_text": "What are your overall thoughts on this topic?", "question_type": "open_ended", "required": True},
            {"id": 2, "question_text": "Can you describe your experience in detail?", "question_type": "open_ended", "required": True},
            {"id": 3, "question_text": "What specific suggestions do you have for improvement?", "question_type": "open_ended", "required": True}
        ]

    def _parse_numbered_list(self, response: str) -> Dict:
        """Parse a numbered list of questions into the survey structure.

        Falls back to line-based parsing when fewer than three numbered
        questions are found, and to a generic default set when nothing
        usable can be extracted at all.
        """
        # Split on numbered markers: "1. Question" or "1) Question"
        parts = re.split(r'\d+[\.\)]\s+', response)
        parts = [p.strip() for p in parts if p.strip()]

        questions: List[Dict] = []
        for part in parts:
            # Skip fragments too short to be real questions
            if len(part) < 10:
                continue
            # Keep only the first sentence/question of each fragment
            # (split by newline, or sentence punctuation before the next number)
            sentences = re.split(r'[\n]+|[?.!]\s+(?=\d+[\.\)]|\Z)', part)
            clean_line = sentences[0].strip()
            # Remove any leading hyphens or bullets that might appear
            clean_line = re.sub(r'^[-•*]\s*', '', clean_line)
            # Normalize to interrogative form
            if clean_line and not clean_line.endswith('?'):
                clean_line += '?'
            # Skip if still too short after cleaning
            if len(clean_line) < 10:
                continue
            questions.append(self._make_question(len(questions) + 1, clean_line))

        # If we found few or no questions from the numbered list, try the
        # line-based parser; some responses don't use numbered format.
        if len(questions) < 3:
            alt_questions = self._parse_alternative_format(response)
            if len(alt_questions) > len(questions):
                questions = alt_questions

        # Final fallback if still no questions
        if not questions:
            questions = self._default_questions()

        return {
            "title": "Research Survey",
            "introduction": "Thank you for taking the time to participate in this survey. Your responses will help us better understand your experiences and perspectives. Please answer all questions honestly and thoroughly.",
            "questions": questions[:self._MAX_QUESTIONS],
            "closing": "Thank you for your valuable time and feedback! Your responses are greatly appreciated and will be used to improve our understanding of this topic."
        }

    def _parse_alternative_format(self, response: str) -> List[Dict]:
        """Line-based fallback parser for responses that are not numbered lists."""
        questions: List[Dict] = []
        for line in response.split('\n'):
            line = line.strip()
            # Skip empty or too-short lines
            if not line or len(line) < 10:
                continue
            # Skip lines that are just labels or echoed instructions
            skip_keywords = ['format:', 'requirements:', 'task:', 'topic:', 'audience:', 'here are', 'survey questions:', 'questions:']
            if any(keyword in line.lower() for keyword in skip_keywords):
                continue
            # Keep only lines that look like questions (contain '?' or a question word)
            has_question_mark = '?' in line
            starts_with_question_word = any(word in line.lower() for word in ['describe', 'explain', 'what', 'how', 'why', 'when', 'where', 'who', 'can you', 'would you', 'do you', 'have you'])
            if not (has_question_mark or starts_with_question_word):
                continue
            # Clean up the line (remove bullets, numbers, etc)
            clean_line = re.sub(r'^[-•*\d+\.\)]\s*', '', line).strip()
            # Append '?' only when the line has no other terminal punctuation
            if clean_line and not clean_line.endswith('?'):
                if not any(c in clean_line for c in [':', '!', '.']):
                    clean_line += '?'
            # Skip if too short after cleaning
            if len(clean_line) < 10:
                continue
            questions.append(self._make_question(len(questions) + 1, clean_line))

        # If still no questions found, fall back to the generic set
        if not questions:
            questions = self._default_questions()
        return questions

    def refine_question(self, question: str, improvement_type: str = "clarity") -> str:
        """
        Refine a single survey question - optimized for Mistral/Mixtral.

        Args:
            question: The question to improve
            improvement_type: Type of improvement (clarity, neutrality, specificity);
                unknown values fall back to "clarity"

        Returns:
            Improved question text
        """
        improvement_guidance = {
            "clarity": "Makes the question clearer and easier for respondents to understand without ambiguity",
            "neutrality": "Removes any bias, leading language, or assumptions that could influence responses",
            "specificity": "Makes the question more specific and actionable while remaining open-ended"
        }
        guidance = improvement_guidance.get(improvement_type, improvement_guidance["clarity"])
        prompt = f"""Task: Improve a survey question
**Original Question:** "{question}"
**Improvement Type:** {improvement_type.title()}
**Your Goal:** Rewrite this question so that it {guidance}.
**Guidelines:**
- Keep the question focused on a single topic
- Use simple, clear language appropriate for the target audience
- Avoid assumptions or leading language
- Ensure the question can elicit meaningful responses
Provide ONLY the improved question text. Do not include explanations or alternative versions."""
        messages = [
            {"role": "system", "content": "You are an expert survey question designer with deep experience in qualitative research methodology."},
            {"role": "user", "content": prompt}
        ]
        return self.llm.generate(messages, max_tokens=150, temperature=0.5).strip()

    def add_follow_up_questions(self, base_question: str, num_follow_ups: int = 3) -> List[str]:
        """
        Generate follow-up questions for deeper exploration - optimized for Mistral/Mixtral.

        Args:
            base_question: The main question
            num_follow_ups: Number of follow-up questions to generate

        Returns:
            List of follow-up question texts (generic elaboration prompts if
            parsing the LLM response yields nothing)
        """
        prompt = f"""Task: Generate probing follow-up questions
**Main Question:** {base_question}
**Your Task:** Create {num_follow_ups} thoughtful follow-up questions that probe deeper into the respondent's answer.
**Quality Criteria for Follow-ups:**
1. Each question should explore a different aspect, dimension, or implication of the main topic
2. Questions should encourage more detailed, nuanced responses
3. Follow a logical progression from the main question
4. Build on what a respondent might answer to the main question
5. Each should be specific but open-ended
**Format:** Number each question (1., 2., 3., etc.)
**Output {num_follow_ups} Follow-up Questions:**
1."""
        messages = [
            {"role": "system", "content": "You are an expert qualitative research interviewer skilled at designing probing questions that uncover deeper insights and nuances."},
            {"role": "user", "content": prompt}
        ]
        response = self.llm.generate(messages, max_tokens=500, temperature=0.7)

        # Try numbered list format first: capture text between numbered markers
        pattern = r'\d+[\.\)]\s+(.+?)(?=\d+[\.\)]|\Z)'
        matches = re.findall(pattern, response, re.DOTALL)
        if matches:
            follow_ups = [m.split('\n')[0].strip() for m in matches if m.strip()][:num_follow_ups]
            # Ensure all end with question mark
            follow_ups = [q if q.endswith('?') else q + '?' for q in follow_ups]
            if follow_ups:
                return follow_ups

        # Fallback: split by newlines and look for lines containing questions
        lines = [line.strip() for line in response.split("\n") if line.strip()]
        follow_ups = [line.lstrip("0123456789.-) ") for line in lines if "?" in line][:num_follow_ups]
        return follow_ups if follow_ups else [f"Can you elaborate on {base_question.lower()}?" for _ in range(num_follow_ups)]
|