# ProjectEcho / survey_generator.py
# Source: Hugging Face upload by jmisak ("Upload 5 files", commit 8056e83, verified)
"""
Survey Generation Module - Generate AI-powered surveys from outlines
"""
import json
import os
import re
import sys
from typing import Dict, List, Optional

# Add parent directory to path for imports (must run before the local import below)
sys.path.insert(0, os.path.dirname(__file__))
from llm_backend import LLMBackend
class SurveyGenerator:
    """
    Generates professional surveys from user outlines using AI.

    Follows industry best practices for qualitative research: questions are
    open-ended by default, with lightweight heuristics that upgrade them to
    rating, Likert-scale, or yes/no types when the wording calls for it.
    """

    # Hard cap on how many questions a generated survey may contain.
    MAX_QUESTIONS = 20

    # Minimum characters for a parsed line to count as a real question.
    MIN_QUESTION_LEN = 10

    def __init__(self, llm_backend: "LLMBackend"):
        """
        Args:
            llm_backend: Backend exposing generate(messages, max_tokens, temperature).
        """
        self.llm = llm_backend

    def generate_survey(self,
                        outline: str,
                        survey_type: str = "qualitative",
                        num_questions: int = 10,
                        target_audience: str = "general") -> Dict:
        """
        Generate a complete survey from an outline.

        Args:
            outline: User's outline or topic description
            survey_type: Type of survey (qualitative, quantitative, mixed)
            num_questions: Target number of questions
            target_audience: Description of target respondents

        Returns:
            Dict containing survey metadata and questions

        Raises:
            Exception: If the LLM call or response parsing fails; the original
                error is attached as __cause__.
        """
        prompt = self._build_generation_prompt(outline, survey_type, num_questions, target_audience)
        messages = [
            {"role": "system", "content": self._get_system_prompt()},
            {"role": "user", "content": prompt}
        ]
        try:
            response = self.llm.generate(messages, max_tokens=2000, temperature=0.7)
            survey_data = self._parse_survey_response(response)
            # Replace the generic parsed title with one derived from the outline.
            survey_data["title"] = self._generate_title(outline, survey_type)
            # Attach generation metadata for downstream consumers.
            survey_data["metadata"] = {
                "outline": outline,
                "survey_type": survey_type,
                "target_audience": target_audience,
                "generated_question_count": len(survey_data.get("questions", []))
            }
            return survey_data
        except Exception as e:
            # Chain the original exception so the root cause is not lost.
            raise Exception(f"Survey generation failed: {str(e)}") from e

    def _generate_title(self, outline: str, survey_type: str) -> str:
        """Derive a human-readable survey title from the outline's first sentence."""
        first_sentence = outline.split('.')[0].strip()
        if len(first_sentence) > 60:
            first_sentence = first_sentence[:60] + "..."
        # Capitalize the first letter; fall back to a generic topic for empty outlines.
        topic = first_sentence[0].upper() + first_sentence[1:] if first_sentence else "Research"
        kind = survey_type.lower()
        if kind == "qualitative":
            return f"{topic} - Qualitative Survey"
        if kind == "quantitative":
            return f"{topic} - Quantitative Survey"
        return f"{topic} Survey"

    def _get_system_prompt(self) -> str:
        """System prompt for survey generation - optimized for Mistral/Mixtral."""
        return """You are an expert survey designer specializing in qualitative research. Your role is to create clear, professionally-written, and contextually relevant survey questions that elicit detailed responses from respondents."""

    def _build_generation_prompt(self, outline: str, survey_type: str,
                                 num_questions: int, target_audience: str) -> str:
        """Build the user prompt for survey generation - optimized for Mistral/Mixtral."""
        return f"""You are creating a {survey_type.lower()} research survey.
**Research Focus:** {outline}
**Target Participants:** {target_audience}
**Your Task:** Generate exactly {num_questions} high-quality survey questions.
**Quality Requirements:**
- Each question must be directly relevant to the research focus
- Questions should be specific enough to guide responses but open enough to capture diverse perspectives
- For {survey_type.lower()} surveys: Use open-ended questions that encourage detailed, thoughtful responses
- Avoid leading questions, double questions, or jargon that may confuse respondents
- Ensure questions are appropriate for the target audience's knowledge and context
- Progress from general to specific topics when possible
**Format:** Output as a numbered list (1. Question text 2. Question text, etc.)
**Output {num_questions} Survey Questions:**
1."""

    def _parse_survey_response(self, response: str) -> Dict:
        """Parse the LLM response (numbered list, not JSON) into a survey structure."""
        return self._parse_numbered_list(response)

    def _classify_question(self, question_text: str) -> tuple:
        """
        Heuristically classify a question and choose answer options for it.

        Returns:
            (question_type, options) — options is None for open-ended questions.
        """
        lower_line = question_text.lower()
        # Rating/scale questions. Word-boundary match so that e.g. "generate"
        # or "separate" does not false-positive on the substring "rate".
        if re.search(r'\b(rate|rating|scale|score)\b', lower_line):
            return "rating", ["1 - Poor", "2 - Fair", "3 - Good", "4 - Very Good", "5 - Excellent"]
        # Satisfaction questions are checked BEFORE yes/no so that phrasings
        # such as "Are you satisfied ...?" get a Likert scale, not Yes/No.
        if any(word in lower_line for word in ['satisfy', 'satisfaction', 'satisfied']):
            return "likert_scale", ["Very Satisfied", "Satisfied", "Neutral", "Dissatisfied", "Very Dissatisfied"]
        # Yes/no questions. Word-boundary match so "are you" does not match
        # "are your" ("What are your thoughts ...?" stays open-ended).
        if question_text.endswith('?') and re.search(r'\b(do you|have you|would you|can you|should|is it|are you)\b', lower_line):
            if 'how much' not in lower_line and 'how many' not in lower_line:
                return "yes_no", ["Yes", "No"]
        return "open_ended", None

    def _make_question(self, question_id: int, text: str) -> Dict:
        """Build one question dict, attaching options when the type requires them."""
        question_type, options = self._classify_question(text)
        question = {
            "id": question_id,
            "question_text": text,
            "question_type": question_type,
            "required": True
        }
        if options:
            question["options"] = options
        return question

    @staticmethod
    def _fallback_questions() -> List[Dict]:
        """Generic open-ended questions used when nothing could be parsed."""
        return [
            {"id": 1, "question_text": "What are your overall thoughts on this topic?", "question_type": "open_ended", "required": True},
            {"id": 2, "question_text": "Can you describe your experience in detail?", "question_type": "open_ended", "required": True},
            {"id": 3, "question_text": "What specific suggestions do you have for improvement?", "question_type": "open_ended", "required": True}
        ]

    def _parse_numbered_list(self, response: str) -> Dict:
        """Parse a numbered list of questions into the full survey structure."""
        # Split on numbered markers: "1. Question" or "1) Question".
        parts = re.split(r'\d+[\.\)]\s+', response)
        parts = [p.strip() for p in parts if p.strip()]

        questions: List[Dict] = []
        for part in parts:
            # Too short to be a real question.
            if len(part) < self.MIN_QUESTION_LEN:
                continue
            # Keep only the first sentence/question of the segment.
            sentences = re.split(r'[\n]+|[?.!]\s+(?=\d+[\.\)]|\Z)', part)
            clean_line = sentences[0].strip()
            # Drop any leading hyphens or bullets that might appear.
            clean_line = re.sub(r'^[-•*]\s*', '', clean_line)
            # Normalize the ending to a question mark, stripping a trailing
            # period/exclamation first so we never emit "...?." artifacts.
            if clean_line and not clean_line.endswith('?'):
                clean_line = clean_line.rstrip('.!') + '?'
            if len(clean_line) < self.MIN_QUESTION_LEN:
                continue
            questions.append(self._make_question(len(questions) + 1, clean_line))

        # If the numbered-list parse found few or no questions, try the
        # line-oriented fallback parser and keep whichever found more.
        if len(questions) < 3:
            alt_questions = self._parse_alternative_format(response)
            if len(alt_questions) > len(questions):
                questions = alt_questions

        # Final fallback if still no questions.
        if not questions:
            questions = self._fallback_questions()

        return {
            "title": "Research Survey",
            "introduction": "Thank you for taking the time to participate in this survey. Your responses will help us better understand your experiences and perspectives. Please answer all questions honestly and thoroughly.",
            "questions": questions[:self.MAX_QUESTIONS],
            "closing": "Thank you for your valuable time and feedback! Your responses are greatly appreciated and will be used to improve our understanding of this topic."
        }

    def _parse_alternative_format(self, response: str) -> List[Dict]:
        """Fallback parser: scan individual lines for question-like text."""
        questions: List[Dict] = []
        # Label/instruction lines the model sometimes echoes back.
        skip_keywords = ['format:', 'requirements:', 'task:', 'topic:', 'audience:', 'here are', 'survey questions:', 'questions:']
        for line in response.split('\n'):
            line = line.strip()
            if not line or len(line) < self.MIN_QUESTION_LEN:
                continue
            if any(keyword in line.lower() for keyword in skip_keywords):
                continue
            # Question-like: contains "?" or a common interrogative/imperative word.
            has_question_mark = '?' in line
            has_question_word = any(word in line.lower() for word in ['describe', 'explain', 'what', 'how', 'why', 'when', 'where', 'who', 'can you', 'would you', 'do you', 'have you'])
            if not (has_question_mark or has_question_word):
                continue
            # Strip a leading bullet or "1." / "1)" numbering prefix.
            clean_line = re.sub(r'^(?:[-•*]|\d+[\.\)])\s*', '', line).strip()
            # Append a question mark unless the line already ENDS with other
            # terminal punctuation of its own.
            if clean_line and not clean_line.endswith('?') and clean_line[-1] not in ':!.':
                clean_line += '?'
            if len(clean_line) < self.MIN_QUESTION_LEN:
                continue
            questions.append(self._make_question(len(questions) + 1, clean_line))

        # If still nothing found, fall back to generic questions.
        return questions if questions else self._fallback_questions()

    def refine_question(self, question: str, improvement_type: str = "clarity") -> str:
        """
        Refine a single survey question - optimized for Mistral/Mixtral.

        Args:
            question: The question to improve
            improvement_type: Type of improvement (clarity, neutrality, specificity);
                unknown values fall back to "clarity"

        Returns:
            Improved question text
        """
        improvement_guidance = {
            "clarity": "Makes the question clearer and easier for respondents to understand without ambiguity",
            "neutrality": "Removes any bias, leading language, or assumptions that could influence responses",
            "specificity": "Makes the question more specific and actionable while remaining open-ended"
        }
        guidance = improvement_guidance.get(improvement_type, improvement_guidance["clarity"])
        prompt = f"""Task: Improve a survey question
**Original Question:** "{question}"
**Improvement Type:** {improvement_type.title()}
**Your Goal:** Rewrite this question so that it {guidance}.
**Guidelines:**
- Keep the question focused on a single topic
- Use simple, clear language appropriate for the target audience
- Avoid assumptions or leading language
- Ensure the question can elicit meaningful responses
Provide ONLY the improved question text. Do not include explanations or alternative versions."""
        messages = [
            {"role": "system", "content": "You are an expert survey question designer with deep experience in qualitative research methodology."},
            {"role": "user", "content": prompt}
        ]
        return self.llm.generate(messages, max_tokens=150, temperature=0.5).strip()

    def add_follow_up_questions(self, base_question: str, num_follow_ups: int = 3) -> List[str]:
        """
        Generate follow-up questions for deeper exploration - optimized for Mistral/Mixtral.

        Args:
            base_question: The main question
            num_follow_ups: Number of follow-up questions to generate

        Returns:
            List of follow-up question texts (generic elaboration prompts if
            nothing parseable came back from the model)
        """
        prompt = f"""Task: Generate probing follow-up questions
**Main Question:** {base_question}
**Your Task:** Create {num_follow_ups} thoughtful follow-up questions that probe deeper into the respondent's answer.
**Quality Criteria for Follow-ups:**
1. Each question should explore a different aspect, dimension, or implication of the main topic
2. Questions should encourage more detailed, nuanced responses
3. Follow a logical progression from the main question
4. Build on what a respondent might answer to the main question
5. Each should be specific but open-ended
**Format:** Number each question (1., 2., 3., etc.)
**Output {num_follow_ups} Follow-up Questions:**
1."""
        messages = [
            {"role": "system", "content": "You are an expert qualitative research interviewer skilled at designing probing questions that uncover deeper insights and nuances."},
            {"role": "user", "content": prompt}
        ]
        response = self.llm.generate(messages, max_tokens=500, temperature=0.7)

        # Try numbered list format first: capture up to the next number or EOF.
        matches = re.findall(r'\d+[\.\)]\s+(.+?)(?=\d+[\.\)]|\Z)', response, re.DOTALL)
        if matches:
            follow_ups = [m.split('\n')[0].strip() for m in matches if m.strip()][:num_follow_ups]
            # Ensure all end with a question mark.
            follow_ups = [q if q.endswith('?') else q + '?' for q in follow_ups]
            if follow_ups:
                return follow_ups

        # Fallback: split by newlines and keep lines containing a question mark.
        lines = [line.strip() for line in response.split("\n") if line.strip()]
        follow_ups = [line.lstrip("0123456789.-) ") for line in lines if "?" in line][:num_follow_ups]
        return follow_ups if follow_ups else [f"Can you elaborate on {base_question.lower()}?" for _ in range(num_follow_ups)]