# trace-crs-chatbot/utils/clarification_export.py
# Author: Ashmi Banerjee
# Commit: Initial deployment: Sustainable Tourism CRS Chatbot
# Revision: c3674d7
"""
Clarification Data Export Utility
Export clarification questions and answers from the database for analysis
"""
import asyncio
import json
from typing import Dict, Any, List, Optional
from datetime import datetime, timedelta
from database.config import get_conversation_store
def _parse_created_at(value: Any) -> Optional[datetime]:
    """Best-effort conversion of a stored ``created_at`` value to a datetime.

    Accepts ``datetime`` instances and ISO-8601 strings (a trailing ``Z`` is
    rewritten to ``+00:00`` so older ``fromisoformat`` versions accept it).
    Returns ``None`` for anything else so the caller can decide how to treat
    records whose age cannot be determined.
    """
    if isinstance(value, datetime):
        return value
    if isinstance(value, str):
        text = value.strip()
        if text.endswith(('Z', 'z')):
            text = text[:-1] + '+00:00'
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return None
    return None


def _is_within_days(created_at: Any, days_back: Optional[int]) -> bool:
    """Return True if ``created_at`` falls inside the last ``days_back`` days.

    Records with a missing or unparseable timestamp are kept (True) rather
    than silently dropped, and ``days_back=None`` disables the filter.
    """
    if days_back is None:
        return True
    parsed = _parse_created_at(created_at)
    if parsed is None:
        return True
    # Compare like with like: aware timestamps against an aware "now" in the
    # same zone, naive timestamps against a naive local "now".
    # NOTE(review): assumes naive stored timestamps are local time -- confirm.
    now = datetime.now(parsed.tzinfo)
    return parsed >= now - timedelta(days=days_back)


async def export_clarification_data(
    output_file: str = "clarification_data.json",
    days_back: Optional[int] = 30,
    format: str = "detailed"
) -> Dict[str, Any]:
    """
    Export clarification question/answer pairs from the database

    Fetches up to 10000 conversations from the conversation store, keeps the
    ones that carry ``clarification_answers`` and are no older than
    ``days_back`` days, and writes them to ``output_file`` as JSON.

    Args:
        output_file: Output filename
        days_back: Number of days to look back. Conversations whose
            ``created_at`` is older are skipped; conversations with a
            missing or unparseable timestamp are kept. ``None`` disables
            the age filter.
        format: 'detailed' (question_id/question/answer per pair); any other
            value produces the 'simple' shape (q/a per pair)

    Returns:
        Dictionary with export statistics and the exported data (the same
        object that is written to ``output_file``)
    """
    store = get_conversation_store()

    # Get all conversations
    conversations = await store.export_for_training(
        output_format='full',
        limit=10000
    )

    clarification_data = []
    total_qa_pairs = 0

    for conv in conversations:
        # ``or {}`` guards against the key being present with a None value,
        # which a plain ``.get(key, {})`` default would not catch.
        answers = (conv.get('collected_entities') or {}).get('clarification_answers')
        if not answers:
            continue

        created_at = conv.get('created_at')
        if not _is_within_days(created_at, days_back):
            continue

        # Extract Q&A pairs
        qa_pairs = []
        for q_id, qa_data in answers.items():
            if format == 'detailed':
                qa_pairs.append({
                    'question_id': q_id,
                    'question': qa_data.get('question', ''),
                    'answer': qa_data.get('answer', '')
                })
            else:
                qa_pairs.append({
                    'q': qa_data.get('question', ''),
                    'a': qa_data.get('answer', '')
                })
        total_qa_pairs += len(qa_pairs)

        # The original query is the first history message tagged as the
        # clarification trigger; None when no such message exists.
        original_query = next(
            (
                msg.get('content')
                for msg in (conv.get('conversation_history') or [])
                if (msg.get('metadata') or {}).get('type') == 'clarification_trigger'
            ),
            None,
        )

        clarification_data.append({
            'session_id': conv.get('session_id'),
            # datetime objects are not JSON serializable; emit ISO text instead.
            'timestamp': created_at.isoformat() if isinstance(created_at, datetime) else created_at,
            'user_type': conv.get('user_type', 'unknown'),
            'original_query': original_query,
            'clarification_qa': qa_pairs
        })

    total_sessions = len(clarification_data)

    # Export to file
    export = {
        'export_date': datetime.now().isoformat(),
        'total_sessions': total_sessions,
        'total_qa_pairs': total_qa_pairs,
        'format': format,
        'days_back': days_back,
        'data': clarification_data
    }

    # Explicit UTF-8 so non-ASCII user text is written readably and the
    # result does not depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(export, f, indent=2, ensure_ascii=False)

    print(f"✅ Exported {total_sessions} sessions with {total_qa_pairs} Q&A pairs")
    print(f" Output: {output_file}")

    return export
async def get_clarification_statistics() -> Dict[str, Any]:
    """
    Get statistics about clarification usage

    Fetches up to 10000 conversations from the conversation store and counts
    how many contain ``clarification_answers``, how many answers were
    collected, and what share of clarified conversations is flagged
    ``clarification_complete``.

    Returns:
        Dictionary with the keys ``total_conversations``,
        ``with_clarification``, ``without_clarification``,
        ``total_questions_asked``, ``total_answers_collected`` and
        ``completion_rate`` (a percentage; 0 when no conversation used
        clarification). ``total_questions_asked`` is not populated by this
        function and is always 0.
    """
    store = get_conversation_store()

    conversations = await store.export_for_training(
        output_format='full',
        limit=10000
    )

    stats = {
        'total_conversations': len(conversations),
        'with_clarification': 0,
        'without_clarification': 0,
        'total_questions_asked': 0,
        'total_answers_collected': 0,
        'completion_rate': 0
    }
    # A per-category breakdown ('category_breakdown', keyed by each answer's
    # 'category') is currently disabled.

    clarification_completed = 0

    for conv in conversations:
        # ``or {}`` guards against the key being present with a None value,
        # which a plain ``.get(key, {})`` default would not catch.
        clarification_answers = (conv.get('collected_entities') or {}).get('clarification_answers')

        if not clarification_answers:
            stats['without_clarification'] += 1
            continue

        stats['with_clarification'] += 1

        # Check if completed
        if conv.get('clarification_complete'):
            clarification_completed += 1

        # One collected answer per entry in the answers mapping.
        stats['total_answers_collected'] += len(clarification_answers)

    if stats['with_clarification'] > 0:
        stats['completion_rate'] = (clarification_completed / stats['with_clarification']) * 100

    return stats
async def print_clarification_report():
    """Print a report of clarification usage to stdout.

    Awaits ``get_clarification_statistics()`` and prints the conversation
    totals, the usage rate (only when at least one conversation used
    clarification), the number of collected answers and the completion rate,
    framed by 60-character ``=`` rules.
    """
    rule = "=" * 60

    print(rule)
    print("CLARIFICATION FLOW USAGE REPORT")
    print(rule)

    report = await get_clarification_statistics()
    clarified = report['with_clarification']

    print(f"\nTotal Conversations: {report['total_conversations']}")
    print(f" With Clarification: {clarified}")
    print(f" Without Clarification: {report['without_clarification']}")

    # clarified > 0 implies total_conversations > 0, so the division is safe.
    if clarified > 0:
        usage_rate = (clarified / report['total_conversations']) * 100
        print(f" Usage Rate: {usage_rate:.1f}%")

    print("\nClarification Stats:")
    print(f" Total Answers Collected: {report['total_answers_collected']}")
    print(f" Completion Rate: {report['completion_rate']:.1f}%")

    # The per-category breakdown section is currently disabled.

    print("\n" + rule)
async def export_for_training(
    output_file: str = "clarification_training_data.jsonl"
) -> None:
    """
    Export clarification data in JSONL format for model training

    Fetches up to 10000 conversations via the store's own
    ``export_for_training`` method and writes one JSON object per line for
    every clarification answer. Conversations without clarification answers,
    or without a history message tagged ``clarification_trigger`` (the
    source of the ``query`` field), are skipped.

    Each line has the keys ``query``, ``clarifying_question``,
    ``user_answer`` and ``user_type``.

    Args:
        output_file: Output filename
    """
    store = get_conversation_store()

    conversations = await store.export_for_training(
        output_format='full',
        limit=10000
    )

    training_examples = []

    for conv in conversations:
        # ``or {}`` guards against the key being present with a None value,
        # which a plain ``.get(key, {})`` default would not catch.
        clarification_answers = (conv.get('collected_entities') or {}).get('clarification_answers')
        if not clarification_answers:
            continue

        # Extract original query: content of the first trigger message.
        original_query = next(
            (
                msg.get('content')
                for msg in (conv.get('conversation_history') or [])
                if (msg.get('metadata') or {}).get('type') == 'clarification_trigger'
            ),
            None,
        )
        if not original_query:
            continue

        user_type = conv.get('user_type', 'unknown')

        # Create one training example per answered clarification question
        for qa_data in clarification_answers.values():
            training_examples.append({
                "query": original_query,
                "clarifying_question": qa_data.get('question'),
                "user_answer": qa_data.get('answer'),
                "user_type": user_type
            })

    # Write JSONL. Explicit UTF-8 so non-ASCII user text is written readably
    # and the result does not depend on the platform's default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            json.dumps(example, ensure_ascii=False) + '\n'
            for example in training_examples
        )

    print(f"✅ Exported {len(training_examples)} training examples")
    print(f" Output: {output_file}")
if __name__ == "__main__":
    # Minimal CLI: the first argument selects which coroutine to run.
    import sys

    cli_args = sys.argv[1:]

    if not cli_args:
        # No command given: show the help text.
        print("Clarification Data Export Utility")
        print("\nUsage:")
        print(" python clarification_export.py export - Export all clarification data")
        print(" python clarification_export.py stats - Show usage statistics")
        print(" python clarification_export.py training - Export for model training")
    else:
        command = cli_args[0]

        # Command name -> coroutine function, each invoked with its defaults.
        handlers = {
            "export": export_clarification_data,
            "stats": print_clarification_report,
            "training": export_for_training,
        }
        handler = handlers.get(command)

        if handler is None:
            print(f"Unknown command: {command}")
            print("Usage: python clarification_export.py [export|stats|training]")
        else:
            asyncio.run(handler())