morris-bot / validate_training_examples.py
eusholli's picture
Upload folder using huggingface_hub
599c2c0 verified
"""
Validate the quality of generated training examples
"""
import json
import re
from typing import List, Dict, Tuple
def analyze_training_examples(filepath: str) -> Dict:
    """Analyze the quality and characteristics of training examples.

    The file must contain a JSON list of examples. An example is analyzed only
    when it has a ``'messages'`` entry with at least three items; the
    ``'content'`` of the third message is treated as the article, and the text
    before the first blank line (minus a leading ``'# '``) as its title.

    Args:
        filepath: Path to the JSON file of training examples.

    Returns:
        A dict with these keys:

        - ``total_examples``: number of entries in the file.
        - ``provocative_titles``, ``cynical_phrases``, ``technical_content``,
          ``negative_analogies``: percentage (0-100) of examples that show
          that style element.
        - ``style_consistency``: share of the four style elements present,
          averaged over all examples, as a percentage.
        - ``avg_article_length``: total article characters floor-divided by
          the number of examples.
        - ``sample_titles``: titles of the first (up to) ten analyzed examples.

        Every percentage and the average use ``total_examples`` as the
        denominator, so skipped examples count as having no style elements.
        When the file holds an empty list, all metrics stay at 0.

    Raises:
        FileNotFoundError: If ``filepath`` does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        examples = json.load(f)

    analysis = {
        'total_examples': len(examples),
        'provocative_titles': 0,
        'cynical_phrases': 0,
        'technical_content': 0,
        'negative_analogies': 0,
        'avg_article_length': 0,
        'style_consistency': 0,
        'sample_titles': []
    }

    # Style indicators (all lowercase; matched as substrings of lowercased text)
    provocative_words = [
        'disaster', 'catastrophe', 'crash', 'burn', 'fail', 'collapse', 'meltdown',
        'nightmare', 'fiasco', 'debacle', 'train wreck', 'explosion', 'implosion'
    ]
    cynical_phrases = [
        'of course', 'naturally', 'predictably', 'unsurprisingly', 'evidently',
        'clearly', 'obviously', 'needless to say'
    ]
    negative_analogies = [
        'train wreck', 'collision', 'explosion', 'disaster', 'catastrophe',
        'meltdown', 'implosion', 'crash', 'carnival barker', 'unicorn'
    ]
    technical_terms = [
        'edge computing', 'automation', 'cloud', 'network',
        'operator', 'vendor', 'infrastructure', 'deployment', 'integration'
    ]
    # Acronyms are matched case-sensitively as whole words. A lowercase
    # substring test would find 'ai' in "said"/"again" and 'ran' in "brand",
    # which flags almost every article as technical.
    technical_acronyms = ['5G', 'RAN', 'AI']
    acronym_pattern = re.compile(
        r'\b(?:' + '|'.join(re.escape(term) for term in technical_acronyms) + r')\b'
    )

    total_length = 0
    style_score = 0

    for example in examples:
        if 'messages' not in example or len(example['messages']) < 3:
            continue

        content = example['messages'][2]['content']
        title_line = content.split('\n\n')[0]
        title = title_line[2:] if title_line.startswith('# ') else title_line

        # Collect sample titles
        if len(analysis['sample_titles']) < 10:
            analysis['sample_titles'].append(title)

        content_lower = content.lower()
        title_lower = title.lower()

        # Evaluate each style element once; the same flags feed both the
        # per-element counters and the per-example style score.
        style_flags = {
            'provocative_titles': any(word in title_lower for word in provocative_words),
            'cynical_phrases': any(phrase in content_lower for phrase in cynical_phrases),
            'technical_content': (
                acronym_pattern.search(content) is not None
                or any(term in content_lower for term in technical_terms)
            ),
            'negative_analogies': any(analogy in content_lower for analogy in negative_analogies),
        }
        for key, present in style_flags.items():
            if present:
                analysis[key] += 1

        total_length += len(content)
        # Style consistency score (0-4 based on presence of key elements)
        style_score += sum(style_flags.values())

    # Calculate averages and percentages
    if examples:
        example_count = len(examples)
        analysis['avg_article_length'] = total_length // example_count
        analysis['style_consistency'] = (style_score / (example_count * 4)) * 100
        # Convert counts to percentages
        for key in ('provocative_titles', 'cynical_phrases',
                    'technical_content', 'negative_analogies'):
            analysis[key] = (analysis[key] / example_count) * 100

    return analysis
def print_analysis_report(analysis: Dict):
    """Write a human-readable quality report for one analysis to stdout.

    Args:
        analysis: Mapping that provides ``total_examples``,
            ``avg_article_length``, the four style percentages
            (``provocative_titles``, ``cynical_phrases``,
            ``technical_content``, ``negative_analogies``),
            ``style_consistency`` and ``sample_titles``.
    """
    rule = "=" * 60
    for header_line in (rule, "TRAINING EXAMPLES QUALITY ANALYSIS", rule):
        print(header_line)
    print(f"Total Examples: {analysis['total_examples']}")
    print(f"Average Article Length: {analysis['avg_article_length']:,} characters")
    print()

    # (label, key) pairs, listed in the order they appear in the report.
    style_rows = (
        (" Provocative Titles", 'provocative_titles'),
        (" Cynical Phrases", 'cynical_phrases'),
        (" Technical Content", 'technical_content'),
        (" Negative Analogies", 'negative_analogies'),
        (" Overall Style Consistency", 'style_consistency'),
    )
    print("STYLE ANALYSIS:")
    for label, key in style_rows:
        print(f"{label}: {analysis[key]:.1f}%")
    print()

    print("SAMPLE TITLES:")
    position = 0
    for sample in analysis['sample_titles']:
        position += 1
        print(f" {position:2d}. {sample}")
    print()

    # The quality score is the plain mean of the four style percentages.
    scored_keys = (
        'provocative_titles',
        'cynical_phrases',
        'technical_content',
        'negative_analogies',
    )
    quality_score = sum(analysis[key] for key in scored_keys) / 4

    # Verdicts ordered from the highest floor down; the first floor met wins.
    verdicts = (
        (80, " ✅ EXCELLENT - High-quality examples with strong style consistency"),
        (60, " ✅ GOOD - Solid examples with good style elements"),
        (40, " ⚠️ FAIR - Acceptable but could use improvement"),
    )
    verdict = next(
        (text for floor, text in verdicts if quality_score >= floor),
        " ❌ POOR - Needs significant improvement",
    )

    print("QUALITY ASSESSMENT:")
    print(verdict)
    print(f" Overall Quality Score: {quality_score:.1f}%")
    print()
def compare_datasets(original_file: str, new_file: str):
    """Print the example counts and style consistency of two datasets.

    Both files are analyzed with ``analyze_training_examples`` (original
    first, then new) after the section header has been printed, so an error
    raised while loading either file leaves the header already on stdout.

    Args:
        original_file: Path to the baseline dataset JSON file.
        new_file: Path to the expanded dataset JSON file.
    """
    print("DATASET COMPARISON:")
    print("-" * 40)

    baseline, expanded = [
        analyze_training_examples(path) for path in (original_file, new_file)
    ]

    baseline_total = baseline['total_examples']
    expanded_total = expanded['total_examples']
    print(f"Original Dataset: {baseline_total} examples")
    print(f"Expanded Dataset: {expanded_total} examples")
    print(f"New Examples Added: {expanded_total - baseline_total}")
    print()

    baseline_style = baseline['style_consistency']
    expanded_style = expanded['style_consistency']
    print("STYLE CONSISTENCY COMPARISON:")
    print(f" Original: {baseline_style:.1f}%")
    print(f" Expanded: {expanded_style:.1f}%")

    # A drop in style consistency is the only case reported as a warning.
    if expanded_style < baseline_style:
        print(" ⚠️ Style consistency decreased")
    else:
        print(" ✅ Style consistency maintained or improved")
    print()
def main():
    """Report on the new and expanded datasets, then compare with the original.

    The dataset paths are hard-coded relative to the current working
    directory. Only a missing file during the comparison step is tolerated;
    a missing new or expanded dataset propagates as ``FileNotFoundError``.
    """
    print("Validating training examples quality...\n")

    # Each section prints its heading, analyzes its file, then prints a report.
    sections = (
        ("ANALYZING NEW EXAMPLES:", 'data/additional_training_examples.json'),
        ("ANALYZING EXPANDED DATASET:", 'data/expanded_train_dataset.json'),
    )
    for heading, dataset_path in sections:
        print(heading)
        print_analysis_report(analyze_training_examples(dataset_path))

    # The original dataset is optional; without it the comparison is skipped.
    try:
        compare_datasets('data/train_dataset.json', 'data/expanded_train_dataset.json')
    except FileNotFoundError:
        print("Original dataset not found for comparison.")

    closing_rule = "=" * 60
    print(closing_rule)
    print("VALIDATION COMPLETE")
    print(closing_rule)


# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()