Spaces:

vimalk78
/

abc123

Running

App Files Files Community

abc123 / hack /test_semantic_clues.py

vimalk78

feat(crossword): generated crosswords with clues

486eff6 8 months ago

raw

history blame

4.58 kB

	#!/usr/bin/env python3
	"""
	Quick Test: Semantic Template Clue Generation
	Test the semantic template approach against the same problematic examples that failed with the LLM-based approach.
	"""

	import sys
	import logging
	from pathlib import Path

	# Add hack directory to path for imports
	sys.path.insert(0, str(Path(__file__).parent))

	# Record whether the generator module could be imported so the test can
	# report the problem and bail out instead of crashing at import time.
	try:
	    from semantic_clue_generator import SemanticClueGenerator
	except ImportError as import_error:
	    print(f"❌ Import error: {import_error}")
	    GENERATOR_AVAILABLE = False
	else:
	    GENERATOR_AVAILABLE = True

	# INFO-level logging with timestamp, logger name and level on every record
	logging.basicConfig(
	    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
	    level=logging.INFO,
	)
	logger = logging.getLogger(__name__)


	# Substrings the quality check treats as markers of a junk clue.
	_JUNK_PHRASES = ('trick and treating', 'gritting your teeth', 'fender', 'occurrence')


	def _is_good_clue(word, clue):
	    """Return True when *clue* passes the quality heuristics for *word*.

	    A clue passes when it is non-empty, longer than three characters, does
	    not contain the answer word (case-insensitive) and contains none of
	    the junk phrases.
	    """
	    if not clue or len(clue) <= 3:
	        return False
	    lowered = clue.lower()  # lower-case once instead of once per junk phrase
	    if word.lower() in lowered:
	        return False
	    return not any(junk in lowered for junk in _JUNK_PHRASES)


	def test_semantic_clues():
	    """Test semantic template clue generation with problematic examples.

	    For each hard-coded (word, topic) pair, asks the generator for one clue
	    per style, prints the distinct candidates, rates the first candidate
	    with the quality heuristics and finally prints a summary with the
	    success rate. Returns None; returns early when the generator could not
	    be imported or fails to initialize.
	    """
	    if not GENERATOR_AVAILABLE:
	        print("❌ Cannot run test - Semantic generator not available")
	        return

	    print("🧪 Testing Semantic Template Clue Generation")
	    print("=" * 60)

	    # Initialize generator
	    print("🔄 Initializing semantic clue generator...")
	    generator = SemanticClueGenerator()

	    try:
	        generator.initialize()
	        print("✅ Generator initialized successfully")
	    except Exception as e:
	        print(f"❌ Failed to initialize generator: {e}")
	        return

	    # Test cases that failed with LLM - same examples user reported as junk
	    test_cases = [
	        # Previously bad LLM examples
	        ("CAT", "animals"),
	        ("KITTY", "animals"),
	        ("MEAL", "food"),
	        ("HUNGER", "food"),
	        ("TECH", "technology"),
	        ("SCIENTIST", "science"),

	        # Additional test cases
	        ("DOG", "animals"),
	        ("PYTHON", "technology"),
	        ("GUITAR", "music"),
	        ("OCEAN", "geography"),
	        ("ATOM", "science"),
	        ("PIZZA", "food"),
	    ]

	    # One clue is requested per style to get some variety; the tuple is
	    # loop-invariant, so it is built once.
	    styles = ("category", "definition", "description")

	    print(f"\n🎯 Testing {len(test_cases)} word-topic combinations")
	    print("=" * 60)

	    successful_clues = 0
	    total_tests = len(test_cases)

	    for word, topic in test_cases:
	        print(f"\n📝 Testing: '{word}' + '{topic}'")
	        print("-" * 40)

	        try:
	            candidates = []
	            for style in styles:
	                clue = generator.generate_clue(
	                    word=word,
	                    topic=topic,
	                    clue_style=style
	                )
	                # Skip empty results and duplicates across styles
	                if clue and clue not in candidates:
	                    candidates.append(clue)

	            print(f"Generated {len(candidates)} candidates:")
	            for i, candidate in enumerate(candidates, 1):
	                print(f" {i}. {candidate}")

	            # Use the first/best clue
	            best_clue = candidates[0] if candidates else None

	            print(f"\n🏆 Best clue: {best_clue}")

	            # Quality evaluation - more comprehensive than LLM test
	            if _is_good_clue(word, best_clue):
	                successful_clues += 1
	                print("✅ Quality: GOOD")
	            else:
	                print("❌ Quality: POOR")

	        except Exception as e:
	            # A failure on one pair must not abort the remaining pairs;
	            # report it, log the traceback and continue.
	            print(f"❌ Error generating clue: {e}")
	            logger.exception("Detailed error:")

	    print("\n" + "=" * 60)
	    print("📊 SEMANTIC TEMPLATE RESULTS")
	    print("=" * 60)
	    print(f"Total tests: {total_tests}")
	    print(f"Successful clues: {successful_clues}")
	    print(f"Success rate: {(successful_clues/total_tests)*100:.1f}%")

	    # Compare with LLM performance (which was ~0% success)
	    if successful_clues >= total_tests * 0.8:  # 80% success rate
	        print("🎉 Semantic templates show MAJOR improvement over LLM!")
	    elif successful_clues >= total_tests * 0.6:  # 60% success rate
	        print("🔄 Good improvement, semantic approach is viable")
	    elif successful_clues >= total_tests * 0.3:  # 30% success rate
	        print("⚠️ Some improvement, but templates need refinement")
	    else:
	        print("❌ Semantic approach also struggling, may need hybrid method")


	def main():
	    """Script entry point: execute the semantic template clue test run."""
	    test_semantic_clues()


	# Run the test only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()