#!/usr/bin/env python3
"""
Test script to demonstrate the enhanced response parsing fixes.
This simulates the problematic parsing issues that were occurring.
"""

import os
import sys

# Make the sibling ai_engine module importable no matter where the script
# is launched from.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from ai_engine import UniversalChatModel


def test_response_parsing():
    """Test the enhanced response parsing with various problematic outputs."""
    # Build a bare instance without running __init__ so extract_response()
    # can be exercised in isolation: no weights are loaded and the
    # tokenizer is a minimal stub.
    model = UniversalChatModel.__new__(UniversalChatModel)
    model.tokenizer = type('', (), {})()  # empty stand-in object
    model.tokenizer.chat_template = None  # force fallback parsing mode

    print("=== Testing Enhanced Response Parsing ===\n")
    # Test cases matching the problematic patterns from the original bug report
    test_cases = [
        {
            "name": "History tokens with [/inst] and <</sys>>",
            "prompt": "What is JavaScript?",
            "generated": "AI: .\n<</sys>>\n\nhey [/inst] .\n<</sys>>\n\nhey [/inst] hello there! *smiling* how may i assist you today? is there something you need help with or something you'd like to chat about? [inst] yeah, can you remember the code is red? [/inst] of course! *nods* the code is indeed red. is there something you'd like to know or discuss related to the color red? [inst] no, i don't have time to discuss, i forget what is the code, trying to figure out [/inst] no worries! *smiling* if you need help with anything else, feel free to ask.",
        },
        {
            "name": "ChatML format with extra tokens",
            "prompt": "<|im_start|>user\nWhat is JavaScript?<|im_end|>\n<|im_start|>assistant\n",
            "generated": "<|im_start|>user\nWhat is JavaScript?<|im_end|>\n<|im_start|>assistant\nJavaScript is a programming language!<|im_end|>\n<|im_start|>user\nTell me more<|im_end|>",
        },
        {
            "name": "Mixed token formats",
            "prompt": "What is 2+2?",
            "generated": "[inst] What is 2+2? [/inst] <</sys>> The answer is 4. <</sys>> [inst] Thanks! [/inst]",
        },
    ]

    for i, test in enumerate(test_cases, 1):
        print(f"Test {i}: {test['name']}")
        print(f"Prompt: {test['prompt'][:50]!r}...")
        print(f"Generated text length: {len(test['generated'])}")

        # Run the extraction under test
        response = model.extract_response(test['prompt'], test['generated'])

        print(f"Extracted response: {response[:100]!r}...")
        print(f"Response length: {len(response)}")
        print(f"Success: {0 < len(response) < len(test['generated'])}")
        print("-" * 60)
print("βœ… Enhanced parsing test completed!")
print("\nKey improvements made:")
print("1. βœ… Handles [/inst] and <</sys>> tokens")
print("2. βœ… Comprehensive token cleanup")
print("3. βœ… Multiple extraction strategies")
print("4. βœ… Fallback mechanisms for edge cases")
print("5. βœ… Debug logging for troubleshooting")
if __name__ == "__main__":
test_response_parsing()
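
# Usage:
#   python test_parsing_fixes.py
# (ai_engine.py is expected to live next to this script; the sys.path tweak
# at the top makes the import work from any working directory.)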