"""
Test script to verify the fixes for ASL gloss processing.
"""

import asyncio
import re

from vectorizer import Vectorizer


def clean_gloss_token(token):
    """
    Clean a gloss token by removing brackets, newlines, and extra whitespace.
    """
    # Drop the bracket characters and any embedded line breaks.
    cleaned = re.sub(r'[\[\]\n\r]', '', token)
    # Collapse remaining whitespace runs, trim the ends, and lowercase.
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned.lower()
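
# A few quick sanity checks derived from the regex rules above; they run once
# at import time and document the expected behaviour:
assert clean_gloss_token("[GET-STUCK]\n") == "get-stuck"
assert clean_gloss_token("ASL") == "asl"
assert clean_gloss_token("[]") == ""  # empty results are dropped by callers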


def test_gloss_parsing():
    """Test the gloss parsing functionality."""
    sample_gloss = ("ASL [BEAR] [NAME] [OSKI] [LOVE] [HONEY] [BUT] [ALWAYS] "
                    "[GET-STUCK] [TREE]\n\n[ONE_DAY] [HE] [DISCOVER] [LADDER]\n\n"
                    "[PROBLEM] [SOLVE] [FINISH]")

    print("Original gloss:")
    print(sample_gloss)
    print("\n" + "=" * 50 + "\n")

    # split() with no arguments breaks on any whitespace run, so the blank
    # lines between sentences are handled the same way as single spaces.
    gloss_tokens = sample_gloss.split()
    cleaned_tokens = []
    for token in gloss_tokens:
        cleaned = clean_gloss_token(token)
        if cleaned:
            cleaned_tokens.append(cleaned)

    print("Cleaned tokens:")
    for i, token in enumerate(cleaned_tokens):
        print(f"{i + 1:2d}. {token}")

    return cleaned_tokens
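
# With the sample gloss above, test_gloss_parsing() returns 17 tokens:
#     asl, bear, name, oski, love, honey, but, always, get-stuck, tree,
#     one_day, he, discover, ladder, problem, solve, finish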


async def test_vectorizer():
    """Test the vectorizer functionality."""
    try:
        vectorizer = Vectorizer()

        test_words = ["BEAR", "LOVE", "TREE", "HE", "FINISH"]
        for word in test_words:
            print(f"\nTesting word: {word}")
            result = await vectorizer.vector_query_from_supabase(word)
            print(f"Result: {result}")
    except Exception as e:
        # Surface the error without aborting the overall test run.
        print(f"Error testing vectorizer: {e}")


async def main():
    """Main test function."""
    print("Testing ASL Gloss Processing Fixes")
    print("=" * 50)

    print("\n1. Testing gloss parsing...")
    cleaned_tokens = test_gloss_parsing()
    print(f"Total cleaned tokens: {len(cleaned_tokens)}")

    print("\n2. Testing vectorizer...")
    await test_vectorizer()

    print("\n" + "=" * 50)
    print("Test completed!")


if __name__ == "__main__":
    asyncio.run(main())