import json
from typing import Any, Dict, List

import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoTokenizer


class HFTokenizerTestSuite:
    def __init__(self, model_name: str, test_data_paths: List[str]):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        self.languages = ['hindi', 'english']

        self.edge_cases = {
            'hindi': {
                'script_test': 'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।',
                'unicode_test': 'हिन्दी १२३४५६७८९ vowels: अ आ इ ई उ ऊ',
                'special_chars': 'हिन्दी! @ # $ % ^ & * ( ) _ + = [ ] { }',
            },
            'english': {
                'script_test': 'Hello, I am from the United States. New York is a beautiful city.',
                'unicode_test': 'English 0123456789 vowels: a e i o u',
                'special_chars': 'English! @ # $ % ^ & * ( ) _ + = [ ] { }',
            }
        }

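        # The Hindi probes above exercise Devanagari code points (digits
        # १२३४५६७८९, independent vowels) that subword tokenizers may split
        # far more aggressively than their ASCII counterparts.
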
        self.test_data = self._load_test_data(test_data_paths)

        # Result buckets; keys match what each analysis method writes.
        self.results = {
            'unicode_coverage': {},
            'script_complexity': {},
            'edge_cases': {}
        }

    def _load_test_data(self, data_paths: List[str]) -> Dict[str, List[str]]:
        test_data = {lang: [] for lang in self.languages}

        for path in data_paths:
            try:
                with open(path, 'r', encoding='utf-8') as f:
                    texts = f.readlines()

                # Assumes each file interleaves languages line by line
                # (line 0 -> hindi, line 1 -> english, line 2 -> hindi, ...).
                for i, text in enumerate(texts):
                    lang = self.languages[i % len(self.languages)]
                    test_data[lang].append(text.strip())
            except Exception as e:
                print(f"Error loading {path}: {e}")

        return test_data

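    # Sketch of an alternative loader for corpora stored in per-language
    # files instead of one interleaved file (names here are hypothetical):
    #
    #   def _load_per_language(self, paths: Dict[str, str]) -> Dict[str, List[str]]:
    #       data = {}
    #       for lang, path in paths.items():  # e.g. {'hindi': 'hi.txt', 'english': 'en.txt'}
    #           with open(path, 'r', encoding='utf-8') as f:
    #               data[lang] = [line.strip() for line in f if line.strip()]
    #       return data
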
    def unicode_coverage_analysis(self) -> Dict[str, Any]:
        unicode_results = {}

        for lang, edge_cases in self.edge_cases.items():
            unicode_test = edge_cases['unicode_test']
            input_ids = self.tokenizer(unicode_test)['input_ids']
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

            unicode_results[lang] = {
                'original_text': unicode_test,
                'tokens': tokens,
                'token_count': len(tokens),
                'unique_tokens': len(set(tokens)),
                # Diversity of the probe string (unique/total tokens), not
                # true coverage of a Unicode block.
                'coverage_ratio': len(set(tokens)) / len(tokens) if tokens else 0.0
            }

        self.results['unicode_coverage'] = unicode_results
        return unicode_results

    def language_specific_edge_cases(self) -> Dict[str, Any]:
        edge_case_results = {}

        for lang, cases in self.edge_cases.items():
            language_results = {}

            for case_name, text in cases.items():
                try:
                    input_ids = self.tokenizer(text)['input_ids']
                    tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

                    language_results[case_name] = {
                        'tokens': tokens,
                        'token_count': len(tokens),
                        'unique_tokens': len(set(tokens))
                    }
                except Exception as e:
                    language_results[case_name] = {
                        'error': str(e)
                    }

            edge_case_results[lang] = language_results

        self.results['edge_cases'] = edge_case_results
        return edge_case_results

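    # Example result shape (token strings and counts depend on the model's
    # vocabulary):
    #   {'hindi': {'script_test': {'tokens': [...], 'token_count': N,
    #                              'unique_tokens': M}, ...},
    #    'english': {...}}
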
    def script_complexity_analysis(self) -> Dict[str, Any]:
        complexity_results = {}

        for lang in self.languages:
            text = self.edge_cases[lang]['script_test']

            input_ids = self.tokenizer(text)['input_ids']
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

            # Drop special tokens via the tokenizer's own list rather than a
            # bracket heuristic, which misses the <s>/</s>-style markers used
            # by sentencepiece/XLM-R-family models.
            special_tokens = set(self.tokenizer.all_special_tokens)
            filtered_tokens = [token for token in tokens if token not in special_tokens]

            complexity_results[lang] = {
                'original_text_length': len(text),
                'tokens': tokens,
                'token_count': len(tokens),
                # Cast to float so the results dict stays JSON-serializable.
                'avg_token_length': float(np.mean([len(token) for token in filtered_tokens])) if filtered_tokens else 0.0,
                'token_diversity': len(set(tokens)) / len(tokens) if tokens else 0.0
            }

        self.results['script_complexity'] = complexity_results
        return complexity_results

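    # A rough "fertility"-style ratio can be derived from the fields above
    # (illustrative only; fertility is more commonly measured per word):
    #   tokens_per_char = result['token_count'] / result['original_text_length']
    # Higher values indicate the script is being fragmented more heavily.
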
    def generate_token_histograms(self):
        plt.figure(figsize=(15, 10))

        for i, lang in enumerate(self.languages):
            text = self.test_data[lang][0] if self.test_data[lang] else self.edge_cases[lang]['script_test']

            input_ids = self.tokenizer(text)['input_ids']
            tokens = self.tokenizer.convert_ids_to_tokens(input_ids)

            # Exclude special tokens so padding/boundary markers do not skew
            # the length distribution.
            special_tokens = set(self.tokenizer.all_special_tokens)
            filtered_tokens = [token for token in tokens if token not in special_tokens]
            token_lengths = [len(token) for token in filtered_tokens]
            if not token_lengths:
                continue  # nothing to plot for this language

            plt.subplot(len(self.languages), 1, i + 1)
            plt.hist(token_lengths, bins=range(1, max(token_lengths) + 2), alpha=0.7)
            plt.title(f'Token Length Distribution for {lang.capitalize()}')
            plt.xlabel('Token Length')
            plt.ylabel('Frequency')
            plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig('token_length_histograms.png')
        plt.close()

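    # Note: the figure above (and the one below) is saved to the current
    # working directory; pass absolute paths to savefig to write elsewhere.
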
    def generate_unicode_visualization(self):
        plt.figure(figsize=(15, 10))

        unicode_results = self.results.get('unicode_coverage', {})
        languages = list(unicode_results.keys())

        plt.subplot(2, 1, 1)
        token_counts = [result['token_count'] for result in unicode_results.values()]
        plt.bar(languages, token_counts)
        plt.title('Token Count in Unicode Test Texts')
        plt.xlabel('Language')
        plt.ylabel('Number of Tokens')
        plt.xticks(rotation=45)

        plt.subplot(2, 1, 2)
        coverage_ratios = [result['coverage_ratio'] for result in unicode_results.values()]
        plt.bar(languages, coverage_ratios)
        plt.title('Token Diversity Ratio')
        plt.xlabel('Language')
        plt.ylabel('Unique Tokens / Total Tokens')
        plt.xticks(rotation=45)

        plt.tight_layout()
        plt.savefig('unicode_token_analysis.png')
        plt.close()

    def run_all_tests(self):
        print("Running Tokenizer Test Suite for Hindi and English...")

        print("1. Unicode Coverage Analysis...")
        self.unicode_coverage_analysis()

        print("2. Language-Specific Edge Cases...")
        self.language_specific_edge_cases()

        print("3. Script Complexity Analysis...")
        self.script_complexity_analysis()

        print("4. Generating Token Histograms...")
        self.generate_token_histograms()

        print("5. Generating Unicode Visualizations...")
        self.generate_unicode_visualization()

        print("Test Suite Complete!")

        return self.results

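# The analyses can also be run individually, e.g.:
#   suite = HFTokenizerTestSuite(MODEL_NAME, TEST_DATA_PATHS)
#   suite.unicode_coverage_analysis()  # must run before generate_unicode_visualization()
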
if __name__ == "__main__":
    MODEL_NAME = "tinycompany/ShawtyIsBad-bgem3"

    TEST_DATA_PATHS = [
        './test2.txt'
    ]

    test_suite = HFTokenizerTestSuite(MODEL_NAME, TEST_DATA_PATHS)
    results = test_suite.run_all_tests()

    with open('result1.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)
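
    # If a non-JSON-native value (e.g. a numpy scalar) ever reaches this
    # point, json.dump(results, f, default=str) is a defensive fallback.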