aloobun committed
Commit 551eebe · verified · 1 Parent(s): b8eb008

Create test_script.py

Files changed (1)
  1. test_bg/test_script.py +202 -0
test_bg/test_script.py ADDED
@@ -0,0 +1,202 @@
+import os
+import json
+import numpy as np
+import matplotlib.pyplot as plt
+from typing import List, Dict, Any
+from transformers import AutoTokenizer
+
+class HFTokenizerTestSuite:
+    def __init__(self, model_name: str, test_data_paths: List[str]):
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        self.languages = ['hindi', 'english']
+
+        self.edge_cases = {
+            'hindi': {
+                'script_test': 'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।',
+                'unicode_test': 'हिन्दी १२३४५६७८९ vowels: अ आ इ ई उ ऊ',
+                'special_chars': 'हिन्दी! @ # $ % ^ & * ( ) _ + = [ ] { }',
+            },
+            'english': {
+                'script_test': 'Hello, I am from the United States. New York is a beautiful city.',
+                'unicode_test': 'English 0123456789 vowels: a e i o u',
+                'special_chars': 'English! @ # $ % ^ & * ( ) _ + = [ ] { }',
+            }
+        }
+
+        self.test_data = self._load_test_data(test_data_paths)
+
+        # Keys match the result sections actually populated by the analysis methods below.
+        self.results = {
+            'unicode_coverage': {},
+            'script_complexity': {},
+            'edge_cases': {}
+        }
+
+    def _load_test_data(self, data_paths: List[str]) -> Dict[str, List[str]]:
+        test_data = {lang: [] for lang in self.languages}
+
+        for path in data_paths:
+            try:
+                with open(path, 'r', encoding='utf-8') as f:
+                    texts = f.readlines()
+
+                # Assign lines to languages round-robin: even lines -> hindi, odd lines -> english.
+                for i, text in enumerate(texts):
+                    lang = self.languages[i % len(self.languages)]
+                    test_data[lang].append(text.strip())
+            except Exception as e:
+                print(f"Error loading {path}: {e}")
+
+        return test_data
+
+    def unicode_coverage_analysis(self) -> Dict[str, Any]:
+        unicode_results = {}
+
+        for lang, edge_cases in self.edge_cases.items():
+            unicode_test = edge_cases['unicode_test']
+            tokenizer_output = self.tokenizer(unicode_test, return_tensors="pt")
+            tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_output['input_ids'][0])
+
+            unicode_results[lang] = {
+                'original_text': unicode_test,
+                'tokens': tokens,
+                'token_count': len(tokens),
+                'unique_tokens': len(set(tokens)),
+                'coverage_ratio': len(set(tokens)) / len(tokens)
+            }
+
+        self.results['unicode_coverage'] = unicode_results
+        return unicode_results
+
+    def language_specific_edge_cases(self) -> Dict[str, Any]:
+        edge_case_results = {}
+
+        for lang, cases in self.edge_cases.items():
+            language_results = {}
+
+            for case_name, text in cases.items():
+                try:
+                    tokenizer_output = self.tokenizer(text, return_tensors="pt")
+                    tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_output['input_ids'][0])
+
+                    language_results[case_name] = {
+                        'tokens': tokens,
+                        'token_count': len(tokens),
+                        'unique_tokens': len(set(tokens))
+                    }
+                except Exception as e:
+                    language_results[case_name] = {
+                        'error': str(e)
+                    }
+
+            edge_case_results[lang] = language_results
+
+        self.results['edge_cases'] = edge_case_results
+        return edge_case_results
+
+    def script_complexity_analysis(self) -> Dict[str, Any]:
+        complexity_results = {}
+
+        for lang in self.languages:
+            text = self.edge_cases[lang]['script_test']
+
+            tokenizer_output = self.tokenizer(text, return_tensors="pt")
+            tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_output['input_ids'][0])
+
+            # Filter out special tokens for accurate length calculation
+            # (works for any model, not only tokenizers with bracketed tokens like [CLS]).
+            filtered_tokens = [token for token in tokens if token not in self.tokenizer.all_special_tokens]
+
+            complexity_results[lang] = {
+                'original_text_length': len(text),
+                'tokens': tokens,
+                'token_count': len(tokens),
+                # Cast to float so the value stays JSON-serializable (np.mean returns np.float64).
+                'avg_token_length': float(np.mean([len(token) for token in filtered_tokens])) if filtered_tokens else 0,
+                'token_diversity': len(set(tokens)) / len(tokens)
+            }
+
+        self.results['script_complexity'] = complexity_results
+        return complexity_results
+
+    def generate_token_histograms(self):
+        plt.figure(figsize=(15, 10))
+
+        for i, lang in enumerate(self.languages):
+            text = self.test_data[lang][0] if self.test_data[lang] else self.edge_cases[lang]['script_test']
+
+            tokenizer_output = self.tokenizer(text, return_tensors="pt")
+            tokens = self.tokenizer.convert_ids_to_tokens(tokenizer_output['input_ids'][0])
+
+            # Filter out special tokens
+            filtered_tokens = [token for token in tokens if token not in self.tokenizer.all_special_tokens]
+            token_lengths = [len(token) for token in filtered_tokens]
+
+            plt.subplot(len(self.languages), 1, i + 1)
+            plt.hist(token_lengths, bins=range(1, max(token_lengths) + 2), alpha=0.7)
+            plt.title(f'Token Length Distribution for {lang.capitalize()}')
+            plt.xlabel('Token Length')
+            plt.ylabel('Frequency')
+            plt.grid(True, alpha=0.3)
+
+        plt.tight_layout()
+        plt.savefig('token_length_histograms.png')
+        plt.close()
+
+    def generate_unicode_visualization(self):
+        plt.figure(figsize=(15, 10))
+
+        unicode_results = self.results.get('unicode_coverage', {})
+
+        plt.subplot(2, 1, 1)
+        token_counts = [result['token_count'] for result in unicode_results.values()]
+        plt.bar(unicode_results.keys(), token_counts)
+        plt.title('Token Count in Unicode Test Texts')
+        plt.xlabel('Language')
+        plt.ylabel('Number of Tokens')
+        plt.xticks(rotation=45)
+
+        plt.subplot(2, 1, 2)
+        coverage_ratios = [result['coverage_ratio'] for result in unicode_results.values()]
+        plt.bar(unicode_results.keys(), coverage_ratios)
+        plt.title('Token Diversity Ratio')
+        plt.xlabel('Language')
+        plt.ylabel('Unique Tokens / Total Tokens')
+        plt.xticks(rotation=45)
+
+        plt.tight_layout()
+        plt.savefig('unicode_token_analysis.png')
+        plt.close()
+
+    def run_all_tests(self):
+        print("Running Tokenizer Test Suite for Hindi and English...")
+
+        print("1. Unicode Coverage Analysis...")
+        self.unicode_coverage_analysis()
+
+        print("2. Language-Specific Edge Cases...")
+        self.language_specific_edge_cases()
+
+        print("3. Script Complexity Analysis...")
+        self.script_complexity_analysis()
+
+        print("4. Generating Token Histograms...")
+        self.generate_token_histograms()
+
+        print("5. Generating Unicode Visualizations...")
+        self.generate_unicode_visualization()
+
+        print("Test Suite Complete!")
+
+        return self.results
+
+if __name__ == "__main__":
+    MODEL_NAME = "tinycompany/ShawtyIsBad-bgem3"
+
+    TEST_DATA_PATHS = [
+        './test2.txt'
+    ]
+
+    test_suite = HFTokenizerTestSuite(MODEL_NAME, TEST_DATA_PATHS)
+    results = test_suite.run_all_tests()
+
+    with open('result1.json', 'w', encoding='utf-8') as f:
+        json.dump(results, f, ensure_ascii=False, indent=4)
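
Running the script writes token_length_histograms.png, unicode_token_analysis.png, and result1.json to the working directory. Below is a minimal sketch of reading that report afterwards; it assumes the script has already been run and uses the field names defined in unicode_coverage_analysis above.

import json

# Load the report written by test_script.py (filename taken from the __main__ block above).
with open('result1.json', encoding='utf-8') as f:
    report = json.load(f)

# Summarize the unicode coverage section per language.
for lang, stats in report['unicode_coverage'].items():
    print(f"{lang}: {stats['token_count']} tokens, "
          f"{stats['unique_tokens']} unique, "
          f"coverage ratio {stats['coverage_ratio']:.2f}")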