import json

import numpy as np
from transformers import AutoTokenizer, AutoModel


class HindiEnglishEncodeDecode:
    """Runs tokenizer encode/decode round-trip checks on Hindi and English text."""

    def __init__(self, model_name):
        # Load the tokenizer (and the model, though only the tokenizer is exercised here).
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def test_languages(self):
        """Tokenize, decode, and compare sample sentences in Hindi and English."""
        test_texts = {
            'Hindi': [
                'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।',
                'हिंदी भाषा बहुत सुंदर है।',
                'मुझे किताबें पढ़ना पसंद है।',
                'यह एक उदाहरण वाक्य है।'
            ],
            'English': [
                'Hello, I am from India. Delhi is a big city.',
                'The English language is widely spoken.',
                'I enjoy reading books.',
                'This is an example sentence.'
            ]
        }

        results = {}

        for language, texts in test_texts.items():
            results[language] = []
            for text in texts:
                try:
                    # Encode to token IDs (with special tokens) and to token strings.
                    token_ids = self.tokenizer.encode(text, add_special_tokens=True)
                    token_strings = self.tokenizer.tokenize(text)

                    # Decode the IDs back to text to check the round trip.
                    decoded_text = self.tokenizer.decode(token_ids, skip_special_tokens=True)

                    # float() converts the NumPy scalar from np.mean into a plain
                    # Python float so the saved JSON stays clean.
                    token_stats = {
                        'min': min(token_ids),
                        'max': max(token_ids),
                        'mean': float(np.mean(token_ids))
                    }

                    results[language].append({
                        'original_text': text,
                        'token_ids_count': len(token_ids),
                        'token_strings_count': len(token_strings),
                        'decoded_text': decoded_text,
                        'text_match': text == decoded_text,
                        'token_id_stats': token_stats
                    })

                    print(f"\n{language} Analysis:")
                    print(f"Original Text: {text}")
                    print(f"Token IDs Count: {len(token_ids)}")
                    print(f"Token Strings: {token_strings}")
                    print(f"Decoded Text: {decoded_text}")
                    print(f"Text Reconstruction: {text == decoded_text}")

                except Exception as e:
                    # Record the failure but keep processing the remaining texts.
                    results[language].append({'error': str(e)})
                    print(f"{language} Error: {e}")

        return results

    def detailed_token_analysis(self, text):
        """Return token IDs, token strings, and summary statistics for one input string."""
        token_ids = self.tokenizer.encode(text, add_special_tokens=True)
        token_strings = self.tokenizer.tokenize(text)

        analysis = {
            'original_text': text,
            'original_length': len(text),
            'tokens': {
                'ids': token_ids,
                'strings': token_strings
            },
            'token_stats': {
                'total_tokens': len(token_ids),
                'unique_tokens': len(set(token_ids)),
                # float() keeps the value a plain Python float for JSON output.
                'avg_token_length': float(np.mean([len(token) for token in token_strings]))
            }
        }

        return analysis


def main():
    MODEL_NAME = 'tinycompany/ShawtyIsBad-bgem3'

    tokenizer_model = HindiEnglishEncodeDecode(MODEL_NAME)

    # Run the Hindi/English round-trip tests.
    results = tokenizer_model.test_languages()

    # Inspect one Hindi sentence in more detail.
    sample_text = 'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।'
    detailed_result = tokenizer_model.detailed_token_analysis(sample_text)
    print("\nDetailed analysis of sample text:")
    print(json.dumps(detailed_result, ensure_ascii=False, indent=2))

    # Persist the per-language results for later inspection.
    with open('hindi_english_tokenization_results.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    main()