from transformers import AutoTokenizer, AutoModel import numpy as np class HindiEnglishEncodeDecode: def __init__(self, model_name): self.tokenizer = AutoTokenizer.from_pretrained(model_name) self.model = AutoModel.from_pretrained(model_name) def test_languages(self): test_texts = { 'Hindi': [ 'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।', 'हिंदी भाषा बहुत सुंदर है।', 'मुझे किताबें पढ़ना पसंद है।', 'यह एक उदाहरण वाक्य है।' ], 'English': [ 'Hello, I am from India. Delhi is a big city.', 'The English language is widely spoken.', 'I enjoy reading books.', 'This is an example sentence.' ] } results = {} for language, texts in test_texts.items(): results[language] = [] for text in texts: try: token_ids = self.tokenizer.encode(text, add_special_tokens=True) token_strings = self.tokenizer.tokenize(text) decoded_text = self.tokenizer.decode(token_ids, skip_special_tokens=True) token_stats = { 'min': min(token_ids), 'max': max(token_ids), 'mean': np.mean(token_ids) } # Append results for this text results[language].append({ 'original_text': text, 'token_ids_count': len(token_ids), 'token_strings_count': len(token_strings), 'decoded_text': decoded_text, 'text_match': text == decoded_text, 'token_id_stats': token_stats }) print(f"\n{language} Analysis:") print(f"Original Text: {text}") print(f"Token IDs Count: {len(token_ids)}") print(f"Token Strings: {token_strings}") print(f"Decoded Text: {decoded_text}") print(f"Text Reconstruction: {text == decoded_text}") except Exception as e: results[language].append({'error': str(e)}) print(f"{language} Error: {e}") return results def detailed_token_analysis(self, text): token_ids = self.tokenizer.encode(text, add_special_tokens=True) token_strings = self.tokenizer.tokenize(text) analysis = { 'original_text': text, 'original_length': len(text), 'tokens': { 'ids': token_ids, 'strings': token_strings }, 'token_stats': { 'total_tokens': len(token_ids), 'unique_tokens': len(set(token_ids)), 'avg_token_length': np.mean([len(token) for token in token_strings]) } } return analysis def main(): MODEL_NAME = 'tinycompany/ShawtyIsBad-bgem3' tokenizer_model = HindiEnglishEncodeDecode(MODEL_NAME) results = tokenizer_model.test_languages() sample_text = 'नमस्ते, मैं भारत से हूँ। दिल्ली बहुत बड़ा शहर है।' detailed_result = tokenizer_model.detailed_token_analysis(sample_text) import json with open('hindi_english_tokenization_results.json', 'w', encoding='utf-8') as f: json.dump(results, f, ensure_ascii=False, indent=4) if __name__ == "__main__": main()