import os
import subprocess
import time
from pathlib import Path

import ctranslate2
import transformers

# Paths shared by conversion and benchmarking.
MODEL_DIR = "./final_model"
CT2_DIR = "./ct2_model"


def convert_model():
    """Convert the Hugging Face model in MODEL_DIR to CTranslate2 int8 format.

    Writes the converted model to CT2_DIR. Raises CalledProcessError if the
    converter exits non-zero, so a failed conversion is not silently ignored.
    """
    print("Converting model to CTranslate2...")
    # subprocess.run with an argument list (shell=False) instead of os.system:
    # no shell-injection surface, and check=True surfaces conversion failures.
    subprocess.run(
        [
            "ct2-transformers-converter",
            "--model", MODEL_DIR,
            "--output_dir", CT2_DIR,
            "--quantization", "int8",
        ],
        check=True,
    )


def _dir_size(path):
    """Return the total size in bytes of the regular files directly in *path*."""
    root = Path(path)
    return sum(f.stat().st_size for f in root.iterdir() if f.is_file())


def run_benchmark():
    """Benchmark the converted CTranslate2 model on CPU and report size savings.

    Prints the average per-input translation latency over several runs and the
    on-disk compression ratio of the converted model versus the original.
    """
    device = "cpu"
    translator = ctranslate2.Translator(CT2_DIR, device=device)
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_DIR)

    test_inputs = [" shanti", " namaskar", " vanakkam"]
    tokens = [tokenizer.convert_ids_to_tokens(tokenizer.encode(t)) for t in test_inputs]

    # Benchmark CTranslate2. perf_counter is monotonic and high-resolution,
    # unlike time.time(), so it is the right clock for latency measurement.
    n_runs = 10
    start = time.perf_counter()
    for _ in range(n_runs):
        translator.translate_batch(tokens)
    elapsed = time.perf_counter() - start
    # Average over every individual translation, derived from the actual
    # counts rather than a hard-coded constant (was "/ 30").
    ct2_time = elapsed / (n_runs * len(test_inputs))
    print(f"CTranslate2 Latency: {ct2_time:.4f}s per word")

    # On-disk size comparison (top-level files only, matching the original).
    orig_size = _dir_size(MODEL_DIR)
    opt_size = _dir_size(CT2_DIR)
    if opt_size > 0:
        print(f"Compression Ratio: {orig_size / opt_size:.2f}x")
    else:
        # Guard against ZeroDivisionError if conversion produced no files.
        print("Compression Ratio: n/a (converted model directory is empty)")


if __name__ == "__main__":
    convert_model()
    run_benchmark()