# NOTE: This file was exported from a Hugging Face Space (commit e1d9ec2).
# The viewer chrome that leaked into the export ("Spaces:", "Runtime error",
# file size, and the line-number gutter) has been removed.
import os
import time
import ctranslate2
import transformers
from datasets import load_dataset
import pandas as pd
# Directory holding the source Hugging Face checkpoint (weights + tokenizer).
MODEL_DIR = "models"
# Conversion output directory. Set to models for HF Spaces compatibility (outputs model.bin here).
CT2_MODEL_DIR = "models" # Set to models for HF Spaces compatibility (outputs model.bin here)
def optimize_model():
    """Convert the Hugging Face model in MODEL_DIR to CTranslate2 format.

    Writes an int8-quantized model into CT2_MODEL_DIR. Prints an error and
    returns without converting if no source weights are found.
    """
    print("Converting model to CTranslate2 format...")
    # Ensure source weight files exist before attempting conversion.
    weight_files = [
        f for f in os.listdir(MODEL_DIR)
        if f.startswith("pytorch_model") or f.endswith(".safetensors")
    ]
    if not weight_files:
        print(f"Error: No source weights found in {MODEL_DIR}. Cannot convert.")
        return
    # Only copy tokenizer files that actually exist: the converter raises if
    # asked to copy a missing file, and checkpoints do not always ship both
    # tokenizer.json and sentencepiece.bpe.model.
    tokenizer_files = [
        f for f in ("tokenizer.json", "sentencepiece.bpe.model")
        if os.path.exists(os.path.join(MODEL_DIR, f))
    ]
    converter = ctranslate2.converters.TransformersConverter(
        MODEL_DIR,
        copy_files=tokenizer_files,  # ensure tokenizer files travel with the model
    )
    # int8 quantization shrinks the model and usually speeds up CPU inference.
    converter.convert(
        CT2_MODEL_DIR,
        quantization="int8",
        force=True,  # overwrite any previous conversion output
    )
    print(f"Model converted and saved to {CT2_MODEL_DIR}")
def benchmark():
    """Benchmark the converted CTranslate2 model on a small English batch.

    Loads the CT2 translator and the mBART tokenizer, translates a few short
    sentences into Hindi, and prints the elapsed time, throughput, and the
    resulting translations.
    """
    print("\nStarting Benchmarking...")
    translator = ctranslate2.Translator(CT2_MODEL_DIR)
    tokenizer = transformers.MBart50TokenizerFast.from_pretrained(MODEL_DIR)

    # Test data: English -> Hindi.
    texts = ["Namaste", "Hello", "How are you", "Good morning", "India"]
    target_lang = "hi_IN"
    tokenizer.src_lang = "en_XX"

    start_time = time.time()
    # CTranslate2 expects a list of token strings per sentence. Going through
    # encode() + convert_ids_to_tokens keeps the special tokens mBART needs
    # (the source-language token and </s>), which plain tokenize() would drop.
    input_tokens_batch = [
        tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
        for text in texts
    ]
    results = translator.translate_batch(
        input_tokens_batch,
        target_prefix=[[target_lang]] * len(texts)  # Force target lang
    )
    end_time = time.time()

    decoded = [
        tokenizer.decode(
            tokenizer.convert_tokens_to_ids(result.hypotheses[0]),
            skip_special_tokens=True,  # drop the target-language marker / eos
        )
        for result in results
    ]

    duration = end_time - start_time
    print(f"Inference Time for {len(texts)} sentences: {duration:.4f}s")
    print(f"Speed: {len(texts)/duration:.2f} sentences/s")
    for src, tgt in zip(texts, decoded):
        print(f"{src} -> {tgt}")
def get_dir_size(path):
    """Return the total size in bytes of all regular files under *path*,
    descending into subdirectories recursively."""
    with os.scandir(path) as entries:
        return sum(
            entry.stat().st_size if entry.is_file() else get_dir_size(entry.path)
            for entry in entries
            if entry.is_file() or entry.is_dir()
        )
if __name__ == "__main__":
    # Run conversion then benchmarking, but only when a trained model exists.
    if os.path.exists(MODEL_DIR):
        optimize_model()
        benchmark()
    else:
        print(f"Model directory {MODEL_DIR} not found. Please train first.")