| |
|
| |
|
| | import argparse
|
| | import sys
|
| | from common import compare_tokens
|
| |
|
| |
|
def parse_arguments(argv=None) -> argparse.Namespace:
    """Parse the command-line options of the token comparison script.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process command line. When ``None`` (the default), argparse
            reads ``sys.argv[1:]``, so existing zero-argument callers
            behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with the attributes ``original``,
        ``converted``, ``suffix``, ``data_dir`` and ``verbose``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments or ``--help``.
    """
    parser = argparse.ArgumentParser(
        description='Compare tokens between two models',
        # Raw formatter keeps the line breaks of the epilog example below.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "\n"
            "Examples:\n"
            "  %(prog)s pytorch-gemma-3-270m-it llamacpp-gemma-3-270m-it-bf16\n"
        ),
    )

    # Positional arguments: the two model names being compared.
    parser.add_argument(
        'original',
        help='Original model name',
    )
    parser.add_argument(
        'converted',
        help='Converted model name',
    )

    # Optional arguments.
    parser.add_argument(
        '-s', '--suffix',
        default='',
        help='Type suffix (e.g., "-embeddings")',
    )
    parser.add_argument(
        '-d', '--data-dir',
        default='data',
        help='Directory containing token files (default: data)',
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Print prompts from both models',
    )

    return parser.parse_args(argv)
|
| |
|
| |
|
def main():
    """Run the token comparison and exit with a status reflecting its result.

    Parses the command line, optionally prints the prompt saved for each
    model (``--verbose``), then calls ``compare_tokens``. The process exits
    with status 0 when ``compare_tokens`` returns a truthy value and 1
    otherwise.
    """
    args = parse_arguments()

    if args.verbose:
        from pathlib import Path

        data_dir = Path(args.data_dir)
        labelled_models = (
            ("Original", args.original),
            ("Converted", args.converted),
        )
        for label, model_name in labelled_models:
            prompt_file = data_dir / f"{model_name}{args.suffix}-prompt.txt"
            try:
                # Explicit encoding: the locale default can fail to decode
                # the prompt on some platforms. errors="replace" keeps this
                # purely informational output from aborting the comparison.
                # NOTE(review): assumes prompt files are written as UTF-8 -
                # confirm against the scripts that produce them.
                prompt = prompt_file.read_text(encoding="utf-8", errors="replace")
            except FileNotFoundError:
                # A missing prompt file is not an error; just skip it.
                continue
            print(f"\n{label} model prompt ({model_name}):")
            print(f" {prompt.strip()}")

        print()

    result = compare_tokens(
        args.original,
        args.converted,
        type_suffix=args.suffix,
        output_dir=args.data_dir,
    )

    # Shell-style exit status: 0 on success, 1 on failure.
    sys.exit(0 if result else 1)
|
| |
|
| |
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
| |
|