File size: 2,861 Bytes
ff1b54b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import re
import sys
import argparse

def minify_jinja(content):
    """
    Minify a Jinja2 template by removing comments and collapsing whitespace.

    Steps, applied in order:
    1. Every Jinja2 comment ('{# ... #}', possibly spanning lines) is removed.
    2. Every run of spaces, tabs, carriage returns, and newlines is replaced
       by a single space, so adjacent words never merge and the result is a
       single line regardless of the input's line-ending convention.
    3. The single space left between two adjacent tags ('%}' or '}}'
       followed by '{%' or '{{') is removed.
    4. Leading and trailing whitespace is stripped.

    Note: step 3 removes a literal space from the template text, so a
    template that relies on that space in its rendered output (for example
    '{{ a }} {{ b }}') will render differently after minification.

    Args:
        content (str): The raw Jinja2 template content.

    Returns:
        str: The minified, single-line template.
    """
    # Remove Jinja2 comments. DOTALL lets '.' cross line breaks so that
    # multi-line comments are matched; the lazy quantifier stops at the
    # first closing '#}'.
    content = re.sub(r'\{#.*?#\}', '', content, flags=re.DOTALL)

    # Collapse whitespace runs into one space. '\r' is included so that
    # CRLF (or bare CR) line endings do not leave stray carriage returns
    # that would break the single-line guarantee and block tag joining.
    content = re.sub(r'[ \t\r\n]+', ' ', content)

    # Drop the space between adjacent tags. Order matches the sequence
    # '%} {%', '%} {{', '}} {%', '}} {{'.
    for closer in ('%}', '}}'):
        for opener in ('{%', '{{'):
            content = content.replace(f'{closer} {opener}', f'{closer}{opener}')

    return content.strip()

def main():
    """
    Command-line entry point: minify a Jinja2 template file.

    Parses two positional arguments (input path, output path), reads the
    input as UTF-8, minifies it with ``minify_jinja``, writes the result to
    the output path as UTF-8, and prints a short size report to stdout.

    On failure an error message is printed to stderr and the process exits
    with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Minify a Jinja2 chat template for use in tokenizer_config.json"
    )
    parser.add_argument("input", help="Path to the source .jinja file")
    parser.add_argument("output", help="Path to save the minified output")

    args = parser.parse_args()

    try:
        # Read the input template
        with open(args.input, 'r', encoding='utf-8') as f:
            content = f.read()

        # Perform minification
        minified = minify_jinja(content)

        # Write the result to the output file
        with open(args.output, 'w', encoding='utf-8') as f:
            f.write(minified)
    except FileNotFoundError as e:
        # The missing path may be the input file or the output's directory;
        # report the path the OS actually complained about.
        missing = e.filename if e.filename is not None else args.input
        print(f"Error: The file '{missing}' was not found.", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"An error occurred: {e}", file=sys.stderr)
        sys.exit(1)

    # Report sizes as UTF-8 encoded lengths so the "bytes" label is accurate
    # for templates containing non-ASCII characters.
    original_size = len(content.encode('utf-8'))
    minified_size = len(minified.encode('utf-8'))

    # An empty input has nothing to reduce; avoid dividing by zero.
    if original_size:
        reduction = 100 - (minified_size / original_size * 100)
    else:
        reduction = 0.0

    print(f"Minification complete: '{args.input}' -> '{args.output}'")
    print(f"Original size: {original_size} bytes")
    print(f"Minified size: {minified_size} bytes")
    print(f"Reduction: {reduction:.1f}%")

# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()