File size: 1,566 Bytes
7500cab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47

"""Converting the URL string to fixed-length integer sequence"""

from typing import List

# Every URL is encoded as exactly MAX_LEN IDs: longer URLs are truncated,
# shorter ones are right-padded with PAD.
MAX_LEN = 75

# Vocabulary over the printable ASCII range (code points 32-126).
# Each character maps to its code point minus 31, so IDs run from 1 (space)
# to 95 ('~'), which keeps 0 free for the padding token.
# Examples: 'A' -> 34, 'a' -> 66, '/' -> 16.
VOCAB = {chr(code): idx for idx, code in enumerate(range(32, 127), start=1)}

# Special token IDs
PAD = 0  # filler appended to URLs shorter than the target length
UNK = len(VOCAB) + 1  # 96: stands in for any character missing from VOCAB

def url_to_ids(url: str, max_len: int = MAX_LEN) -> List[int]:
    """
    Convert a URL string into a fixed-length list of integer IDs.

    Each of the first `max_len` characters is looked up in VOCAB; characters
    that are not in VOCAB map to UNK. The result is then padded on the right
    with PAD until it holds exactly `max_len` entries.

    Args:
        url: URL text to encode.
        max_len: Length of the returned list; must be non-negative.

    Returns:
        List[int]: List of integer IDs with exactly `max_len` elements.

    Raises:
        ValueError: If `max_len` is negative.
    """
    # A negative length would make the slice below count from the END of the
    # string and silently produce a list of the wrong size, so reject it.
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len!r}")

    # Truncate first, then map each character to its ID (UNK if missing)
    ids = [VOCAB.get(ch, UNK) for ch in url[:max_len]]

    # Right-pad to the target length; this is a no-op when the URL already
    # fills all max_len slots, so no further trimming is needed afterwards.
    ids.extend([PAD] * (max_len - len(ids)))
    return ids

# Quick manual sanity check when the module is run as a script

if __name__ == "__main__":
    # Test conversion: encode a sample URL and show the resulting ID sequence
    sample_ids = url_to_ids("https://google.com")
    print(sample_ids)