| """Converting the URL string to fixed-length integer sequence""" | |
| from typing import List | |
| # Maximum length of a URL sequence (longer ones will be cut, | |
| # shorter ones will be padded with zeros) | |
| MAX_LEN = 75 # each url represeented as exactly 75 chars | |
| # I create a simple vocabulary for all printable ASCII characters | |
| # ASCII codes 32–126 will cover characters like letters, digits, symbols, etc. | |
| # Example: 'A' -> 34, 'a' -> 66, '/' -> 17, etc. | |
| # Subtracting 31 shifts them to start from 1 instead of 32 | |
| VOCAB = {chr(i): i-31 for i in range(32, 127)} | |
| # Special token IDs: | |
| PAD = 0 # padding token (for short URLs) | |
| UNK = len(VOCAB) + 1 # unknown character token (for chars not in VOCAB) | |
| def url_to_ids(url : str , max_len : int = MAX_LEN) -> List[int]: | |
| """ | |
| Convert a URL string into a fixed-length list of integers. | |
| Steps: | |
| 1. Map each character to an integer ID using VOCAB. | |
| - If a character isn't in VOCAB, use UNK (unknown token). | |
| 2. Truncate to `max_len` if the URL is longer. | |
| 3. Pad with PAD (0) on the right if it's shorter. | |
| Returns: | |
| List[int]: List of integer IDs, length = max_len | |
| """ | |
| # Convert each character to its numeric ID (or UNK if missing) | |
| ids = [VOCAB.get(c, UNK) for c in url[:max_len]] | |
| # Add padding tokens to reach max_len | |
| ids += [PAD] * (max_len - len(ids)) # pad right | |
| # Ensure the final list has exactly max_len elements | |
| return ids[:max_len] | |
| # sanity check | |
| if __name__ == "__main__": | |
| # Test conversation | |
| print(url_to_ids("https://google.com")) |