# phish-transformer/src/data/tokenizer.py
# RamadhanZome's picture
# Upload folder using huggingface_hub
# 7500cab verified
"""Convert a URL string into a fixed-length integer sequence."""
from typing import List
# Fixed sequence length: every URL is encoded as exactly 75 IDs
# (longer URLs are truncated, shorter ones are padded with zeros).
MAX_LEN = 75

# Character vocabulary over the printable ASCII range (code points 32-126:
# space, letters, digits and symbols).
# Each ID is the code point minus 31, so IDs run from 1 to 95 and 0 stays
# free for padding.
# Example: 'A' -> 34, 'a' -> 66, '/' -> 16.
VOCAB = {chr(32 + offset): offset + 1 for offset in range(95)}

# Special token IDs:
PAD = 0  # padding token, fills the tail of short URLs
UNK = 1 + len(VOCAB)  # unknown-character token (96) for chars not in VOCAB
def url_to_ids(url: str, max_len: int = MAX_LEN) -> List[int]:
    """
    Convert a URL string into a fixed-length list of integer IDs.

    Each of the first `max_len` characters is looked up in VOCAB;
    characters that are not in VOCAB map to UNK. URLs shorter than
    `max_len` are padded on the right with PAD.

    Args:
        url: The URL to encode.
        max_len: Length of the returned list; must not be negative.

    Returns:
        List[int]: Exactly `max_len` integer IDs.

    Raises:
        ValueError: If `max_len` is negative.
    """
    # A negative length would make the slice below count from the end of
    # the string and yield a list whose length is not `max_len`.
    if max_len < 0:
        raise ValueError("max_len must be non-negative, got %r" % (max_len,))

    # Truncate first so that at most `max_len` characters are looked up.
    ids = [VOCAB.get(char, UNK) for char in url[:max_len]]
    # Pad on the right; the multiplier is zero when the URL fills max_len.
    ids.extend([PAD] * (max_len - len(ids)))
    return ids
# Sanity check: running this module directly prints the IDs of a sample URL.
if __name__ == "__main__":
    # Test conversion
    sample_ids = url_to_ids("https://google.com")
    print(sample_ids)