Spaces:
Sleeping
Sleeping
| import re | |
| # Clean and tokenize text | |
def clean_text(text):
    """Normalize raw text into lowercase, space-separated alphabetic words.

    A falsy ``text`` (``None``, empty string) is treated as the empty string.
    The input is lowercased and then passed through the substitutions below,
    in order; the order matters because later patterns assume the earlier
    ones have already run.

    Args:
        text: The raw string to clean, or a falsy value.

    Returns:
        The cleaned string containing only ``a``-``z`` and single spaces,
        with no leading or trailing whitespace.
    """
    substitutions = (
        # Runs of non-space characters starting with "http" or "www".
        (r"http\S+|www\S+|https\S+", ""),
        # "@" or "#" followed by word characters (mentions and hashtags).
        (r"[@#]\w+", ""),
        # Anything that is neither a lowercase ASCII letter nor whitespace.
        (r"[^a-z\s]", ""),
        # Collapse each whitespace run into a single space.
        (r"\s+", " "),
    )

    cleaned = (text or "").lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
| # Example: encode tokens to indices (implement your vocab) | |
def encode(text, vocab):
    """Map whitespace-separated tokens of ``text`` to their vocabulary indices.

    Args:
        text: The string to encode. A falsy value (``None``, ``""``) is
            treated as empty text, matching how ``clean_text`` handles
            missing input.
        vocab: A mapping from token to index that supports ``.get``. Tokens
            not present are mapped to ``vocab.get("<UNK>")``.

    Returns:
        A list with one entry per token. Entries for unknown tokens are
        ``None`` when ``vocab`` has no ``"<UNK>"`` key.
    """
    tokens = (text or "").split()
    if not tokens:
        # Nothing to look up; return before touching vocab at all.
        return []

    # The fallback index is the same for every token, so fetch it once.
    unk_index = vocab.get("<UNK>")
    return [vocab.get(token, unk_index) for token in tokens]
| # Pad or truncate sequences to fixed length | |
def pad_sequence(seq, max_len, pad_value=0):
    """Return ``seq`` as a new list of exactly ``max_len`` items.

    Sequences longer than ``max_len`` are truncated from the right; shorter
    ones are right-padded with ``pad_value``. The input is never modified.

    Args:
        seq: Any iterable of items (list, tuple, ...).
        max_len: Target length; must be non-negative.
        pad_value: Filler appended when ``seq`` is shorter than ``max_len``.

    Returns:
        A new list of length ``max_len``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        # A negative bound would be read as "drop the last k items" by
        # slicing and silently return a wrong-length result.
        raise ValueError(f"max_len must be non-negative, got {max_len}")

    # Copy into a list first so tuples and other iterables can be padded too.
    items = list(seq)[:max_len]
    return items + [pad_value] * (max_len - len(items))