opinder2906 commited on
Commit
4bfb7b6
·
verified ·
1 Parent(s): 2fcdbc3

Create data_processing.py

Browse files
Files changed (1) hide show
  1. .src/data_processing.py +23 -0
.src/data_processing.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+
3
+ # Clean and tokenize text
4
+
5
def clean_text(text):
    """Normalize raw text for tokenization.

    Lowercases the input, strips URLs, @-mentions/#-hashtags, and any
    character outside ``a-z`` and whitespace, then collapses runs of
    whitespace to single spaces. A falsy input (``None``, ``""``) yields
    ``""``.
    """
    cleaned = (text or "").lower()
    # Order matters: URLs and mentions must go before the character
    # filter, which would otherwise leave their alphabetic residue behind.
    for pattern in (r"http\S+|www\S+|https\S+", r"[@#]\w+", r"[^a-z\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return re.sub(r"\s+", " ", cleaned).strip()
13
+
14
+ # Example: encode tokens to indices (implement your vocab)
15
def encode(text, vocab, unk_token="<UNK>"):
    """Map whitespace-separated tokens of *text* to vocabulary indices.

    Args:
        text: Input string; tokens are produced by ``str.split()``.
        vocab: Mapping from token string to integer index.
        unk_token: Key looked up for out-of-vocabulary tokens
            (default ``"<UNK>"``, preserving the original behavior).

    Returns:
        List of indices, one per token. Tokens missing from *vocab* map
        to ``vocab[unk_token]``; if *unk_token* itself is absent they map
        to ``None`` — callers should ensure the vocab defines it.
    """
    # Hoist the fallback lookup out of the loop; it is loop-invariant.
    unk_id = vocab.get(unk_token)
    return [vocab.get(token, unk_id) for token in text.split()]
18
+
19
+ # Pad or truncate sequences to fixed length
20
def pad_sequence(seq, max_len, pad_value=0):
    """Return *seq* truncated or right-padded to exactly *max_len* items.

    Sequences longer than *max_len* are cut; shorter ones are extended
    with *pad_value* (default ``0``). Always returns a new list; the
    input is never mutated.
    """
    clipped = seq[:max_len]
    # If nothing was clipped off, the multiplier below is the shortfall;
    # otherwise it is zero (or negative, yielding an empty pad).
    return clipped + [pad_value] * (max_len - len(clipped))