opinder2906 committed on
Commit
b9797cb
·
verified ·
1 Parent(s): db008b2

Update src/data_processing.py

Browse files
Files changed (1) hide show
  1. src/data_processing.py +9 -19
src/data_processing.py CHANGED
@@ -1,23 +1,13 @@
1
  import re
 
2
 
3
- # Clean and tokenize text
4
-
5
  def clean_text(text):
6
- text = text or ""
 
7
  text = text.lower()
8
- text = re.sub(r"http\S+|www\S+|https\S+", "", text)
9
- text = re.sub(r"[@#]\w+", "", text)
10
- text = re.sub(r"[^a-z\s]", "", text)
11
- text = re.sub(r"\s+", " ", text).strip()
12
- return text
13
-
14
- # Example: encode tokens to indices (implement your vocab)
15
- def encode(text, vocab):
16
- tokens = text.split()
17
- return [vocab.get(t, vocab.get("<UNK>")) for t in tokens]
18
-
19
- # Pad or truncate sequences to fixed length
20
- def pad_sequence(seq, max_len, pad_value=0):
21
- if len(seq) >= max_len:
22
- return seq[:max_len]
23
- return seq + [pad_value] * (max_len - len(seq))
 
1
  import re
2
+ import pandas as pd
3
 
4
+ # --- Cleaning & basic preprocessing ---
 
5
  def clean_text(text):
6
+ if pd.isnull(text):
7
+ return ""
8
  text = text.lower()
9
+ text = re.sub(r"http\S+|www\S+|https\S+", '', text)
10
+ text = re.sub(r"\@\w+|\#", '', text)
11
+ text = re.sub(r"[^a-z\s]", '', text)
12
+ text = re.sub(r"\s+", ' ', text).strip()
13
+ return text