Spaces:
Sleeping
Sleeping
Update src/data_processing.py
Browse files- src/data_processing.py +9 -19
src/data_processing.py
CHANGED
|
@@ -1,23 +1,13 @@
|
|
| 1 |
import re
|
|
|
|
| 2 |
|
| 3 |
-
#
|
| 4 |
-
|
| 5 |
def clean_text(text):
|
| 6 |
-
|
|
|
|
| 7 |
text = text.lower()
|
| 8 |
-
text = re.sub(r"http\S+|www\S+|https\S+",
|
| 9 |
-
text = re.sub(r"
|
| 10 |
-
text = re.sub(r"[^a-z\s]",
|
| 11 |
-
text = re.sub(r"\s+",
|
| 12 |
-
return text
|
| 13 |
-
|
| 14 |
-
# Example: encode tokens to indices (implement your vocab)
def encode(text, vocab):
    """Map each whitespace-separated token of *text* to its index in *vocab*.

    Unknown tokens fall back to vocab["<UNK>"]; if the vocab has no "<UNK>"
    entry either, the fallback is None (dict.get default).
    """
    unk = vocab.get("<UNK>")
    return [vocab.get(token, unk) for token in text.split()]
|
| 18 |
-
|
| 19 |
-
# Pad or truncate sequences to fixed length
def pad_sequence(seq, max_len, pad_value=0):
    """Return *seq* cut down or right-padded with *pad_value* to exactly *max_len*."""
    shortfall = max_len - len(seq)
    if shortfall <= 0:
        # Already long enough: keep only the first max_len items.
        return seq[:max_len]
    return seq + [pad_value] * shortfall
|
|
|
|
| 1 |
import re
|
| 2 |
+
import pandas as pd
|
| 3 |
|
| 4 |
+
# --- Cleaning & basic preprocessing ---
def clean_text(text):
    """Lowercase *text* and strip URLs, @mentions, '#' marks, and non-letters.

    Missing values (None/NaN per pandas.isnull) come back as "". Only a-z and
    spaces survive; whitespace runs collapse to a single space and the result
    is trimmed at both ends.
    """
    if pd.isnull(text):
        return ""
    cleaned = text.lower()
    # Order matters: URLs and handles must go before the catch-all
    # [^a-z\s] filter, which would otherwise leave their letters behind.
    for pattern, replacement in (
        (r"http\S+|www\S+|https\S+", ''),
        (r"\@\w+|\#", ''),
        (r"[^a-z\s]", ''),
        (r"\s+", ' '),
    ):
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|