File size: 2,630 Bytes
8e63cf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import re
import json
import emoji
import pyarabic.araby as araby
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

def load_tokenizer(path='tokenizer.json'):
    """Load a Keras tokenizer from a JSON file.

    ``tokenizer_from_json`` expects the JSON *string* produced by
    ``Tokenizer.to_json()``.  Two on-disk layouts are accepted:

    * the string JSON-encoded a second time
      (``json.dump(tokenizer.to_json(), f)``), which decodes to ``str``;
    * the raw ``to_json()`` output written as-is, which decodes to a
      ``dict`` and is serialised back to a string before being handed over.

    Args:
        path: Location of the tokenizer JSON file (UTF-8 encoded).

    Returns:
        The tokenizer object rebuilt by ``tokenizer_from_json``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # A file written straight from ``to_json()`` parses to a dict; the Keras
    # loader wants the string form, so re-serialise it.
    if not isinstance(data, str):
        data = json.dumps(data, ensure_ascii=False)

    return tokenizer_from_json(data)

# Module-level tokenizer, loaded once when this module is imported.
# NOTE: this reads 'tokenizer.json' (the default, relative path) at import
# time, so importing the module fails if that file is not present.
TOKENIZER = load_tokenizer()

# "@name" style mentions.
_MENTION_RE = re.compile(r"@\w+")

# A link is the scheme plus everything up to the next whitespace; text that
# follows the link on the same line is kept.
_URL_RE = re.compile(r"https?://\S+")

# Retweet marker. Matched in lower case and as a whole word because the text
# is lower-cased first and "rt" also occurs inside ordinary words.
_RETWEET_RE = re.compile(r"\brt\b")

# Leftover fragments that are deleted outright (no space inserted).
_LEFTOVER_LITERALS = ("مستخدم@", "#", "…", "\ufffd")

# Punctuation and symbols; each occurrence is replaced by one space.
_PUNCTUATION_RE = re.compile(
    r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~،؛؟ـ٪٫٬«»“”•·…﴾﴿〈〉°±÷ש®™€£¥¢]'
)

# (pattern, replacement) pairs folding letter variants onto a single form so
# that spelling variants of a word map to the same token.
_LETTER_NORMALIZATION = (
    ("[إأآا]", "ا"),
    ("ى", "ي"),
    ("ؤ", "ء"),
    ("ئ", "ء"),
    ("ة", "ه"),
    ("گ", "ك"),
)

_WHITESPACE_RE = re.compile(r"\s+")


def data_cleaning(text):
    """Clean one raw social-media style string before tokenization.

    Steps, in order:

    1. lower-case (affects Latin text and links);
    2. strip tashkeel (diacritics) and tatweel (elongation);
    3. drop ``@mentions``, links, the ``rt`` marker and a few leftover
       fragments;
    4. turn runs of ``!`` / ``?`` into the words ``EXCLAMATION`` /
       ``QUESTION``;
    5. replace remaining punctuation and symbols with spaces;
    6. normalise letter variants;
    7. replace emojis with their textual names;
    8. collapse all whitespace (including line breaks) to single spaces and
       trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string, on one line, single-spaced, without leading or
        trailing whitespace.
    """
    # For any foreign language or for links
    text = text.lower()

    # Tatweel must go before the punctuation pass: that pass also lists the
    # tatweel character and would turn it into a space, splitting an
    # elongated word in two instead of just shortening it.
    text = araby.strip_tashkeel(text)
    text = araby.strip_tatweel(text)

    text = _MENTION_RE.sub('', text)
    text = _URL_RE.sub('', text)
    text = _RETWEET_RE.sub('', text)
    for fragment in _LEFTOVER_LITERALS:
        text = text.replace(fragment, "")

    # Convert exclamation/question marks into words to give them more weight
    # in the context. The square brackets are removed by the punctuation pass
    # right below, leaving the bare word surrounded by spaces.
    text = re.sub(r'!+', ' [EXCLAMATION] ', text)
    text = re.sub(r'\?+', ' [QUESTION] ', text)

    # Underscores are part of this class, so hashtags such as "#some_tag"
    # end up as separate words.
    text = _PUNCTUATION_RE.sub(' ', text)

    for pattern, replacement in _LETTER_NORMALIZATION:
        text = re.sub(pattern, replacement, text)

    # Emojis become text to add context to short sentences. This runs after
    # the punctuation pass, so the emoji names are inserted unmodified.
    text = emoji.demojize(text, delimiters=(" ", " "))

    # Done last so the spaces introduced by every step above (including the
    # emoji delimiters) and any line breaks are collapsed in one go.
    return _WHITESPACE_RE.sub(' ', text).strip()

def prepare_input(text, max_len=48):
    """Convert one raw string into a padded integer sequence for the model.

    The text is cleaned with ``data_cleaning``, mapped to token ids by the
    module-level ``TOKENIZER`` and padded with ``pad_sequences``.

    Args:
        text: The raw input string.
        max_len: Sequence length passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for a batch containing this single
        text.
    """
    # texts_to_sequences works on a list of strings, hence the one-item list.
    token_ids = TOKENIZER.texts_to_sequences([data_cleaning(text)])

    # Fixed length is required by the CNN-BiGRU architecture.
    return pad_sequences(token_ids, maxlen=max_len)