# Arabic tweet preprocessing utilities: cleaning, tokenizing, and padding
# input text for a CNN-BiGRU classifier.
import re
import json

import emoji
import pyarabic.araby as araby
import tensorflow as tf
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences
def load_tokenizer(path='tokenizer.json'):
    """Rebuild a Keras tokenizer from its serialized JSON config at *path*."""
    with open(path, 'r', encoding='utf-8') as config_file:
        tokenizer_config = json.load(config_file)
    return tokenizer_from_json(tokenizer_config)


# Rebuilt once at import time and shared by prepare_input().
TOKENIZER = load_tokenizer()
# Patterns compiled once at import time so data_cleaning() does not pay the
# pattern-cache lookup on every call.
_MENTION_RE = re.compile(r"@\w+")
_URL_RE = re.compile(r'https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
# Retweet marker as a standalone word; matched after lower-casing.
_RT_RE = re.compile(r'\brt\b')
_EXCLAMATION_RE = re.compile(r'!+')
_QUESTION_RE = re.compile(r'\?+')
# Latin and Arabic punctuation/symbols to strip.
_PUNCT_RE = re.compile(r'[!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~،؛؟ـ٪٫٬«»“”•·…﴾﴿〈〉°±÷ש®™€£¥¢]')
_NEWLINE_RE = re.compile(r'[\r\n]+')
_SPACE_RE = re.compile(r'\s+')
# Single-character Arabic letter normalization (alef variants, alef maqsura,
# hamza carriers, ta marbuta, Persian kaf) applied in one C-level pass.
_NORMALIZE_TABLE = str.maketrans({
    'إ': 'ا', 'أ': 'ا', 'آ': 'ا',
    'ى': 'ي',
    'ؤ': 'ء', 'ئ': 'ء',
    'ة': 'ه',
    'گ': 'ك',
})


def data_cleaning(text):
    """Normalize a raw (Arabic/mixed) tweet into cleaned tokenizer input.

    Lower-cases, strips mentions/URLs/hashmarks, turns ``!``/``?`` runs into
    pseudo-word tokens, removes punctuation, unifies Arabic letter variants,
    converts emojis to their textual names, and strips tashkeel/tatweel.

    Args:
        text: Raw tweet text (may mix Arabic, Latin, emojis, URLs).

    Returns:
        The cleaned, whitespace-collapsed string.
    """
    # Lower-case first so foreign-language text and URL matching are uniform.
    text = text.lower()
    # Remove @mentions, then full URLs.
    text = _MENTION_RE.sub('', text)
    text = _URL_RE.sub('', text)
    # Leftover fragments the patterns above miss; force-removed literally.
    for fragment in ("مستخدم@", "#", "…", "\ufffd"):
        text = text.replace(fragment, "")
    # BUG FIX: the original removed the literal "RT" *after* lower-casing, so
    # it never matched. Remove the retweet marker as a standalone word instead.
    # NOTE(review): if the tokenizer was fitted on text cleaned with the old
    # (no-op) behavior, confirm this does not shift the vocabulary.
    text = _RT_RE.sub('', text)
    # Convert ! and ? runs into pseudo-words to give them weight as features.
    text = _EXCLAMATION_RE.sub(' [EXCLAMATION] ', text)
    text = _QUESTION_RE.sub(' [QUESTION] ', text)
    # Strip punctuation; this also strips the square brackets inserted above,
    # leaving bare EXCLAMATION / QUESTION tokens (matches original behavior).
    text = _PUNCT_RE.sub(' ', text)
    # Keep everything on one line.
    text = _NEWLINE_RE.sub(' ', text)
    # Normalize letter variants to unify word forms.
    text = text.translate(_NORMALIZE_TABLE)
    # Collapse runs of whitespace.
    text = _SPACE_RE.sub(' ', text)
    # Hashtag underscores become spaces so multi-word hashtags split apart.
    text = text.replace("_", " ")
    # Emojis become textual names to add context for a small dataset.
    # NOTE(review): demojize runs after the underscore replacement, so emoji
    # names keep internal underscores (e.g. "red_heart") — presumably
    # intentional so each emoji stays a single token; confirm.
    text = emoji.demojize(text, delimiters=(" ", " "))
    # Drop diacritics (tashkeel) and the tatweel elongation character.
    text = araby.strip_tashkeel(text)
    text = araby.strip_tatweel(text)
    return text.strip()
def prepare_input(text, max_len=48):
    """Turn raw text into a fixed-length integer sequence for the model.

    Cleans the text, maps it to token ids with the module-level TOKENIZER,
    and pads the result to *max_len* for the CNN-BiGRU architecture.

    Args:
        text: Raw input string.
        max_len: Fixed sequence length expected by the network.

    Returns:
        A padded integer sequence array of shape (1, max_len).
    """
    # texts_to_sequences expects a batch (list of strings), hence the wrap.
    sequences = TOKENIZER.texts_to_sequences([data_cleaning(text)])
    # Pad/truncate so every input has the fixed width the model was built for.
    return pad_sequences(sequences, maxlen=max_len)