Spaces:
Runtime error
Runtime error
Commit ·
281f87d
1
Parent(s): 71b7f95
Create preprocess_fun.py
Browse files- preprocess_fun.py +80 -0
preprocess_fun.py
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
import re
from nltk import ngrams
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Fetch the NLTK resources the preprocessing functions below depend on.
# NOTE(fix): 'stopwords' was downloaded twice in the original; the duplicate
# call was removed — downloads are idempotent but the extra call was noise.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Base English stop-word set; extended with domain-specific words below.
stop_words = set(stopwords.words('english'))
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
# Domain-specific stop words for trade-show / event descriptions.
# NOTE(fix): the original tuple listed 'provide', 'event', and 'u' more than
# once; duplicates were removed. Membership is unchanged because the words
# are merged into a set immediately below.
stop_words_2 = (
    'show', 'international', 'exhibition', 'trade', 'fair', 'global',
    'conference', 'world', 'expo', 'event', 'wellknown', 'popular', 'new',
    'together', 'latest', 'offer', 'trend', 'sector', 'exhibitor', 'th',
    'one', 'like', 'also', 'held', 'well', 'etc', 'u', 'bb', 'provide',
    'provides', 'day', 'attendee', 'year', 'best', 'top', 'management',
    'brings', 'bring', 'topic', 'visitor', 'buyer', 'brand', 'take',
    'national', 'great', 'come',
)

# Merge the domain-specific words into the base stop-word set.
stop_words = stop_words.union(stop_words_2)
|
| 23 |
+
|
| 24 |
+
# Build a lowercase set of geographic names (countries, states, cities,
# zones) so location words can be stripped before BERT embedding.
list_location = []

# NOTE(fix): the original re-read countries.csv from disk once per column;
# read it a single time and iterate the columns instead.
countries = pd.read_csv('/content/countries.csv')
for col in ['name', 'capital', 'region', 'subregion']:
    list_location.extend(countries[col].unique())

list_location.extend(pd.read_csv('/content/states.csv')['name'].unique())
list_location.extend(pd.read_csv('/content/cities.csv')['name'].unique())
list_location.extend(pd.read_csv('/content/zones.csv')['Zone'].unique())

# Lowercase every name and drop missing values.
locations_removal = {x.lower() for x in list_location if not pd.isna(x)}
# Guard against the literal string 'nan' slipping in from the CSVs.
locations_removal.discard('nan')

# Extended stop-word set used by bert_preprocess(). stop_words already
# contains stop_words_2 (unioned above), so a second union is unnecessary.
stop_words_bert = stop_words.union(locations_removal)
|
| 36 |
+
|
| 37 |
+
def preprocess_text(keyword):
    """Normalize a keyword string: strip stop words, symbols and non-ASCII,
    lemmatize, and deduplicate tokens.

    Args:
        keyword: raw input string.

    Returns:
        A space-joined string of unique, lemmatized, lowercase words with
        stop words and punctuation removed.
    """
    # First pass of stop-word removal on the raw (case-preserved) text.
    keyword = ' '.join([w for w in word_tokenize(keyword) if w.lower() not in stop_words])
    keyword = keyword.replace('/', ' ')
    # Trim leading/trailing symbol runs (a trailing ')' is tolerated).
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword).strip()
    keyword = keyword.replace('_', ' ')
    keyword = keyword.replace('&', ' ').strip()
    # Drop any non-ASCII characters, then lowercase.
    keyword = keyword.encode('ascii', 'ignore').decode('utf-8').strip().lower()
    # Keep only letters and whitespace (digits/punctuation removed).
    keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)

    # Second stop-word pass on the cleaned, lowercased tokens.
    words = word_tokenize(keyword)
    words = [word for word in words if word not in stop_words]

    # Lemmatize, then deduplicate. NOTE(fix): the original used list(set(...)),
    # whose order depends on string-hash randomization and so varied between
    # runs; dict.fromkeys deduplicates while preserving first-seen order.
    lemmatizer = WordNetLemmatizer()
    words = list(dict.fromkeys(lemmatizer.lemmatize(word) for word in words))

    # Lemmatization can map a word back onto a stop word — filter again.
    words = [word for word in words if word not in stop_words]

    processed_text = ' '.join(words)
    # Remove garbage tokens containing a character repeated 11+ times.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', processed_text)
    return processed_text
|
| 53 |
+
|
| 54 |
+
def bert_preprocess(keyword):
    """Clean a keyword string for BERT embedding.

    Unlike preprocess_text(), this keeps token order, performs no
    deduplication, and additionally strips location names (via
    stop_words_bert) and all-caps abbreviations.

    Args:
        keyword: raw input string.

    Returns:
        A space-joined string of lemmatized, lowercase words.
    """
    # Remove all-caps abbreviations (e.g. "USA", "U.K.") before lowercasing,
    # since case is the only signal that identifies them.
    keyword = re.sub(r"\b[A-Z\.]{2,}\b", ' ', keyword)

    keyword = keyword.lower()

    # Drop stop words and location names. Punctuation is stripped from each
    # token before the membership test so e.g. "paris," still matches "paris".
    # NOTE(fix): built directly from the tokenizer output; the original
    # wrapped it in a redundant list-comprehension copy.
    keyword = ' '.join(
        w for w in word_tokenize(keyword)
        if re.sub(r'[^\w\s]', '', w.lower()) not in stop_words_bert
    )

    # Trim leading/trailing symbol runs, then replace any remaining
    # non-alphabetic character with a space.
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword)
    keyword = re.sub(r'[^a-zA-Z\s]', ' ', keyword)

    # Lemmatize the remaining tokens, preserving order.
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in word_tokenize(keyword)]

    # Remove garbage tokens containing a character repeated 11+ times.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', ' '.join(words))

    # Collapse runs of whitespace left behind by the substitutions.
    return ' '.join(processed_text.split())
|