Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import re | |
| from nltk import ngrams | |
| from nltk.corpus import wordnet | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| from nltk.stem import WordNetLemmatizer | |
| import nltk | |
# Fetch the NLTK data used below: WordNet for the lemmatizer, the English
# stop-word list, and the Punkt tokenizer models used by word_tokenize.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested to work across versions.
# Each resource is requested exactly once.
for _resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(_resource)
# Extra stop words layered on top of NLTK's English list. Some entries are
# repeated in the tuple; that is harmless because the tuple is only ever
# merged into a set.
stop_words_2 = (
    'show', 'international', 'exhibition', 'trade', 'fair', 'global',
    'conference', 'world',
    'expo', 'event', 'wellknown', 'popular', 'new', 'together',
    'latest', 'offer', 'trend', 'sector', 'exhibitor', 'th', 'one', 'like',
    'also', 'held', 'well', 'etc', 'u', 'bb',
    'provide', 'provides', 'provide', 'day', 'attendee', 'year', 'best',
    'top', 'management',
    'brings', 'bring', 'event', 'topic', 'visitor', 'buyer', 'brand', 'take',
    'u', 'national', 'great', 'come',
)

# Combined stop-word set: NLTK English stop words plus the extras above.
stop_words = set(stopwords.words('english')) | set(stop_words_2)
# Build the set of place names that bert_preprocess strips from its input.
# The names come from the listed columns of countries.csv and from
# states.csv, cities.csv and zones.csv in the working directory.
_countries = pd.read_csv('countries.csv')  # parsed once, not once per column
list_location = []
for col in ['name', 'capital', 'region', 'subregion']:  # countries
    list_location.extend(set(_countries[col]))
for _path, _column in (('states.csv', 'name'),
                       ('cities.csv', 'name'),
                       ('zones.csv', 'Zone')):
    list_location.extend(set(pd.read_csv(_path)[_column]))

# Keep only real strings: this drops NaN cells and avoids calling .lower()
# on a numeric cell, which would raise AttributeError.
locations_removal = {place.lower() for place in list_location
                     if isinstance(place, str)}
locations_removal.discard('nan')

# Stop words for bert_preprocess: the general stop words, the place names,
# and the extra stop-word tuple.
stop_words_bert = stop_words.union(locations_removal, stop_words_2)
def preprocess_text(keyword):
    """Reduce a keyword phrase to its unique, lemmatized content words.

    The text is stripped of stop words, slashes, underscores, ampersands,
    non-ASCII characters, digits and punctuation, then lowercased and
    lemmatized. Duplicate lemmas are removed, stop words are filtered again
    (lemmatization can turn a word into a stop word), and any token that
    contains the same letter eleven or more times in a row is dropped.

    Args:
        keyword: The raw text to clean.

    Returns:
        The surviving lemmas joined by single spaces, in the order they
        first appear in the input. May be an empty string.
    """
    # First pass: drop stop words before punctuation is stripped.
    keyword = ' '.join(
        w for w in word_tokenize(keyword) if w.lower() not in stop_words
    )
    keyword = keyword.replace('/', ' ')
    # Trim leading/trailing runs of non-alphanumeric characters.
    keyword = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", keyword).strip()
    keyword = keyword.replace('_', ' ')
    keyword = keyword.replace('&', ' ').strip()
    # Discard every non-ASCII character.
    keyword = keyword.encode('ascii', 'ignore').decode('ascii').strip().lower()
    # Delete (not space out) everything that is not a letter or whitespace.
    keyword = re.sub(r'[^a-zA-Z\s]', '', keyword)

    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(word)
        for word in word_tokenize(keyword)
        if word not in stop_words
    )
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic (a plain set would vary between runs).
    words = [word for word in dict.fromkeys(lemmas) if word not in stop_words]

    processed_text = ' '.join(words)
    # Remove tokens containing a letter repeated 11+ times consecutively.
    processed_text = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', processed_text)
    # Collapse the gaps left behind by removed tokens.
    return ' '.join(processed_text.split())
def bert_preprocess(keyword):
    """Clean a phrase while keeping word order and repeated words.

    Upper-case abbreviations are blanked out, the text is lowercased, tokens
    found in ``stop_words_bert`` are dropped, non-letter characters are
    replaced by spaces, the remaining tokens are lemmatized, and any token
    containing the same letter eleven or more times in a row is removed.

    Args:
        keyword: The raw text to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Blank out runs of two or more capitals/dots, then lowercase the rest.
    text = re.sub(r"\b[A-Z\.]{2,}\b", ' ', keyword).lower()

    # Keep each token unless its punctuation-free form is a stop word.
    kept = []
    for token in word_tokenize(text):
        bare = re.sub(r'[^\w\s]', '', token.lower())
        if bare in stop_words_bert:
            continue
        kept.append(token)
    text = ' '.join(kept)

    # Trim non-alphanumeric edges, then space out every remaining non-letter.
    text = re.sub(r"^[^a-zA-Z0-9]+|[^a-zA-Z0-9\)]+$", " ", text)
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)

    # Lemmatize token by token.
    lemmatize = WordNetLemmatizer().lemmatize
    lemmatized = ' '.join(map(lemmatize, word_tokenize(text)))

    # Drop tokens with a letter repeated 11+ times, then normalise spacing.
    without_repeats = re.sub(r'\b\w*([a-zA-Z])\1{10,}\w*\b', '', lemmatized)
    return ' '.join(without_repeats.split())
def lam_list(list_words):
    """Lemmatize each word both as a verb and as a noun.

    Every word is stripped of surrounding whitespace and lowercased first.

    Args:
        list_words: Iterable of strings.

    Returns:
        A pair ``(verb_lemmas, noun_lemmas)`` of lists, each aligned with
        the input order.
    """
    cleaned = [entry.strip().lower() for entry in list_words]
    wnl = WordNetLemmatizer()
    verb_lemmas = []
    noun_lemmas = []
    for entry in cleaned:
        verb_lemmas.append(wnl.lemmatize(entry, pos='v'))
        noun_lemmas.append(wnl.lemmatize(entry, pos='n'))
    return verb_lemmas, noun_lemmas