import re

from constants.constants import *
from preprocess.utils.common.utils import (
    find_full_word,
    remove_full_words,
    detect_language_simple,
)


def remove_brand_from_name(name, brand):
    """Return *name* with whole-word occurrences of *brand* removed.

    The brand is matched literally between regex word boundaries; any regex
    metacharacters in it (``+``, ``(``, ``.`` ...) are escaped so they cannot
    break or alter the pattern.

    If fewer than 3 characters would remain after removal and stripping,
    the original *name* is returned unchanged.
    """
    pattern = r'\b' + re.escape(brand) + r'\b'
    name_wo_brand = re.sub(pattern, '', name).strip()
    if len(name_wo_brand) < 3:
        return name
    return name_wo_brand


def insert_brand_in_name(name, brand):
    """Return *name* prefixed with *brand* unless it already contains it.

    Presence is tested as a literal, whole-word match (the brand is
    regex-escaped). When the brand is missing, the result is
    ``brand + " " + name``; otherwise *name* is returned as is.
    """
    pattern = r'\b' + re.escape(brand) + r'\b'
    if re.search(pattern, name):
        return name
    return brand + " " + name


def extract_spark(text, remove=False):
    """Detect the 'шампанское' drink type in *text*.

    Returns a ``(drink_type, text)`` tuple. ``drink_type`` is 'шампанское'
    when ``find_full_word`` finds one of the variations listed under
    ``TYPES_VARIATIONS_DICT['шампанское']``, otherwise ``None``.

    When *remove* is true and a variation was found, every occurrence of the
    matched string is replaced by a single space in the returned text
    (plain ``str.replace``, i.e. substring based, not whole-word based).
    """
    # Handle specific cases when 'champagne' or 'шампань' is not drink type, but region
    if 'коньяк' in text:
        return None, text

    type_str = find_full_word(text, TYPES_VARIATIONS_DICT['шампанское'])
    if not type_str:
        return None, text

    if remove:
        text = text.replace(type_str, ' ')
    return 'шампанское', text


def extract_color(text):
    """Return ``(color, matched_str)`` for the first color found in *text*.

    Colors are tried in the iteration order of ``COLORS_VARIATIONS_DICT``;
    ``matched_str`` is whatever ``find_full_word`` returned for that color's
    variations. Returns ``(None, None)`` when nothing matches.
    """
    for color, variations in COLORS_VARIATIONS_DICT.items():
        found_color_str = find_full_word(text, variations)
        if found_color_str:
            return color, found_color_str
    return None, None


def extract_sour(text):
    """Return ``(sour, matched_str)`` for the first entry found in *text*.

    Entries are tried in the iteration order of ``SOURS_VARIATIONS_DICT``;
    ``matched_str`` is whatever ``find_full_word`` returned for that entry's
    variations. Returns ``(None, None)`` when nothing matches.
    """
    for sour, variations in SOURS_VARIATIONS_DICT.items():
        found_sour_str = find_full_word(text, variations)
        if found_sour_str:
            return sour, found_sour_str
    return None, None


def extract_stars(text):
    """Find a star rating such as '5 звезд' or 'три звезды' in *text*.

    The search is case-insensitive and accepts a Russian numeral word or a
    1-2 digit number, optional whitespace, then a "stars" word or its
    abbreviation. Returns the matched substring twice, ``(match, match)``,
    mirroring the ``(value, matched_str)`` shape of ``extract_color``;
    returns ``(None, None)`` when there is no match.
    """
    # 'восемь' is the correct spelling; the former misspelling 'восьмь' is
    # kept in the alternation so previously matching inputs still match.
    match = re.search(
        r'((одна|две|три|четыре|пять|шесть|семь|восемь|восьмь|девять|\d{1,2})\s*(звезды|звезд|звёзд|зв))',
        text,
        re.IGNORECASE,
    )
    if match:
        stars = match.group(0)
        return stars, stars
    return None, None


def extract_color_and_sour(text, remove=False):
    """Extract the color and sour attributes from *text*.

    Returns ``(found_color, found_sour, text)``. When no color is found, a
    star rating from ``extract_stars`` is stored in the color slot instead.

    When *remove* is true:
      * the matched color string is removed (whole words) only if
        ``detect_language_simple`` reports it as 'ru' and it is not 'розе';
      * the matched sour string, if any, is removed (whole words).
    """
    # NOTE: an earlier variant masked SPECIFIC_NAMES matches with a
    # '###SPECIAL_NAME###' placeholder before extraction and restored them
    # afterwards; that logic is currently disabled.
    found_color, found_color_str = extract_color(text)
    found_sour, found_sour_str = extract_sour(text)

    # Additional cases for non-wine products (these attributes are put into
    # color and sour for simplicity).
    if not found_color:
        found_color, found_color_str = extract_stars(text)

    if remove:
        if (found_color_str
                and detect_language_simple(found_color_str) == 'ru'
                and found_color_str != 'розе'):
            text = remove_full_words(text, [found_color_str])
        if found_sour_str:
            text = remove_full_words(text, [found_sour_str])

    return found_color, found_sour, text


def extract_other_attributes(text, remove=False):
    """Collect other known attributes found in *text*.

    For every key of ``OTHER_ATTRIBUTES_VARIATIONS_DICT`` the key itself is
    looked up first, then its variations. Each matched word is appended to
    the result list and removed from the text (whole words).

    Returns ``(other, text)``: the list of matched words and the text with
    those words removed.

    NOTE(review): *remove* is accepted but never read - matches are always
    removed from the returned text. Confirm whether callers rely on this
    before changing it.
    """
    other = []
    for attr, variations in OTHER_ATTRIBUTES_VARIATIONS_DICT.items():
        # Try the canonical attribute name first, then fall back to its variations.
        attr_word = find_full_word(text, [attr]) or find_full_word(text, variations)
        if attr_word:
            text = remove_full_words(text, [attr_word])
            other.append(attr_word)
    return other, text


def extract_years(text, remove=False):
    """
    Extracts a combination of a number and a word denoting age (for example: '50 лет', '21 years').
    # Add support for expressions like "не менее|for at least 3 лет"
    """
    # The regular expression looks for numbers and the words 'лет' or 'years', taking letter case into account
    # ron piet 20 y.o 40%
    #match = re.search(r'\b(?