Spaces:
Build error
Build error
| import re | |
| from constants.constants import * | |
| from preprocess.utils.common.utils import find_full_word, remove_full_words, detect_language_simple | |
def remove_brand_from_name(name, brand):
    """Strip whole-word occurrences of ``brand`` from ``name``.

    The brand is matched literally: it is passed through ``re.escape`` so
    that characters such as ``.`` or ``(`` in a brand are not interpreted as
    regex syntax (which previously caused false matches or ``re.error``).

    Returns the name without the brand, stripped of surrounding whitespace.
    If fewer than three characters would remain, the original ``name`` is
    returned unchanged.
    """
    pattern = r'\b' + re.escape(brand) + r'\b'
    name_wo_brand = re.sub(pattern, '', name).strip()
    # Do not reduce a name to (almost) nothing: keep the original instead.
    if len(name_wo_brand) < 3:
        return name
    return name_wo_brand
def insert_brand_in_name(name, brand):
    """Prefix ``name`` with ``brand`` unless the brand already occurs in it.

    The brand is looked up as a literal whole word: it is escaped with
    ``re.escape`` so that regex metacharacters in a brand neither produce
    false positives nor raise ``re.error``.

    Returns ``brand + " " + name`` when the brand is absent, otherwise
    ``name`` unchanged.
    """
    pattern = r'\b' + re.escape(brand) + r'\b'
    if not re.search(pattern, name):
        return brand + " " + name
    return name
def extract_spark(text, remove=False):
    """Detect a sparkling-wine ('шампанское') type word in ``text``.

    Returns ``('шампанское', text)`` when one of the variations listed under
    ``TYPES_VARIATIONS_DICT['шампанское']`` is found by ``find_full_word``,
    otherwise ``(None, text)``. With ``remove=True`` every occurrence of the
    matched string is replaced by a single space in the returned text.
    """
    # 'champagne' / 'шампань' can name a region rather than a drink type;
    # texts mentioning cognac are therefore never classified here.
    if 'коньяк' in text:
        return None, text

    matched = find_full_word(text, TYPES_VARIATIONS_DICT['шампанское'])
    if not matched:
        return None, text

    if remove:
        text = text.replace(matched, ' ')
    return 'шампанское', text
def extract_color(text):
    """Find the first colour from ``COLORS_VARIATIONS_DICT`` present in ``text``.

    Colours are tried in the dictionary's iteration order. Returns
    ``(color_key, matched_string)`` for the first colour whose variations
    are found by ``find_full_word``, or ``(None, None)`` if none match.
    """
    for color, variations in COLORS_VARIATIONS_DICT.items():
        hit = find_full_word(text, variations)
        if hit:
            return color, hit
    return None, None
def extract_sour(text):
    """Find the first entry of ``SOURS_VARIATIONS_DICT`` present in ``text``.

    Entries are tried in the dictionary's iteration order. Returns
    ``(sour_key, matched_string)`` for the first entry whose variations are
    found by ``find_full_word``, or ``(None, None)`` if none match.
    """
    for sour, variations in SOURS_VARIATIONS_DICT.items():
        hit = find_full_word(text, variations)
        if hit:
            return sour, hit
    return None, None
def extract_stars(text):
    """Extract a star rating such as '5 звезд' or 'три звезды' from ``text``.

    The count may be one or two digits or a Russian numeral word, followed
    by optional whitespace and a form of 'звезд' (or the abbreviation 'зв').
    Matching is case-insensitive.

    Returns the matched string twice, ``(match, match)``, so the result has
    the same (value, matched_string) shape as the other extractors; returns
    ``(None, None)`` when nothing matches.
    """
    # 'восемь' must be listed explicitly: without it "восемь звезд" matched
    # the embedded "семь" and was reported as seven stars. The historical
    # misspelling 'восьмь' is kept so inputs that relied on it still match.
    match = re.search(
        r'((одна|две|три|четыре|пять|шесть|семь|восемь|восьмь|девять|\d{1,2})'
        r'\s*(звезды|звезд|звёзд|зв))',
        text,
        re.IGNORECASE,
    )
    if match:
        return match.group(0), match.group(0)
    return None, None
def extract_color_and_sour(text, remove=False):
    """Extract the colour and sour attributes from ``text``.

    When no colour is found, a star rating (see ``extract_stars``) is stored
    in the colour slot instead: for non-wine products these attributes share
    the colour/sour fields for simplicity.

    With ``remove=True`` the matched sour string is removed from the text,
    and the matched colour string is removed only when
    ``detect_language_simple`` reports it as 'ru' and it is not 'розе'.

    Returns ``(color, sour, text)``.
    """
    color, color_str = extract_color(text)
    sour, sour_str = extract_sour(text)

    # Fallback for non-wine products: reuse the colour slot for stars.
    if not color:
        color, color_str = extract_stars(text)

    if not remove:
        return color, sour, text

    if color_str and detect_language_simple(color_str) == 'ru' and color_str != 'розе':
        text = remove_full_words(text, [color_str])
    if sour_str:
        text = remove_full_words(text, [sour_str])
    return color, sour, text
def extract_other_attributes(text, remove=False):
    """Collect every attribute from ``OTHER_ATTRIBUTES_VARIATIONS_DICT`` found in ``text``.

    For each attribute the key itself is looked up first, then its listed
    variations. Each matched word is removed from the text and appended to
    the result list.

    NOTE(review): ``remove`` is accepted but never consulted; matched words
    are always removed from the returned text.

    Returns ``(matched_words, text)``.
    """
    found = []
    for canonical, variations in OTHER_ATTRIBUTES_VARIATIONS_DICT.items():
        hit = find_full_word(text, [canonical]) or find_full_word(text, variations)
        if hit:
            text = remove_full_words(text, [hit])
            found.append(hit)
    return found, text
def extract_years(text, remove=False):
    """Extract an age statement such as '50 лет' or '21 years' from ``text``.

    Three forms are tried in order (all case-insensitive):
      1. one or two digits followed by an age unit ('лет', 'летний', 'года',
         'yo', 'year(s) old', 'ans', ...), e.g. 'ron piet 20 yo 40%';
      2. a spelled-out compound such as 'пятилетний';
      3. 'выдержка N лет' / 'выдержка N л'.

    Returns ``(age, text)`` where ``age`` is capture group 2 of whichever
    pattern matched: the digit string for forms 1 and 3, the numeral stem
    (e.g. 'пяти') for form 2. Returns ``(None, text)`` when nothing matches.
    With ``remove=True`` the matched span and the words 'выдержка', 'aging'
    and 'ageing' are replaced by spaces in the returned text.

    TODO: support expressions like 'не менее 3 лет' / 'for at least 3 years'.
    """
    match = re.search(
        r'\b(?<!\d)((\d{1,2})(\s*|\-)(лет|ти летний|летний|года|yo|y o|year old|year|years old|years|ans))\b',
        text,
        re.IGNORECASE,
    )
    if not match:
        match = re.search(
            r'\b((трех|четырех|пяти|шести|семи|восьми|девяти)летний)\b',
            text,
            re.IGNORECASE,
        )
    if not match:
        # The number gets its own group so that group(2) is the age here as
        # well; previously group(2) was the unit ('лет' / 'л').
        match = re.search(r'выдержка ((\d{1,2})\s*(лет|л))', text, re.IGNORECASE)
    if not match:
        return None, text

    if remove:
        text = text.replace(match.group(0), ' ')
        text = text.replace('выдержка', ' ').replace('aging', ' ').replace('ageing', ' ')
    return match.group(2), text
def extract_production_year(text, remove=False):
    """Extract a production year (a four-digit number 1900-2099) from ``text``.

    Example: '2019'. The year may be followed by optional whitespace, a
    'г' / 'г/у' marker and dots.

    Returns ``(year, text)`` with the year of the first match as a string,
    or ``(None, text)`` when no year is present. With ``remove=True`` every
    match of the pattern, plus any leftover 'г/у' and 'год урожая', is
    replaced by a space in the returned text.
    """
    year_pattern = r'\b(19\d{2}|20\d{2})\s*(г/у|г)*\.*(?:\b|$)'
    found = re.search(year_pattern, text)
    if found is None:
        return None, text

    if remove:
        cleaned = re.sub(year_pattern, ' ', text)
        for leftover in ("г/у", "год урожая"):
            cleaned = cleaned.replace(leftover, " ")
        text = cleaned
    return found.group(1), text
def extract_alcohol_content(text, remove=False):
    """Extract the alcohol content from ``text``.

    Examples: '40%' or a range such as '10-12%'.

    Returns ``(content, text)`` where ``content`` is the first match with
    spaces removed and decimal commas replaced by dots, or ``(None, text)``
    when nothing matches. With ``remove=True`` every match of the pattern is
    replaced by a space in the returned text.
    """
    abv_pattern = r'\b(\d{1,2}(?:[.,]\d+)?(?:\s*\-\s*\d{1,2}(?:[.,]\d+)?)?\s*%)'
    found = re.search(abv_pattern, text)
    if found is None:
        return None, text

    # Normalise for uniformity: no inner spaces, dot as decimal separator.
    normalized = found.group(1).replace(' ', '').replace(',', '.')
    if remove:
        text = re.sub(abv_pattern, ' ', text)
    return normalized, text
def is_volume(value):
    """Return ``value`` as a float if it is a valid volume (<= 10 litres).

    Returns ``None`` when the value is greater than 10 or cannot be
    converted to a float. Unsupported types such as ``None`` are treated
    like any other invalid input instead of raising ``TypeError``.

    Note that a volume of 0 is returned as ``0.0``, which is falsy.
    """
    try:
        volume = float(value)
    except (TypeError, ValueError):
        return None
    return volume if volume <= 10 else None
def extract_volume_or_number(text, remove=False):
    """Extract a volume in litres, or a bare decimal number, from ``text``.

    Examples: '0,75л', '0.5' or '1,5 л'.

    Two patterns are tried in order: a number followed by the litre letter
    ('л' or 'l'), then a bare decimal number with one or two integer digits.
    Either may be followed by a multiplier such as '*6' or 'x12'. A candidate
    is accepted only if ``is_volume`` returns a truthy value for it.

    Returns ``(volume, text)`` with the volume as a float, or ``(None, text)``.
    With ``remove=True`` every match of the accepted pattern is replaced by
    a space in the returned text.

    TODO: handle millilitres, e.g. '750 мл'.
    """
    # Number with an explicit litre letter, with or without a space before it.
    litre_pattern = r'((\d+(?:[\.,]\d+)?\s*[лl]|(?:\d+(?:[\.,]\d+)?[лl])))(\s*[/хx*]\s*(\d{1,2}))*'
    # Bare decimal number, not preceded by '№' and not followed by '%', '№',
    # an '-er'/'er' suffix or a run of three or more digits.
    bare_pattern = r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))(\s*[/хx*]\s*(\d{1,3}))*(?!\%)\b(?!\s*(№|-er|er|\d{3,}))'

    for pattern in (litre_pattern, bare_pattern):
        found = re.search(pattern, text)
        if not found:
            continue
        # Stripping the litre letter is a no-op for the bare-number pattern,
        # whose first group holds only digits and a decimal separator.
        raw = found.group(1).replace(',', '.').replace('л', '').replace('l', '').strip()
        vol = is_volume(raw)
        if vol:
            if remove:
                text = re.sub(pattern, ' ', text)
            return vol, text
    return None, text
def extract_gb(text, remove=False):
    """Find an entry of ``GBS`` in ``text`` and strip it from the text.

    Non-string or empty input is returned untouched as ``(None, text)``.
    When an entry is found, all its occurrences are replaced by a space; the
    text is then searched once more and a second entry, if any, is stripped
    the same way. Only the first entry found is returned.

    NOTE(review): ``remove`` is accepted but never consulted; matches are
    always stripped from the returned text.

    Returns ``(gb, text)``.
    """
    if not (isinstance(text, str) and text):
        return None, text

    gb = find_full_word(text, GBS)
    if not gb:
        return gb, text

    text = text.replace(gb, ' ')
    second = find_full_word(text, GBS)
    if second:
        text = text.replace(second, ' ')
    return gb, text
def extract_type_by_brand_name(text):
    """Return the value of the first ``TYPES_FROM_BRAND_DICT`` key found in ``text``.

    Keys are tried in the dictionary's iteration order using
    ``find_full_word``; ``None`` is returned when no key is found.
    """
    return next(
        (
            product_type
            for brand, product_type in TYPES_FROM_BRAND_DICT.items()
            if find_full_word(text, [brand])
        ),
        None,
    )
def extract_specific_brands_from_name(name):
    """Return the result of looking up ``SPECIFIC_BRANDS`` in ``name`` via ``find_full_word``."""
    specific_brand = find_full_word(name, SPECIFIC_BRANDS)
    return specific_brand
# Finds specific brands and names that collide with the parsing routine.
# For example, the brand 'Voskevaz 1932' makes the parsing algorithm think
# that '1932' is the year of the wine.
def replace_specific_brand_and_name(text):
    """Mask a specific brand and a specific name in ``text`` with placeholders.

    An entry of ``SPECIFIC_BRANDS`` found in the text is replaced by
    '###special_brand###'; then an entry of ``SPECIFIC_NAMES`` found in the
    resulting text is replaced by '###special_name###'.

    Returns ``(masked_text, special_brand, special_name)``; the last two are
    whatever ``find_full_word`` returned for the respective lookup.
    """
    special_brand = find_full_word(text, SPECIFIC_BRANDS)
    masked = text.replace(special_brand, '###special_brand###') if special_brand else text

    # The name lookup runs on the already brand-masked text.
    special_name = find_full_word(masked, SPECIFIC_NAMES)
    if special_name:
        masked = masked.replace(special_name, '###special_name###')
    return masked, special_brand, special_name
def restore_specific_brand_and_name(text, special_brand=None, special_name=None):
    """Put a masked brand and name back in place of their placeholders.

    '###special_name###' is replaced by ``special_name`` first, then
    '###special_brand###' by ``special_brand``; a falsy value leaves the
    corresponding placeholder untouched. Returns the resulting text.
    """
    substitutions = (
        ('###special_name###', special_name),
        ('###special_brand###', special_brand),
    )
    for placeholder, original in substitutions:
        if original:
            text = text.replace(placeholder, original)
    return text
def extract_attributes_from_name(text):
    """Run every attribute extractor over ``text``, stripping matches as it goes.

    Extractors run in a fixed order, each receiving the text left over by the
    previous one: gb, alcohol content, age, production year, volume,
    colour/sour, other attributes.

    Returns ``(text, alcohol, volume_or_number, years, production_year, gb,
    color, sour, other)`` where ``text`` is what remains after all removals.
    """
    gb, remaining = extract_gb(text, True)
    alcohol, remaining = extract_alcohol_content(remaining, True)
    years, remaining = extract_years(remaining, True)
    production_year, remaining = extract_production_year(remaining, True)
    volume_or_number, remaining = extract_volume_or_number(remaining, True)
    color, sour, remaining = extract_color_and_sour(remaining, True)
    other, remaining = extract_other_attributes(remaining, True)
    return (
        remaining,
        alcohol,
        volume_or_number,
        years,
        production_year,
        gb,
        color,
        sour,
        other,
    )