# WineMatching/preprocess/utils/common/extracters.py
# Repository page header captured with this file: "j-s-v's picture", 2025-07-28, d4bade4
import re
from constants.constants import *
from preprocess.utils.common.utils import find_full_word, remove_full_words, detect_language_simple
def remove_brand_from_name(name, brand):
    """Strip whole-word occurrences of ``brand`` from ``name``.

    Args:
        name: Product name to clean.
        brand: Brand text to remove; treated as a literal string, not a regex.

    Returns:
        ``name`` with the brand removed and outer whitespace stripped, or the
        original ``name`` untouched when fewer than 3 characters would remain.
    """
    # Escape the brand so characters such as '.', '(' or '+' are matched
    # literally instead of being interpreted as regex syntax.
    pattern = r'\b' + re.escape(brand) + r'\b'
    name_wo_brand = re.sub(pattern, '', name).strip()
    # A remainder this short is not a usable name; keep the input as is.
    if len(name_wo_brand) < 3:
        return name
    return name_wo_brand
def insert_brand_in_name(name, brand):
    """Prefix ``name`` with ``brand`` unless the brand already occurs in it.

    Args:
        name: Product name.
        brand: Brand text; treated as a literal string, not a regex.

    Returns:
        ``brand + " " + name`` when the brand is not found as a whole word in
        ``name``; otherwise ``name`` unchanged.
    """
    # Escape the brand so regex metacharacters in it are matched literally.
    pattern = r'\b' + re.escape(brand) + r'\b'
    if re.search(pattern, name):
        return name
    return brand + " " + name
def extract_spark(text, remove=False):
    """Detect a 'шампанское' type mention in ``text``.

    Args:
        text: String to inspect.
        remove: When true, every occurrence of the matched variation is
            replaced with a single space in the returned text.

    Returns:
        ``('шампанское', text)`` when one of the variations listed under
        ``TYPES_VARIATIONS_DICT['шампанское']`` is found, else ``(None, text)``.
    """
    # 'champagne' / 'шампань' may denote a region rather than a drink type;
    # texts mentioning 'коньяк' are therefore never classified here.
    if 'коньяк' in text:
        return None, text

    matched = find_full_word(text, TYPES_VARIATIONS_DICT['шампанское'])
    if not matched:
        return None, text

    cleaned = text.replace(matched, ' ') if remove else text
    return 'шампанское', cleaned
def extract_color(text):
    """Find the first color from ``COLORS_VARIATIONS_DICT`` mentioned in ``text``.

    Returns:
        ``(color_key, matched_string)`` for the first key whose variations
        yield a hit from ``find_full_word``, or ``(None, None)`` if none do.
    """
    for color, variations in COLORS_VARIATIONS_DICT.items():
        hit = find_full_word(text, variations)
        if hit:
            return color, hit
    return None, None
def extract_sour(text):
    """Find the first entry of ``SOURS_VARIATIONS_DICT`` mentioned in ``text``.

    Returns:
        ``(sour_key, matched_string)`` for the first key whose variations
        yield a hit from ``find_full_word``, or ``(None, None)`` if none do.
    """
    for sour, variations in SOURS_VARIATIONS_DICT.items():
        hit = find_full_word(text, variations)
        if hit:
            return sour, hit
    return None, None
def extract_stars(text):
    """Find a star rating such as '5 зв' or 'три звезды' in ``text``.

    The count may be a 1-2 digit number or a Russian numeral word; matching
    is case-insensitive.

    Returns:
        ``(matched, matched)`` where both items are the matched substring
        (the same value is returned twice), or ``(None, None)`` when nothing
        is found.
    """
    # 'восемь' must come before 'семь' can be tried at a later position:
    # without it, "восемь звезд" was reported as "семь звезд". The misspelt
    # 'восьмь' is kept so inputs that matched before still match.
    match = re.search(
        r'((одна|две|три|четыре|пять|шесть|семь|восемь|восьмь|девять|\d{1,2})\s*(звезды|звезд|звёзд|зв))',
        text,
        re.IGNORECASE,
    )
    if match:
        return match.group(0), match.group(0)
    return None, None
def extract_color_and_sour(text, remove=False):
    """Extract the color and sour attributes from ``text``.

    When no color is found, a star rating (see ``extract_stars``) is reported
    in the color slot instead.

    Args:
        text: String to inspect.
        remove: When true, the matched color string (only if
            ``detect_language_simple`` reports 'ru' for it and it is not
            'розе') and the matched sour string are removed from the text.

    Returns:
        ``(color, sour, text)``.
    """
    color, color_str = extract_color(text)
    sour, sour_str = extract_sour(text)

    # Non-wine products: reuse the color slot for a star rating.
    if not color:
        color, color_str = extract_stars(text)

    if not remove:
        return color, sour, text

    color_is_removable = (
        color_str
        and detect_language_simple(color_str) == 'ru'
        and color_str != 'розе'
    )
    if color_is_removable:
        text = remove_full_words(text, [color_str])
    if sour_str:
        text = remove_full_words(text, [sour_str])
    return color, sour, text
def extract_other_attributes(text, remove=False):
    """Collect every attribute from ``OTHER_ATTRIBUTES_VARIATIONS_DICT`` found in ``text``.

    For each key, the key itself is tried first and its variations second.
    Each hit is removed from the text before the next key is checked.

    Returns:
        ``(matched_strings, text)``: the list of matched strings in dictionary
        order, and the text with those strings removed.
    """
    # NOTE(review): `remove` is accepted but never read - hits are always
    # stripped from the returned text. Confirm with callers before changing.
    found = []
    for canonical, variations in OTHER_ATTRIBUTES_VARIATIONS_DICT.items():
        hit = find_full_word(text, [canonical]) or find_full_word(text, variations)
        if hit:
            text = remove_full_words(text, [hit])
            found.append(hit)
    return found, text
def extract_years(text, remove=False):
    """Extract an age statement (e.g. '50 лет', '21 years') from ``text``.

    Three patterns are tried in order, all case-insensitive:
      1. a 1-2 digit number followed by an age unit ('лет', 'yo', 'years', ...);
      2. a Russian compound such as 'пятилетний';
      3. 'выдержка N лет' / 'выдержка N л'.

    Args:
        text: String to inspect.
        remove: When true, the matched span and the words 'выдержка',
            'aging' and 'ageing' are replaced with spaces in the returned text.

    Returns:
        ``(age, text)``. ``age`` is the digit string for patterns 1 and 3 and
        the numeral stem (e.g. 'пяти') for pattern 2; ``None`` when nothing
        matches.

    TODO: support phrases like "не менее|for at least 3 лет".
    """
    # Every pattern exposes the age token as group 2; group 0 is the span
    # that gets removed. The third pattern previously had the unit in group 2,
    # so 'выдержка 5 л' returned 'л' instead of '5'.
    patterns = (
        r'\b(?<!\d)((\d{1,2})(\s*|\-)(лет|ти летний|летний|года|yo|y o|year old|year|years old|years|ans))\b',
        r'\b((трех|четырех|пяти|шести|семи|восьми|девяти)летний)\b',
        r'выдержка ((\d{1,2})\s*(лет|л))',
    )
    match = None
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            break
    if not match:
        return None, text

    if remove:
        text = text.replace(match.group(0), ' ')
        text = text.replace('выдержка', ' ').replace('aging', ' ').replace('ageing', ' ')
    return match.group(2), text
def extract_production_year(text, remove=False):
    """Extract a production year (four digits, 1900-2099) from ``text``.

    The year may be followed by an optional 'г' / 'г/у' marker and dots,
    e.g. '2019' or '2015 г/у'.

    Args:
        text: String to inspect.
        remove: When true, every span matching the year pattern is replaced
            with a space, and any leftover 'г/у' or 'год урожая' markers are
            blanked as well.

    Returns:
        ``(year, text)`` with ``year`` as a string taken from the first match,
        or ``(None, text)`` when no year is present.
    """
    year_re = re.compile(r'\b(19\d{2}|20\d{2})\s*(г/у|г)*\.*(?:\b|$)')
    found = year_re.search(text)
    if found is None:
        return None, text

    if remove:
        cleaned = year_re.sub(' ', text)
        for marker in ("г/у", "год урожая"):
            cleaned = cleaned.replace(marker, " ")
        text = cleaned
    return found.group(1), text
def extract_alcohol_content(text, remove=False):
    """Extract an alcohol percentage such as '40%' or '10-12%' from ``text``.

    Args:
        text: String to inspect.
        remove: When true, every span matching the percentage pattern is
            replaced with a space in the returned text.

    Returns:
        ``(value, text)`` where ``value`` is the first match with spaces
        dropped and commas turned into dots (e.g. '10-12.5%'), or
        ``(None, text)`` when nothing matches.
    """
    abv_re = re.compile(r'\b(\d{1,2}(?:[.,]\d+)?(?:\s*\-\s*\d{1,2}(?:[.,]\d+)?)?\s*%)')
    found = abv_re.search(text)
    if found is None:
        return None, text

    if remove:
        text = abv_re.sub(' ', text)
    # Normalise to a compact form with a dot as the decimal separator.
    normalised = found.group(1).replace(' ', '').replace(',', '.')
    return normalised, text
def is_volume(value):
    """Return ``value`` as a float when it is a plausible volume (<= 10 litres).

    Args:
        value: Anything ``float()`` accepts, typically a numeric string.

    Returns:
        The parsed float when it is at most 10, otherwise ``None``. ``None``
        is also returned when ``value`` cannot be converted to a float.
        Note that a parsed value of 0 is returned as ``0.0``, which is falsy.
    """
    try:
        volume = float(value)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects, which are
        # treated like any other unparsable input instead of crashing.
        return None
    return volume if volume <= 10 else None
def extract_volume_or_number(text, remove = False):
    """Extract a volume in litres, or a bare decimal number, from ``text``.

    Examples of recognised input: '0,75л', '0.5', '1,5 л'. An optional pack
    multiplier after the number (e.g. '0.75 x 6') is part of the matched span.

    Two attempts are made:
      1. a number followed by 'л' / 'l';
      2. a decimal number with no unit, skipped when it is followed by '%'
         or adjacent to the other markers excluded by the pattern's
         lookarounds ('№', 'er', a 3+ digit number).
    A candidate is accepted only when ``is_volume`` returns a truthy value.

    Args:
        text: String to inspect.
        remove: When true, every span matching the successful pattern is
            replaced with a space in the returned text.

    Returns:
        ``(volume, text)`` with ``volume`` as a float, or ``(None, text)``.

    TODO: handle millilitres, e.g. '750 мл'.
    """
    # Attempt 1: number with an explicit litre suffix.
    litre_pattern = r'((\d+(?:[\.,]\d+)?\s*[лl]|(?:\d+(?:[\.,]\d+)?[лl])))(\s*[/хx*]\s*(\d{1,2}))*'
    litre_hit = re.search(litre_pattern, text)
    if litre_hit:
        raw = litre_hit.group(1).replace(',', '.').replace('л', '').replace('l', '').strip()
        litres = is_volume(raw)
        if litres:
            cleaned = re.sub(litre_pattern, ' ', text) if remove else text
            return litres, cleaned

    # Attempt 2: unit-less decimal number (optionally followed by a multiplier).
    number_pattern = r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))(\s*[/хx*]\s*(\d{1,3}))*(?!\%)\b(?!\s*(№|-er|er|\d{3,}))'
    number_hit = re.search(number_pattern, text)
    if number_hit:
        raw = number_hit.group(1).replace(',', '.').strip()
        litres = is_volume(raw)
        if litres:
            cleaned = re.sub(number_pattern, ' ', text) if remove else text
            return litres, cleaned

    return None, text
def extract_gb(text, remove=False):
    """Find an entry of ``GBS`` in ``text`` and strip it out.

    After the first hit is blanked, the text is searched once more and a
    second hit, if any, is blanked too (only the first hit is returned).

    Returns:
        ``(first_hit, text)``. For empty or non-string input the result is
        ``(None, text)`` with ``text`` returned unchanged.
    """
    # NOTE(review): `remove` is accepted but never read - hits are always
    # stripped from the returned text. Confirm with callers before changing.
    if not (isinstance(text, str) and text):
        return None, text

    first_hit = find_full_word(text, GBS)
    if not first_hit:
        return first_hit, text

    text = text.replace(first_hit, ' ')
    second_hit = find_full_word(text, GBS)
    if second_hit:
        text = text.replace(second_hit, ' ')
    return first_hit, text
def extract_type_by_brand_name(text):
    """Return the value of the first ``TYPES_FROM_BRAND_DICT`` key found in ``text``.

    Keys are checked in dictionary order via ``find_full_word``; ``None`` is
    returned when no key is found.
    """
    return next(
        (
            mapped_type
            for marker, mapped_type in TYPES_FROM_BRAND_DICT.items()
            if find_full_word(text, [marker])
        ),
        None,
    )
def extract_specific_brands_from_name(name):
    """Return the result of looking up ``SPECIFIC_BRANDS`` in ``name`` via ``find_full_word``."""
    brand_hit = find_full_word(name, SPECIFIC_BRANDS)
    return brand_hit
def replace_specific_brand_and_name(text):
    """Mask special brands and names that would confuse attribute parsing.

    Some brands collide with the parsing routine: for example, in the brand
    'Voskevaz 1932' the parser would take '1932' for the wine's year. A found
    ``SPECIFIC_BRANDS`` entry is replaced with '###special_brand###' and a
    found ``SPECIFIC_NAMES`` entry with '###special_name###'.

    Returns:
        ``(masked_text, special_brand, special_name)``; the last two are the
        ``find_full_word`` results for the brand and name lookups.
    """
    brand_hit = find_full_word(text, SPECIFIC_BRANDS)
    masked = text.replace(brand_hit, '###special_brand###') if brand_hit else text

    # The name lookup runs on the already brand-masked text.
    name_hit = find_full_word(masked, SPECIFIC_NAMES)
    if name_hit:
        masked = masked.replace(name_hit, '###special_name###')
    return masked, brand_hit, name_hit
def restore_specific_brand_and_name(text, special_brand=None, special_name=None):
    """Put masked brand/name strings back in place of their placeholders.

    '###special_name###' is substituted first, then '###special_brand###'.
    A placeholder is left untouched when its replacement value is falsy.
    """
    substitutions = (
        ('###special_name###', special_name),
        ('###special_brand###', special_brand),
    )
    for placeholder, original in substitutions:
        if original:
            text = text.replace(placeholder, original)
    return text
def extract_attributes_from_name(text):
    """Run every extractor over ``text``, stripping each attribute as it is found.

    The extractors run in a fixed order, each receiving the text left over by
    the previous one: gb, alcohol content, age, production year, volume,
    color/sour, other attributes.

    Returns:
        ``(text, alcohol, volume_or_number, years, production_year, gb,
        color, sour, other)`` where ``text`` is the remaining text.
    """
    gb_value, rest = extract_gb(text, True)
    abv, rest = extract_alcohol_content(rest, True)
    age, rest = extract_years(rest, True)
    vintage, rest = extract_production_year(rest, True)
    volume, rest = extract_volume_or_number(rest, True)
    color, sour, rest = extract_color_and_sour(rest, True)
    other, rest = extract_other_attributes(rest, True)
    return rest, abv, volume, age, vintage, gb_value, color, sour, other