Spaces:

j-s-v
/

WineMatching

Build error

App Files Files Community

WineMatching / preprocess /utils /common /extracters.py

j-s-v

2025-07-28

d4bade4 10 months ago

raw

history blame contribute delete

9.89 kB

	import re
	from constants.constants import *
	from preprocess.utils.common.utils import find_full_word, remove_full_words, detect_language_simple

	def remove_brand_from_name(name, brand):
	    """Strip whole-word occurrences of ``brand`` from ``name``.

	    The brand is matched literally: regex metacharacters in it (``.``,
	    ``+``, ``(`` ...) are escaped, so they can neither widen the match nor
	    make the pattern invalid.

	    Args:
	        name: Product name to clean.
	        brand: Brand text to remove.

	    Returns:
	        ``name`` without the brand and with surrounding whitespace stripped,
	        or the original ``name`` when fewer than 3 characters would remain.
	    """
	    pattern = r'\b' + re.escape(brand) + r'\b'
	    name_wo_brand = re.sub(pattern, '', name).strip()
	    # A remainder this short is not a usable product name; keep the input.
	    if len(name_wo_brand) < 3:
	        return name

	    return name_wo_brand


	def insert_brand_in_name(name, brand):
	    """Prefix ``name`` with ``brand`` unless the brand is already present.

	    Presence is tested as a literal whole-word match: regex metacharacters
	    in ``brand`` are escaped, so they cannot cause false matches or an
	    invalid pattern.

	    Args:
	        name: Product name.
	        brand: Brand text expected to appear in the name.

	    Returns:
	        ``brand + " " + name`` when the brand is missing, otherwise ``name``.
	    """
	    pattern = r'\b' + re.escape(brand) + r'\b'
	    if not re.search(pattern, name):
	        return brand + " " + name

	    return name


	def extract_spark(text, remove=False):
	    """Detect a 'шампанское' type marker in ``text``.

	    Args:
	        text: Product description to inspect.
	        remove: When true, every occurrence of the matched marker string is
	            replaced with a single space.

	    Returns:
	        ``('шампанское', text)`` when one of the variations listed under
	        ``TYPES_VARIATIONS_DICT['шампанское']`` is found by
	        ``find_full_word``; ``(None, text)`` otherwise.
	    """
	    # For cognac, 'champagne' / 'шампань' names a region, not a drink type.
	    if 'коньяк' in text:
	        return None, text

	    marker = find_full_word(text, TYPES_VARIATIONS_DICT['шампанское'])
	    if not marker:
	        return None, text

	    if remove:
	        text = text.replace(marker, ' ')
	    return 'шампанское', text


	def extract_color(text):
	    """Find the first color whose variation occurs in ``text``.

	    Colors are tried in the iteration order of ``COLORS_VARIATIONS_DICT``.

	    Returns:
	        ``(color_key, matched_string)`` for the first hit reported by
	        ``find_full_word``, or ``(None, None)`` when nothing matches.
	    """
	    for color, variations in COLORS_VARIATIONS_DICT.items():
	        hit = find_full_word(text, variations)
	        if hit:
	            return color, hit

	    return None, None


	def extract_sour(text):
	    """Find the first sweetness ("sour") class whose variation occurs in ``text``.

	    Classes are tried in the iteration order of ``SOURS_VARIATIONS_DICT``.

	    Returns:
	        ``(sour_key, matched_string)`` for the first hit reported by
	        ``find_full_word``, or ``(None, None)`` when nothing matches.
	    """
	    for sour, variations in SOURS_VARIATIONS_DICT.items():
	        hit = find_full_word(text, variations)
	        if hit:
	            return sour, hit

	    return None, None

	def extract_stars(text):
	    """Extract a star rating such as '5 звезд' or 'пять звёзд' from ``text``.

	    The rating is a count (a Russian numeral word or 1-2 digits) followed,
	    after optional whitespace, by a form of the word "звезда" or the
	    abbreviation "зв". Matching is case-insensitive.

	    Returns:
	        ``(matched, matched)`` - the matched substring twice, mirroring the
	        ``(value, matched_string)`` shape of the other extractors - or
	        ``(None, None)`` when there is no rating.
	    """
	    # Alternatives are separated by a plain '|'; an escaped '\|' would match
	    # a literal pipe character. 'восемь' is the correct spelling; the legacy
	    # 'восьмь' form is kept so previously matching inputs still match.
	    match = re.search(
	        r'((одна|две|три|четыре|пять|шесть|семь|восемь|восьмь|девять|\d{1,2})'
	        r'\s*(звезды|звезд|звёзд|зв))',
	        text,
	        re.IGNORECASE,
	    )
	    if match:
	        return match.group(0), match.group(0)

	    return None, None


	def extract_color_and_sour(text, remove=False):
	    """Extract color and sweetness attributes, optionally removing them.

	    For non-wine products a star rating is stored in the color slot when no
	    color was found (kept in the same field for simplicity).

	    Args:
	        text: Product description.
	        remove: When true, the matched sweetness string is removed via
	            ``remove_full_words``. The matched color string is removed only
	            if ``detect_language_simple`` reports it as 'ru' and it is not
	            'розе'.

	    Returns:
	        ``(color, sour, text)``; ``color`` / ``sour`` are ``None`` when absent.
	    """
	    color, color_token = extract_color(text)
	    sour, sour_token = extract_sour(text)

	    # Fall back to a star rating when there is no color.
	    if not color:
	        color, color_token = extract_stars(text)

	    if not remove:
	        return color, sour, text

	    color_is_removable = (
	        color_token
	        and detect_language_simple(color_token) == 'ru'
	        and color_token != 'розе'
	    )
	    if color_is_removable:
	        text = remove_full_words(text, [color_token])

	    if sour_token:
	        text = remove_full_words(text, [sour_token])

	    return color, sour, text

	def extract_other_attributes(text, remove=False):
	    """Collect miscellaneous attributes found in ``text``.

	    For every key of ``OTHER_ATTRIBUTES_VARIATIONS_DICT`` the key itself is
	    looked up first, then its variations. Each hit is removed from ``text``
	    before the next key is examined.

	    Args:
	        text: Product description.
	        remove: Accepted for signature parity with the other extractors;
	            this function does not consult it and always removes hits.

	    Returns:
	        ``(found, text)`` - the list of matched strings in lookup order and
	        the text with those strings removed.
	    """
	    found = []

	    for attr, variations in OTHER_ATTRIBUTES_VARIATIONS_DICT.items():
	        hit = find_full_word(text, [attr]) or find_full_word(text, variations)
	        if not hit:
	            continue
	        text = remove_full_words(text, [hit])
	        found.append(hit)

	    return found, text


	def extract_years(text, remove=False):
	    """Extract an age statement such as '50 лет' or '21 years' from ``text``.

	    Three forms are tried in order (case-insensitive):

	    1. 1-2 digits + optional whitespace or '-' + a unit
	       ('лет', 'летний', 'года', 'yo', 'years old', 'ans', ...).
	    2. A Russian compound such as 'пятилетний'.
	    3. 'выдержка N лет' / 'выдержка N л'.

	    Args:
	        text: Product description.
	        remove: When true, the matched fragment is replaced with a space and
	            the words 'выдержка', 'aging' and 'ageing' are blanked as well.

	    Returns:
	        ``(age, text)``. ``age`` is the digit string for forms 1 and 3, the
	        numeral stem (e.g. 'пяти') for form 2, or ``None`` when nothing
	        matched.

	    TODO: support phrases like "не менее 3 лет" / "for at least 3 years".
	    """
	    # Alternatives use a plain '|'; an escaped '\|' would only match a
	    # literal pipe character and the pattern would never fire.
	    match = re.search(
	        r'\b(?<!\d)((\d{1,2})(\s*|\-)'
	        r'(лет|ти летний|летний|года|yo|y o|year old|year|years old|years|ans))\b',
	        text,
	        re.IGNORECASE,
	    )
	    if not match:
	        match = re.search(
	            r'\b((трех|четырех|пяти|шести|семи|восьми|девяти)летний)\b',
	            text,
	            re.IGNORECASE,
	        )
	    if not match:
	        # Group 2 must be the number (as in the first pattern), not the
	        # unit, because group 2 is what gets returned below.
	        match = re.search(r'выдержка ((\d{1,2})\s*(?:лет|л))', text, re.IGNORECASE)

	    if match:
	        if remove:
	            text = text.replace(match.group(0), ' ')
	            text = text.replace('выдержка', ' ').replace('aging', ' ').replace('ageing', ' ')
	        return match.group(2), text

	    return None, text


	def extract_production_year(text, remove=False):
	    """Extract a production (vintage) year from ``text``.

	    A year is a four-digit number in 1900-2099 followed by one whitespace
	    character and the marker 'г' or 'г/у' (optionally with trailing dots),
	    e.g. '2019 г.'. A bare number such as '1932' is deliberately not
	    treated as a year.

	    Args:
	        text: Product description.
	        remove: When true, every matching year fragment is replaced with a
	            space, and leftover 'г/у' / 'год урожая' markers are blanked.

	    Returns:
	        ``(year, text)`` with ``year`` as a 4-character string, or
	        ``(None, text)`` when no year is present.
	    """
	    # Alternatives use a plain '|'; an escaped '\|' would match a literal
	    # pipe character instead of acting as alternation.
	    pattern = r'\b(19\d{2}|20\d{2})\s(г/у|г)\.*(?:\b|$)'
	    match = re.search(pattern, text)
	    if match:
	        if remove:
	            text = re.sub(pattern, ' ', text).replace("г/у", " ").replace("год урожая", " ")
	        return match.group(1), text

	    return None, text


	def extract_alcohol_content(text, remove=False):
	    """Extract an alcohol strength such as '40%' or '10 - 12 %' from ``text``.

	    Args:
	        text: Product description.
	        remove: When true, every fragment matching the strength pattern is
	            replaced with a single space.

	    Returns:
	        ``(strength, text)``. ``strength`` is the first match with spaces
	        dropped and decimal commas turned into dots (e.g. '12.5%',
	        '10-12%'), or ``None`` when there is no match.
	    """
	    pattern = r'\b(\d{1,2}(?:[.,]\d+)?(?:\s\-\s\d{1,2}(?:[.,]\d+)?)?\s*%)'
	    found = re.search(pattern, text)
	    if found is None:
	        return None, text

	    # Normalise the value: no inner spaces, dot as the decimal separator.
	    strength = found.group(1).replace(' ', '').replace(',', '.')
	    if remove:
	        text = re.sub(pattern, ' ', text)
	    return strength, text


	def is_volume(value):
	    """Parse ``value`` as a volume in litres.

	    Returns:
	        The value as ``float`` when it parses and is at most 10, otherwise
	        ``None``. Unparseable input, including ``None``, yields ``None``
	        instead of raising.
	    """
	    try:
	        volume = float(value)
	    except (TypeError, ValueError):
	        return None
	    return volume if volume <= 10 else None


	def extract_volume_or_number(text, remove=False):
	    """Extract a volume in litres, or a bare decimal number, from ``text``.

	    Examples of recognised input: '0,75л', '1,5 л', '0.5', '0,5 x 20'.
	    A number carrying the 'л' / 'l' unit is preferred; if none yields a
	    valid volume, a bare decimal number is accepted, optionally followed by
	    a pack multiplier (' x N', ' х N' or ' / N').

	    Args:
	        text: Product description.
	        remove: When true, every fragment matching the successful pattern
	            is replaced with a single space.

	    Returns:
	        ``(volume, text)`` with ``volume`` as ``float`` (non-zero, at most
	        10), or ``(None, text)`` when no valid volume was found.

	    TODO: handle millilitres, e.g. '750 мл'.
	    """
	    # Number with the litre unit, with or without a space before the unit.
	    # Alternatives use a plain '|'; '\|' would match a literal pipe.
	    pattern = r'((\d+(?:[\.,]\d+)?\s[лl]|(?:\d+(?:[\.,]\d+)?[лl])))(\s[/хx]\s(\d{1,2}))*'
	    match_with_l = re.search(pattern, text)
	    if match_with_l:
	        raw = match_with_l.group(1).replace(',', '.').replace('л', '').replace('l', '').strip()
	        vol = is_volume(raw)
	        if vol:
	            if remove:
	                text = re.sub(pattern, ' ', text)
	            return vol, text

	    # No unit found: accept a bare decimal number. The pack multiplier is
	    # optional so that plain values such as '0.5' are recognised; numbers
	    # followed by '%', '№', '-er'/'er' or a 3+ digit number are rejected.
	    pattern = r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))(\s[/хx]\s(\d{1,3}))*(?!\%)\b(?!\s*(№|-er|er|\d{3,}))'
	    match_number = re.search(pattern, text)
	    if match_number:
	        vol = is_volume(match_number.group(1).replace(',', '.').strip())
	        if vol:
	            if remove:
	                text = re.sub(pattern, ' ', text)
	            return vol, text

	    return None, text


	def extract_gb(text, remove=False):
	    """Extract a ``GBS`` marker from ``text`` and blank it out.

	    ``find_full_word`` is run twice against ``GBS``; each hit is replaced
	    with a space, so up to two distinct markers are removed.

	    Args:
	        text: Product description. Non-string or empty values are returned
	            untouched.
	        remove: Accepted for signature parity with the other extractors;
	            this function does not consult it and always removes hits.

	    Returns:
	        ``(first_hit, text)`` - the result of the first lookup (falsy when
	        nothing matched) and the cleaned text.
	    """
	    if not (isinstance(text, str) and text):
	        return None, text

	    hits = []
	    for _ in range(2):
	        hit = find_full_word(text, GBS)
	        hits.append(hit)
	        if hit:
	            text = text.replace(hit, ' ')

	    return hits[0], text


	def extract_type_by_brand_name(text):
	    """Return the drink type implied by a brand mentioned in ``text``.

	    Brands are tried in the iteration order of ``TYPES_FROM_BRAND_DICT``;
	    the value mapped to the first brand that ``find_full_word`` locates is
	    returned, or ``None`` when no listed brand is present.
	    """
	    return next(
	        (
	            drink_type
	            for brand, drink_type in TYPES_FROM_BRAND_DICT.items()
	            if find_full_word(text, [brand])
	        ),
	        None,
	    )


	def extract_specific_brands_from_name(name):
	    """Return the ``SPECIFIC_BRANDS`` entry that ``find_full_word`` locates in ``name``.

	    The lookup result is passed through unchanged.
	    """
	    matched_brand = find_full_word(name, SPECIFIC_BRANDS)
	    return matched_brand


	def replace_specific_brand_and_name(text):
	    """Mask brands and names that would confuse the attribute parsers.

	    Some brands collide with the parsing routine: for example, in
	    'Voskevaz 1932' the parser would take '1932' for the year of the wine.
	    Such fragments are swapped for placeholders, which
	    ``restore_specific_brand_and_name`` can later swap back.

	    Returns:
	        ``(text, special_brand, special_name)`` - the masked text and the
	        ``find_full_word`` results for ``SPECIFIC_BRANDS`` and
	        ``SPECIFIC_NAMES`` (falsy when nothing was found).
	    """
	    brand_hit = find_full_word(text, SPECIFIC_BRANDS)
	    if brand_hit:
	        text = text.replace(brand_hit, '###special_brand###')

	    # The name lookup runs on the text with the brand already masked.
	    name_hit = find_full_word(text, SPECIFIC_NAMES)
	    if name_hit:
	        text = text.replace(name_hit, '###special_name###')

	    return text, brand_hit, name_hit


	def restore_specific_brand_and_name(text, special_brand=None, special_name=None):
	    """Put masked brand / name fragments back into ``text``.

	    Args:
	        text: Text containing '###special_name###' and/or
	            '###special_brand###' placeholders.
	        special_brand: Original brand text; the brand placeholder is left
	            alone when this is falsy.
	        special_name: Original name text; the name placeholder is left
	            alone when this is falsy.

	    Returns:
	        The text with the placeholders replaced (name first, then brand).
	    """
	    substitutions = (
	        ('###special_name###', special_name),
	        ('###special_brand###', special_brand),
	    )
	    for placeholder, original in substitutions:
	        if original:
	            text = text.replace(placeholder, original)

	    return text


	def extract_attributes_from_name(text):
	    """Run every attribute extractor over ``text``, stripping what each finds.

	    The extractors run in a fixed order, each receiving the text left by
	    the previous one: GB marker, alcohol content, age, production year,
	    volume, color / sweetness, other attributes.

	    Returns:
	        ``(text, alcohol, volume_or_number, years, production_year, gb,
	        color, sour, other)`` where ``text`` is the remainder after all
	        removals.
	    """
	    gb, text = extract_gb(text, remove=True)
	    alcohol, text = extract_alcohol_content(text, remove=True)
	    years, text = extract_years(text, remove=True)
	    production_year, text = extract_production_year(text, remove=True)
	    volume_or_number, text = extract_volume_or_number(text, remove=True)
	    color, sour, text = extract_color_and_sour(text, remove=True)
	    other, text = extract_other_attributes(text, remove=True)

	    return (
	        text,
	        alcohol,
	        volume_or_number,
	        years,
	        production_year,
	        gb,
	        color,
	        sour,
	        other,
	    )