Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Files Community

Gainward777 committed on Feb 28, 2025

Commit

0612fb9

verified ·

1 Parent(s): cbb77ba

Upload Funcs.py

Browse files

Files changed (1) hide show

Funcs.py +197 -197

Funcs.py CHANGED Viewed

@@ -27,31 +27,31 @@ from collections import Counter
-def check_spark(row, col_name='name', types=['��������', '���']):
     if col_name in row.keys():
         for t in types:
-            if t.lower() in row[col_name].lower() and '��������' not in row[col_name].lower():
-                return '��������'
         return None
-def check_color_and_sour(row, col_name='type_wine', types=['�����', '�������', '�������']):
       if col_name in row.keys():
           for t in types:
               if t.lower() in row[col_name].lower():
-                  return '����'
           return None
 def is_type_exist(row, types):
     for t in types:
-        if t.lower() in row['type'].lower():  # ��������� ��� ����� ��������
             return t
     return None
 def check_type(row, types):
     #checker=False
     for t in types:
-        if t.lower() in row['name'].lower():  # ��������� ��� ����� ��������
             return t
     return None
@@ -69,19 +69,19 @@ def get_type(row, types):
 def extract_years(text):
     """
-    ��������� ��������� ����� � �����, ������������ ������� (��������: '50 ���', '21 years').
     """
-    # ���������� ��������� ���� ����� � ����� '���' ��� 'years' � ������ ��������
-    match = re.search(r'\b(?<!\d)(\d{1,2})\s*(���|years)\b', text, re.IGNORECASE)
     if match:
-        # �������� ����� '���' ��� 'years' � ��������� ��������
         return f"{match.group(1)} {match.group(2)}"
     return None
 def extract_production_year(text):
     """
-    ��������� ��� ������������ (�������������� ����� � ��������� 1900�2099) �� ������.
-    ��������: '2019'.
     """
     match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
     if match:
@@ -90,19 +90,19 @@ def extract_production_year(text):
 def extract_alcohol_content(text):
     """
-    ��������� ���������� �������� �� ������.
-    ��������: '40%'.
     """
     match = re.search(r'(\d{1,2}(?:[.,]\d+)?\s*%)', text)
     if match:
-        # �������� ������� �� ����� ��� ������������ (���� �����)
         return match.group(1).replace(' ', '').replace(',', '.')
     return None
 def is_volume(value):
     """
-    ���������, �������� �� �������� �������� ������� (<= 10 ������).
     """
     try:
         volume = float(value)
@@ -112,16 +112,16 @@ def is_volume(value):
 def extract_volume_or_number(text):
     """
-    ��������� ����� � ������ ��� ����� � ��������� ������ �� ������.
-    ��������: '0,75�', '0.5', ��� '1,5 �'.
     """
-    # ������� ����� ����� � ������ '�' ��� ��� ������� ����� ���
-    match_with_l = re.search(r'(\d+(?:[\.,]\d+)?\s*[��]|(?:\d+(?:[\.,]\d+)?[��]))', text)
     if match_with_l:
-        return is_volume(match_with_l.group(1).replace(',', '.').replace('�', '').replace('�', '').strip())
-    # ���� �� �������, ���� ������ ����� � ��������� ������
-    match_number = re.search(r'(?<!�)\b(\d{1,2}(?:[\.,]\d+))\b(?!\s*(�|-er|er|\d{3,}))', text)
     if match_number:
         return is_volume(match_number.group(1).replace(',', '.'))
@@ -130,37 +130,37 @@ def extract_volume_or_number(text):
 def get_sour(s):
     """
-    ��������� �� ������ �������� �����, ���� ��� ������������ ��� ��������� �����.
-    ���������� ������������� �������� �����/����� ��� ��������, ��� ����� � ����� ����������
-    ��������� ����� ��� ��������-�������� ��������.
     Args:
-        s (str): �������� ������.
     Returns:
-        str or None: ��������� �������� �����, ���� ��� ������������ ��� ��������� �����, ����� None.
     """
-    # ������ �������� ����
     keywords = [
         r'brut',
         r'semi-sweet',
         r'sweet',
-        r'����',
-        r'�����',
-        r'���������',
-        r'�����������',
-        r'�������',
-        r'�/���',
-        r'�/��',
-        r'�/�',
-        r'��',
-        r'���'
     ]
-    # �������� ������ � �������������� ���������� �������� ����� � �����,
-    # ����� ���������, ��� ���������� �� �������� ������ ����� �������� �����.
-    # (?<!\w) - ����� ����������� �� ������ ���� ������� [a-zA-Z0-9_]
-    # (?!\w)  - ����� ���������� �� ������ ���� ������� [a-zA-Z0-9_]
     pattern = re.compile(r'(?<!\w)(?:' + '|'.join(keywords) + r')(?!\w)', re.IGNORECASE)
     match = pattern.search(s)
@@ -172,29 +172,29 @@ def get_sour(s):
 def get_color(s):
     """
-    ��������� ������, ���������� ���������� � ���������� ��������,
-    � ���������� �� � ���� ������� � ���������.
     Args:
-        strings (list): ������ �����.
     Returns:
-        dict: �������, ��� ����� � ������� �����, � �������� � ������ � ������������ � ���������� ��������.
     """
-    # ������ �������� ���� � ���� ��� ������
-    keywords = [r'�������',
-                r'�����',
-                r'�������'
-                r'��',
-                r'���',
-                r'����',
                 r'rosso',
                 r'roso',
                 r'roseto',
                 r'rosetto',
                 r'red',
                 r'white']
-    # ������� ������ ����������� ���������
     pattern = re.compile('|'.join(keywords), re.IGNORECASE)
     #gift_box_phrases={}
     #for idx, s in enumerate(strings):
@@ -207,16 +207,16 @@ def get_color(s):
 def get_GB(s):
     """
-    ��������� ������, ���������� ���������� � ���������� ��������,
-    � ���������� �� � ���� ������� � ���������.
     Args:
-        strings (list): ������ �����.
     Returns:
-        dict: �������, ��� ����� � ������� �����, � �������� � ������ � ������������ � ���������� ��������.
     """
-    # ������ �������� ���� � ���� ��� ������
     keywords = [r'cristal decanter in oak gift box',
                 r'in the carton gift box with 2 glasses',
                 r'decanter in the carton gift box',
@@ -239,30 +239,30 @@ def get_GB(s):
                 r'in wood case'
                 r'in wood box',
                 r'in wood',
-                r'����������� �������� � ���������� �������� �� ����',
-                r'�������� � ���������� �������� �� �������',
-                r'� ���������� �������� �� ������� � 2 ��������'
-                r'� ���������� �������� �� �������',
-                r'� ���������� �������� �� ����',
-                r'� � � ������ � ���������� �������',
-                r'� ���������� ��������',
-                r'���������� ��������',
-                r'���������� �����',
-                r'� ���������� �������',
-                r'���������� �������',
-                r'� �/�+2 ���������',
-                r'� �/� �� �������',
-                r'� �/�+�����',
-                r'� �/� (���.�������)',
-                r'� �/� ������',
-                r'� �/�',
-                r'� � �',
-                r'�/��',
-                r'�/�',
-                r'� ����',
-                r'����',
-                r'��']
-    # ������� ������ ����������� ���������
     pattern = re.compile('|'.join(keywords), re.IGNORECASE)
     #gift_box_phrases={}
     #for idx, s in enumerate(strings):
@@ -291,7 +291,7 @@ def prcess_text(origin):
     if volume_or_number is not None:
         volume_with_comma=str(volume_or_number).replace('.', ',')
         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
-        text=text.replace(str(volume_or_number)+' �', '').replace(str(volume_with_comma)+' �', '')
     # else:
     #     volume_or_number=re_extract_volume(text)
     #     if volume_or_number is not None:
@@ -299,7 +299,7 @@ def prcess_text(origin):
     #         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
     years = extract_years(text)
     if years is not None:
-        text=text.replace(str(years), '').replace(str('��������'), '').replace(str('��������'), '').replace(str('aging'), '')
     production_year = extract_production_year(text)
     if production_year is not None:
         text=text.replace(str(production_year), '')
@@ -322,30 +322,30 @@ def prcess_text(origin):
 def remove_l(text):
-    result = re.sub(r'\b�\b', '', text, flags=re.IGNORECASE)
-  # ������� ��������� ������ �������, ����������� ����� ��������
     result = re.sub(r'\s{2,}', ' ', result).strip()
     return result
 def trim_name(text, words_to_remove):
     """
-    ������� �� ������ ������ �� �����, ������� ��������� ��������� � ���������� ������ words_to_remove.
-    :param text: �������� ������.
-    :param words_to_remove: ������ ����, ������� ���������� �������.
-    :return: ���������� ������ � ��������� �������.
     """
-    # ������ ���������� ���������, ������� ���� ����� �� ��������� ���� ��� ��������� �����.
-    # ���������� re.escape, ����� ������������ ����������� � ������.
     pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
     #print(pattern)
-    # �������� ��������� ������ ����� �� ������ ������.
     new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
-    # ������� ������ �������, ����������� ����� �������� ����.
     new_text = re.sub(r'\s+', ' ', new_text).strip()
     return new_text
@@ -448,16 +448,16 @@ def  process_products(products):
 def fill_brands_in_dataframe(brands, df, col_name='new_brand', is_brand=True):
     """
-    ��������� ������� 'brand' � DataFrame ���������� ��������.
-    :param brands: ������ �������.
-    :param df: DataFrame � ��������� ['id', 'brand', 'name', ...].
-    :return: DataFrame � ���������� �������� 'brand'.
     """
-    # �������������� ������� ��� �������� ������ �������
     automaton = Automaton()
-    # ��������� ������ � �������
     for idx, brand in enumerate(brands):
         if isinstance(brand, str) and brand:
             automaton.add_word(brand.lower(), (idx, brand))
@@ -466,18 +466,18 @@ def fill_brands_in_dataframe(brands, df, col_name='new_brand', is_brand=True):
     def find_brand(name):
         """
-        ������� ������ ����� ��� ������� �����.
         """
         matched_brands = set()
         for _, (_, brand) in automaton.iter(name.lower()):
-            # ���������, ��� ����� ����������� ��� ��������� �����
             if re.search(rf'\b{re.escape(brand.lower())}\b', name.lower()):
                 matched_brands.add(brand)
-        # ���������� ����� � ������������ ������ (����� ������ ����������)
         return max(matched_brands, key=len) if matched_brands else None
-    # ��������� ������� brand ������ ��� ������ ��������
     # df['new_brand'] = df.apply(
     #     lambda row: find_brand(row['name']), #if pd.isna(row['brand']) else row['brand'],
     #     axis=1
@@ -519,31 +519,31 @@ def get_same_brands(products, items):
 def match_brands_improved(items_brands, prods_brands, threshold=85):
     """
-    ���������� �������� ������������� ������� � ������ ��������� ������ � ���������� ������.
-    :param items_brands: ������ ������� �� ���������� items.
-    :param prods_brands: ������ ������� �� ���������� prods.
-    :param threshold: ����� �������� ��� ��������� ������.
-    :return: ������� ������������ {����� �� items: ��������� ����� �� prods}.
     """
     brand_mapping = {}
     for item_brand in tqdm(items_brands):
         if isinstance(item_brand, str):
-            # ��������� ����� �� �����
             parts = [part.strip() for part in re.split(r"[\/\(\)]", item_brand) if part.strip()]
             best_match = None
             best_score = 0
             for part in parts:
                 match, score, _ = process.extractOne(part, prods_brands, scorer=fuzz.ratio)
-                # ���������� �� ����� ����� � ������
                 if score >= threshold and abs(len(part) - len(match)) / len(part) <= 0.3:
                     if score > best_score:
                         best_match = match
                         best_score = score
-            # ���������� ����������
             if best_match:
                 brand_mapping[item_brand] = best_match#, best_score)
@@ -552,14 +552,14 @@ def match_brands_improved(items_brands, prods_brands, threshold=85):
 def normalize(text):
     """
-    �������� ����� � ������� �������� � ��������������� ��� � ��������.
     """
     return unidecode(text.lower())
 def build_regex_for_brands(brands):
     """
-    ����������� ������ � ������ ���� ���������� ��������� ��� ������� ������.
-    ���������� ���������������� ������� � �������: ��������������� �������� -> ������������ ��������.
     """
     norm_to_brand = {}
     for brand in brands:
@@ -571,20 +571,20 @@ def build_regex_for_brands(brands):
 def process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold):
     """
-    ������������ ���� ������:
-      1. �������� ����� ����� ����� ���������� ���������.
-      2. ���� ������� ���������� ��� � ��������� ������ � ��������� �������� �����.
-    ���������� ������: (�������� ������, ��������� ����� ��� None).
     """
     norm_s = normalize(s)
-    # �������� ����� ����� ����� ���������� ���������
     match = regex_pattern.search(norm_s)
     if match:
         return s, norm_to_brand[match.group(0)]
-    # ���� ������� ���������� ���, ��������� ������ �� ������������ � ����������� �����
     parts = [part.strip() for part in re.split(r"[\/\(\)]", s) if part.strip()]
-    parts.append(s)  # ������ ���� ������
     best_match = None
     best_score = 0
     for part in parts:
@@ -603,17 +603,17 @@ def process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_br
 def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
     """
-    ����� ������� � ������� � ������ ��������� ��������� � ��������������.
-    ���������� ��������������� ����� ����� ���������� ��������� �, ��� �������������,
-    �������� �����. ��������� ����������� ����������� � ������������ ��������� � ������� pqdm.
-    :param strings: ������ ����� ��� ������ �������.
-    :param brands: ������ ������� ��� ������.
-    :param threshold: ����� �������� ��� ��������� ������.
-    :param n_jobs: ����� ������� ������� (��� ���������, ���� ������������ pqdm.processes).
-    :return: ������� ���� {������: ��������� �����}.
     """
-    # �������������� ������ ��������������� ������� � ������������� �������� � ������������� ��������.
     norm_brand_list = []
     index_to_brand = []
     for brand in brands:
@@ -621,14 +621,14 @@ def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
         norm_brand_list.append(norm_brand)
         index_to_brand.append(brand)
-    # ������� ��������������� ������� ��� ������� ������.
     regex_pattern, norm_to_brand = build_regex_for_brands(brands)
-    # ���������� ��������������� �������, ����������� ����������� ���������.
     def process_string_wrapper(s):
         return process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold)
-    # ������������ ������ ����������� � ������������ ���������.
     results = pqdm(strings, process_string_wrapper, n_jobs=n_jobs)
     brand_mapping = {}
@@ -640,34 +640,34 @@ def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
 def clean_wine_name(name):
     """
-    ������� � ����� ������ �������� ������� ����� (������������� �����), �� �������� � ������ ������ ����.
-    ��������, "�����   �" ����������� � "�����".
     """
-    # ���������� ��������� ����:
-    # \s+        � ���� ��� ��������� ���������� ��������;
-    # \b         � ������� �����;
-    # [A-Za-z�-ߨ�-��] � ����� ���� ����� (��������� ��� �������������);
-    # \b         � ������� �����;
-    # \s*$       � ����� ������� �� ����� ������.
-    return re.sub(r'\s+\b[A-Za-z�-ߨ�-��]\b\s*$', '', name)
 def most_common_words(strings, top_n=None):
     """
-    ���������� ������ �������� ����� ������������� ���� �� ������ �����.
-    ���������:
-    - strings: ������ �����
-    - top_n: ���������� �������� ����� ������������� ����, ������� ���������� �������.
-             ���� None, ������������ ��� �����, ��������������� �� �������.
-    ����������:
-    - ������ �������� (�����, �������)
     """
     all_words = []
     for s in tqdm(strings):
         s=str(s)
-        # ��������� �����, �������� �� � ������� �������� � ������� ����������
         words = re.findall(r'\w+', s.lower())
         all_words.extend(words)
@@ -681,12 +681,12 @@ def top_inserts_matching(other_brands, p_brands, items, th=65):
     for i in other_brands:
         l=i.split('/')
         if len(l)>2:
-            replaced[l[0].replace('����','')]=i
         else:
-            if '����' in i:
-                replaced[i.replace('����','')]=i
-    ob=[i.split('/')[0].replace('����','') for i in other_brands]
     rr60_ob=check_brands_in_strings_pqdm(ob, p_brands, threshold=th)
     result={}
@@ -704,7 +704,7 @@ def process_unbrended_names(items, p_brands, types, grape_varieties, onther_word
     for n in tqdm(items[items['new_brand'].isna()]['name'].values):
         name, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(n)
-        #name, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text('���� ����� ������� �/��. ���.0.75�')
         name=trim_name(name, types)
         name=trim_name(name, grape_varieties)
         name=trim_name(name, onther_words)
@@ -735,8 +735,8 @@ def process_unbrended_names(items, p_brands, types, grape_varieties, onther_word
 def find_full_word(text, word_list):
     """
-    ���� ������ ������ ��������� ����� �� word_list � ������ text.
-    ���������� ��������� ����� ��� None, ���� ���������� �� �������.
     """
     for word in word_list:
         pattern = r'\b' + re.escape(word) + r'\b'
@@ -780,7 +780,7 @@ def merge_wine_type(items, colors=None, color_merge_dict=None):
 def merge_types(items, products):
     alco_types=[i.strip().lower() for i in products['type'].unique()]
-    alco_types.append('����')
     result=[]
     for row in tqdm(items.iterrows()):
         try:
@@ -801,13 +801,13 @@ def merge_types(items, products):
             result.append(None)
     items['new_type']=result
-    items['new_type']=items['new_type'].replace({'����': '�����', None: 'unmatched'})
 def normalize_name(name):
     """
-    ����������� ������: ���� �������������� ������� ����, ��������������� � � ��������,
-    �������� � ������� ��������.
     """
     try:
         if detect_language(name) == 'ru':
@@ -818,13 +818,13 @@ def normalize_name(name):
 def prepare_groups_with_ids(items_df):
     """
-    ��������������� ����������� ������ �� items �� (new_brand, type, volume, new_type_wine, sour)
-    � ������ ���������������� ��������.
-    ��������� ������� 'norm_name', ����� ������������� �������� name ���� ��� �������.
-    :param items_df: DataFrame � ��������� 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
-    :return: ������� {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
     items_df['norm_name'] = items_df['name'].apply(normalize_name)
@@ -836,11 +836,11 @@ def prepare_groups_with_ids(items_df):
 def prepare_groups_by_alternative_keys(items_df):
     """
-    ����������� ������ �� items �� (new_type_wine, new_type, volume, sour) � ����������� id, new_brand,
-    ������������� � ���������������� �����.
-    :param items_df: DataFrame � ��������� 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
-    :return: ������� {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
     items_df['norm_name'] = items_df['name'].apply(normalize_name)
@@ -853,26 +853,26 @@ def prepare_groups_by_alternative_keys(items_df):
 def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
     """
-    ����� ���������� � ����������� id ��������� ������, ��������� ������� ��������������
-    ��������������� ������.
-    ������������ ��� �������:
-    - ������: ����� �� ������� (brand, type, volume, new_type_wine, sour);
-    - ������: ��� ��������� ��� ���������� ���� �� �������������� ������� (new_type_wine, new_type, volume, sour),
-      �������� ����� � �������� �������.
-    ��������� ������������ �� ������� norm_name, � ��� ������ ������������ ������������ name.
-    :param products_df: DataFrame � ��������� 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
-    :param items_groups: �������, �������������� �������� prepare_groups_with_ids.
-    :param items_df: DataFrame ������ � ��������� 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
-    :param name_threshold: ����� �������� ��� fuzzy matching.
-    :return: DataFrame � ������������ ��������� 'matched_items' (������ ����������) � 'alternative' (�������������� ����������).
     """
     results = []
-    no_match_products = []  # ������ ��� �������� ��������� ��� ���������� � �������� ������
-    # ������ ������: ����� �� ������� (brand, type, volume, new_type_wine, sour)
     for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
         product_brand = product['brand']
         product_type = product['type']
@@ -884,7 +884,7 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
         key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
         items_data = items_groups.get(key, [])
         if items_data:
-            # �������������: id, ������������ ���, ��������������� ���, volume, new_type_wine, sour
             items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour = zip(*items_data)
         else:
             items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour = ([], [], [], [], [], [])
@@ -911,13 +911,13 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
         results.append({
             'product_id': product['id'],
             'matched_items': matched_items,
-            'alternative': []  # ����������� �� ������ �������
         })
-    # ���������� �������������� ����������� �� (new_type_wine, new_type, volume, sour)
     groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
-    # ������ ������: ��� ��������� ��� ���������� ���� �� �������������� �������
     for idx, product in tqdm(no_match_products):
         product_brand = product['brand']
         product_type_wine = product['new_type_wine']
@@ -928,7 +928,7 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
         alt_key = (product_type_wine, product_type, product_volume, product_sour)
         type_items = groups_by_alternative_keys.get(alt_key, [])
-        # ���������, �������� ����� � �������� �������
         filtered_items = [item for item in type_items if item[1] != product_brand]
         if filtered_items:
             alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour = zip(*filtered_items)
@@ -960,8 +960,8 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
 def contains_full_word(word, text, case_sensitive=True):
     """
-    ���������, ���������� �� ����� word � ������ text ��� ��������� �����.
-    �������� case_sensitive �����, ��������� �� �������.
     """
     flags = 0 if case_sensitive else re.IGNORECASE
     pattern = r'\b' + re.escape(word) + r'\b'
@@ -978,7 +978,7 @@ def unwrap_brands(products):
         for j in new_brands:
             if contains_full_word(i, j, case_sensitive=False):
                 if i != j:
-                    #if len(i)>1:#i != '�' and i  != "�":
                         res[j]=i
     return res

def check_spark(row, col_name='name', types=('Игристое', 'игр')):
    """Tag a row as sparkling wine based on markers found in one of its cells.

    :param row: Mapping-like record exposing ``keys()`` and item access
        (presumably a pandas Series row -- confirm against callers).
    :param col_name: Key of the cell to inspect.
    :param types: Marker substrings; they are compared case-insensitively.
    :return: ``'Игристое'`` when the cell contains any marker and does not
        contain ``'пилигрим'``; ``None`` otherwise, including when
        ``col_name`` is absent or the cell is not a string.
    """
    if col_name not in row.keys():
        return None
    value = row[col_name]
    # Non-text cells (e.g. NaN) have no .lower(); treat them as "no match".
    if not isinstance(value, str):
        return None
    lowered = value.lower()
    # The short marker 'игр' is a substring of 'пилигрим', so names containing
    # that word are excluded. The needle must be lowercase because it is
    # compared against the lowercased cell.
    if 'пилигрим' in lowered:
        return None
    if any(marker.lower() in lowered for marker in types):
        return 'Игристое'
    return None
def check_color_and_sour(row, col_name='type_wine', types=('Белое', 'Розовое', 'Красное')):
    """Tag a row as wine when its cell mentions one of the given markers.

    :param row: Mapping-like record exposing ``keys()`` and item access
        (presumably a pandas Series row -- confirm against callers).
    :param col_name: Key of the cell to inspect.
    :param types: Marker substrings; they are compared case-insensitively.
    :return: ``'Вино'`` when the cell contains any marker; ``None`` otherwise,
        including when ``col_name`` is absent or the cell is not a string.
    """
    if col_name not in row.keys():
        return None
    value = row[col_name]
    # Non-text cells (e.g. NaN) have no .lower(); treat them as "no match".
    if not isinstance(value, str):
        return None
    # Lowercase the cell once instead of once per marker.
    lowered = value.lower()
    if any(marker.lower() in lowered for marker in types):
        return 'Вино'
    return None
 def is_type_exist(row, types):
     """
     Return the first entry of ``types`` that occurs in ``row['type']``.
     The comparison is a case-insensitive substring test.
     :param row: Record supporting ``row['type']``.
     :param types: Iterable of candidate type names, checked in order.
     :return: The matching entry exactly as it appears in ``types``, or None
         when nothing matches, ``types`` is empty, or the cell is not a string.
     """
     candidates = list(types)
     # With no candidates the row is never read.
     if not candidates:
         return None
     value = row['type']
     # Non-text cells (e.g. NaN) have no .lower(); treat them as "no match".
     if not isinstance(value, str):
         return None
     # Lowercase the cell once instead of once per candidate.
     lowered = value.lower()
     return next((t for t in candidates if t.lower() in lowered), None)
 def check_type(row, types):
     """
     Return the first entry of ``types`` that occurs in ``row['name']``.
     The comparison is a case-insensitive substring test.
     :param row: Record supporting ``row['name']``.
     :param types: Iterable of candidate type names, checked in order.
     :return: The matching entry exactly as it appears in ``types``, or None
         when nothing matches, ``types`` is empty, or the cell is not a string.
     """
     candidates = list(types)
     # With no candidates the row is never read.
     if not candidates:
         return None
     value = row['name']
     # Non-text cells (e.g. NaN) have no .lower(); treat them as "no match".
     if not isinstance(value, str):
         return None
     # Lowercase the cell once instead of once per candidate.
     lowered = value.lower()
     return next((t for t in candidates if t.lower() in lowered), None)
 def extract_years(text):
     """
     Find an age statement: a one- or two-digit number followed by 'лет' or 'years'.
     The unit is matched case-insensitively and returned as written in ``text``.
     :param text: String to search.
     :return: ``"<number> <unit>"`` joined by a single space (e.g. '50 лет',
         '21 years'), or None when no age statement is present.
     """
     age_pattern = r'\b(?<!\d)(\d{1,2})\s*(лет|years)\b'
     found = re.search(age_pattern, text, re.IGNORECASE)
     if found is None:
         return None
     number, unit = found.groups()
     return f"{number} {unit}"
 def extract_production_year(text):
     """
+    Извлекает год производства (четырехзначное число в диапазоне 1900–2099) из строки.
+    Например: '2019'.
     """
     match = re.search(r'\b(19\d{2}|20\d{2})\b', text)
     if match:
 def extract_alcohol_content(text):
     """
     Extract the first alcohol-by-volume figure from ``text``.
     A figure is a one- or two-digit number, optionally with a decimal part
     written with '.' or ',', followed by '%' (whitespace in between allowed).
     :param text: String to search.
     :return: The figure normalised to ``"<number>%"`` with a dot as decimal
         separator (e.g. '40%', '12.5%'), or None when none is found.
     """
     # (?<!\d) stops the match from starting inside a longer number, so
     # '100%' (as in "100% agave") is skipped instead of being read as '00%'.
     found = re.search(r'(?<!\d)(\d{1,2}(?:[.,]\d+)?)\s*%', text)
     if found is None:
         return None
     # Rebuild the value from the numeric group so that any whitespace before
     # '%' (not only plain spaces) is dropped.
     return found.group(1).replace(',', '.') + '%'
 def is_volume(value):
     """
+    Проверяет, является ли значение валидным объемом (<= 10 литров).
     """
     try:
         volume = float(value)
 def extract_volume_or_number(text):
     """
+    Извлекает объем в литрах или число с плавающей точкой из строки.
+    Например: '0,75л', '0.5', или '1,5 л'.
     """
+    # Попытка найти объем с буквой 'л' или без пробела перед ней
+    match_with_l = re.search(r'(\d+(?:[\.,]\d+)?\s*[лЛ]|(?:\d+(?:[\.,]\d+)?[лЛ]))', text)
     if match_with_l:
+        return is_volume(match_with_l.group(1).replace(',', '.').replace('л', '').replace('Л', '').strip())
+    # Если не найдено, ищем просто число с плавающей точкой
+    match_number = re.search(r'(?<!№)\b(\d{1,2}(?:[\.,]\d+))\b(?!\s*(№|-er|er|\d{3,}))', text)
     if match_number:
         return is_volume(match_number.group(1).replace(',', '.'))
 def get_sour(s):
     """
+    Извлекает из строки ключевое слово, если оно присутствует как отдельное слово.
+    Использует отрицательные просмотр назад/вперёд для проверки, что перед и после найденного
+    ключевого слова нет буквенно-цифровых символов.
     Args:
+        s (str): Исходная строка.
     Returns:
+        str or None: Найденное ключевое слово, если оно присутствует как отдельное слово, иначе None.
     """
+    # Список ключевых слов
     keywords = [
         r'brut',
         r'semi-sweet',
         r'sweet',
+        r'брют',
+        r'сухое',
+        r'полусухое',
+        r'полусладкое',
+        r'сладкое',
+        r'п/сух',
+        r'п/сл',
+        r'п/с',
+        r'сл',
+        r'сух'
     ]
+    # Собираем шаблон с использованием негативных просмотр назад и вперёд,
+    # чтобы убедиться, что совпадение не является частью более длинного слова.
+    # (?<!\w) - перед совпадением не должно быть символа [a-zA-Z0-9_]
+    # (?!\w)  - после совпадения не должно быть символа [a-zA-Z0-9_]
     pattern = re.compile(r'(?<!\w)(?:' + '|'.join(keywords) + r')(?!\w)', re.IGNORECASE)
     match = pattern.search(s)
 def get_color(s):
     """
+    Извлекает строки, содержащие упоминания о подарочной упаковке,
+    и возвращает их в виде словаря с индексами.
     Args:
+        strings (list): Список строк.
     Returns:
+        dict: Словарь, где ключи — индексы строк, а значения — строки с упоминаниями о подарочной упаковке.
     """
+    # Список ключевых слов и фраз для поиска
+    keywords = [r'красное',
+                r'белое',
+                r'розовое'
+                r'кр',
+                r'бел',
+                r'розе',
                 r'rosso',
                 r'roso',
                 r'roseto',
                 r'rosetto',
                 r'red',
                 r'white']
+    # Создаем шаблон регулярного выражения
     pattern = re.compile('|'.join(keywords), re.IGNORECASE)
     #gift_box_phrases={}
     #for idx, s in enumerate(strings):
 def get_GB(s):
     """
+    Извлекает строки, содержащие упоминания о подарочной упаковке,
+    и возвращает их в виде словаря с индексами.
     Args:
+        strings (list): Список строк.
     Returns:
+        dict: Словарь, где ключи — индексы строк, а значения — строки с упоминаниями о подарочной упаковке.
     """
+    # Список ключевых слов и фраз для поиска
     keywords = [r'cristal decanter in oak gift box',
                 r'in the carton gift box with 2 glasses',
                 r'decanter in the carton gift box',
                 r'in wood case'
                 r'in wood box',
                 r'in wood',
+                r'хрустальный декантер в подарочной упаковке из дуба',
+                r'декантер в подарочной упаковке из картона',
+                r'в подарочной упаковке из картона с 2 бокалами'
+                r'в подарочной упаковке из картона',
+                r'в подарочной упаковке из Дуба',
+                r'в П У графин и деревянная коробка',
+                r'в подарочной упаковке',
+                r'подарочная упаковка',
+                r'подарочный набор',
+                r'в деревянной коробке',
+                r'деревянная коробка',
+                r'в п/у+2 бокаланов',
+                r'в п/у из картона',
+                r'в п/у+бокал',
+                r'в п/у (дер.коробке)',
+                r'в п/у солома',
+                r'в п/у',
+                r'в п у',
+                r'п/уп',
+                r'п/у',
+                r'в тубе',
+                r'туба',
+                r'ПУ']
+    # Создаем шаблон регулярного выражения
     pattern = re.compile('|'.join(keywords), re.IGNORECASE)
     #gift_box_phrases={}
     #for idx, s in enumerate(strings):
     if volume_or_number is not None:
         volume_with_comma=str(volume_or_number).replace('.', ',')
         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
+        text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
     # else:
     #     volume_or_number=re_extract_volume(text)
     #     if volume_or_number is not None:
     #         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
     years = extract_years(text)
     if years is not None:
+        text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
     production_year = extract_production_year(text)
     if production_year is not None:
         text=text.replace(str(production_year), '')
 def remove_l(text):
     """
     Drop every stand-alone letter 'л' (either case) from ``text``.
     Runs of two or more whitespace characters are then collapsed to one
     space and the result is stripped.
     :param text: String to clean.
     :return: The cleaned string.
     """
     lone_unit = re.compile(r'\bл\b', flags=re.IGNORECASE)
     whitespace_run = re.compile(r'\s{2,}')
     without_unit = lone_unit.sub('', text)
     return whitespace_run.sub(' ', without_unit).strip()
 def trim_name(text, words_to_remove):
     """
     Remove every whole-word occurrence of the given words/phrases from ``text``.
     Matching is case-insensitive; afterwards runs of whitespace are collapsed
     to single spaces and the result is stripped.
     :param text: Source string.
     :param words_to_remove: Iterable of words or phrases to delete. Entries
         that are empty or not strings are ignored.
     :return: The cleaned string.
     """
     # Try longer phrases first: regex alternation takes the first branch that
     # matches, so 'вино' listed before 'вино игристое' would otherwise leave
     # 'игристое' behind.
     candidates = sorted(
         {word for word in words_to_remove if isinstance(word, str) and word},
         key=lambda word: (-len(word), word),
     )
     if candidates:
         # re.escape keeps punctuation inside the entries literal.
         alternation = '|'.join(re.escape(word) for word in candidates)
         # Lookarounds instead of \b: \b never matches between punctuation and
         # whitespace, so entries such as 'V.S.O.P.' could not be removed.
         pattern = r'(?<!\w)(?:' + alternation + r')(?!\w)'
         text = re.sub(pattern, '', text, flags=re.IGNORECASE)
     # Collapse the gaps left behind by the removals.
     return re.sub(r'\s+', ' ', text).strip()
 def fill_brands_in_dataframe(brands, df, col_name='new_brand', is_brand=True):
     """
+    Заполняет колонку 'brand' в DataFrame найденными брендами.
+    :param brands: Список брендов.
+    :param df: DataFrame с колонками ['id', 'brand', 'name', ...].
+    :return: DataFrame с обновлённой колонкой 'brand'.
     """
+    # Инициализируем автомат для быстрого поиска брендов
     automaton = Automaton()
+    # Добавляем бренды в автомат
     for idx, brand in enumerate(brands):
         if isinstance(brand, str) and brand:
             automaton.add_word(brand.lower(), (idx, brand))
     def find_brand(name):
         """
+        Находит лучший бренд для данного имени.
         """
         matched_brands = set()
         for _, (_, brand) in automaton.iter(name.lower()):
+            # Проверяем, что бренд встречается как отдельное слово
             if re.search(rf'\b{re.escape(brand.lower())}\b', name.lower()):
                 matched_brands.add(brand)
+        # Возвращаем бренд с максимальной длиной (более точное совпадение)
         return max(matched_brands, key=len) if matched_brands else None
+    # Обновляем колонку brand только для пустых значений
     # df['new_brand'] = df.apply(
     #     lambda row: find_brand(row['name']), #if pd.isna(row['brand']) else row['brand'],
     #     axis=1
 def match_brands_improved(items_brands, prods_brands, threshold=85):
     """
+    Улучшенный алгоритм сопоставления брендов с учётом нечёткого поиска и фильтрации ошибок.
+    :param items_brands: Список брендов из датафрейма items.
+    :param prods_brands: Список брендов из датафрейма prods.
+    :param threshold: Порог сходства для нечёткого поиска.
+    :return: Словарь соответствий {бренд из items: ближайший бренд из prods}.
     """
     brand_mapping = {}
     for item_brand in tqdm(items_brands):
         if isinstance(item_brand, str):
+            # Разделяем бренд на части
             parts = [part.strip() for part in re.split(r"[\/\(\)]", item_brand) if part.strip()]
             best_match = None
             best_score = 0
             for part in parts:
                 match, score, _ = process.extractOne(part, prods_brands, scorer=fuzz.ratio)
+                # Фильтрация по длине строк и порогу
                 if score >= threshold and abs(len(part) - len(match)) / len(part) <= 0.3:
                     if score > best_score:
                         best_match = match
                         best_score = score
+            # Сохранение результата
             if best_match:
                 brand_mapping[item_brand] = best_match#, best_score)
 def normalize(text):
     """Lower-case ``text`` and transliterate it to Latin with ``unidecode``.

     :param text: Source string.
     :return: The result of ``unidecode`` applied to the lower-cased string.
     """
     lowered = text.lower()
     return unidecode(lowered)
 def build_regex_for_brands(brands):
     """
+    Нормализует бренды и создаёт одно регулярное выражение для точного поиска.
+    Возвращает скомпилированный паттерн и словарь: нормализованное название -> оригинальное название.
     """
     norm_to_brand = {}
     for brand in brands:
 def process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold):
     """
+    Обрабатывает одну строку:
+      1. Пытается найти бренд через регулярное выражение.
+      2. Если точного совпадения нет – разбивает строку и выполняет нечёткий поиск.
+    Возвращает кортеж: (исходная строка, найденный бренд или None).
     """
     norm_s = normalize(s)
+    # Пытаемся найти бренд через регулярное выражение
     match = regex_pattern.search(norm_s)
     if match:
         return s, norm_to_brand[match.group(0)]
+    # Если точного совпадения нет, разбиваем строку по разделителям и анализируем части
     parts = [part.strip() for part in re.split(r"[\/\(\)]", s) if part.strip()]
+    parts.append(s)  # анализ всей строки
     best_match = None
     best_score = 0
     for part in parts:
 def check_brands_in_strings_pqdm(strings, brands, threshold=85, n_jobs=8):
     """
+    Поиск брендов в строках с учетом вариантов написания и транслитерации.
+    Использует предварительный поиск через регулярное выражение и, при необходимости,
+    нечёткий поиск. Обработка выполняется параллельно с отображением прогресса с помощью pqdm.
+    :param strings: Список строк для поиска брендов.
+    :param brands: Список брендов для поиска.
+    :param threshold: Порог сходства для нечёткого поиска.
+    :param n_jobs: Число рабочих потоков (или процессов, если использовать pqdm.processes).
+    :return: Словарь вида {строка: найденный бренд}.
     """
+    # Подготавливаем список нормализованных брендов и сопоставление индексов с оригинальными брендами.
     norm_brand_list = []
     index_to_brand = []
     for brand in brands:
         norm_brand_list.append(norm_brand)
         index_to_brand.append(brand)
+    # Создаем комбинированный паттерн для точного поиска.
     regex_pattern, norm_to_brand = build_regex_for_brands(brands)
+    # Определяем вспомогательную функцию, закрывающую необходимые параметры.
     def process_string_wrapper(s):
         return process_string(s, regex_pattern, norm_to_brand, norm_brand_list, index_to_brand, threshold)
+    # Обрабатываем строки параллельно с отображением прогресса.
     results = pqdm(strings, process_string_wrapper, n_jobs=n_jobs)
     brand_mapping = {}
 def clean_wine_name(name):
     """Drop one stand-alone letter left dangling at the end of ``name``.

     A single Latin or Cyrillic letter that forms its own word at the very end
     of the string is removed together with the whitespace around it; letters
     that belong to a longer word are kept.
     Example: "токай   л" becomes "токай".

     :param name: Source string.
     :return: ``name`` without the trailing one-letter word, or ``name``
         unchanged when there is none.
     """
     # Pattern, piece by piece:
     #   \s+                 one or more whitespace characters
     #   \b                  word boundary
     #   [A-Za-zА-ЯЁа-яё]    exactly one Latin or Cyrillic letter
     #   \b                  word boundary
     #   \s*$                optional whitespace up to the end of the string
     trailing_letter = re.compile(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$')
     return trailing_letter.sub('', name)
 def most_common_words(strings, top_n=None):
     """
+    Возвращает список наиболее часто повторяющихся слов из списка строк.
+    Параметры:
+    - strings: список строк
+    - top_n: количество наиболее часто встречающихся слов, которые необходимо вернуть.
+             Если None, возвращаются все слова, отсортированные по частоте.
+    Возвращает:
+    - Список кортежей (слово, частота)
     """
     all_words = []
     for s in tqdm(strings):
         s=str(s)
+        # Извлекаем слова, приводим их к нижнему регистру и удаляем пунктуацию
         words = re.findall(r'\w+', s.lower())
         all_words.extend(words)
     for i in other_brands:
         l=i.split('/')
         if len(l)>2:
+            replaced[l[0].replace('Шато','')]=i
         else:
+            if 'Шато' in i:
+                replaced[i.replace('Шато','')]=i
+    ob=[i.split('/')[0].replace('Шато','') for i in other_brands]
     rr60_ob=check_brands_in_strings_pqdm(ob, p_brands, threshold=th)
     result={}
     for n in tqdm(items[items['new_brand'].isna()]['name'].values):
         name, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(n)
+        #name, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text('Вино Токай Фурминт п/сл. бел.0.75л')
         name=trim_name(name, types)
         name=trim_name(name, grape_varieties)
         name=trim_name(name, onther_words)
 def find_full_word(text, word_list):
     """
+    Ищет первое полное вхождение слова из word_list в строке text.
+    Возвращает найденное слово или None, если совпадение не найдено.
     """
     for word in word_list:
         pattern = r'\b' + re.escape(word) + r'\b'
 def merge_types(items, products):
     alco_types=[i.strip().lower() for i in products['type'].unique()]
+    alco_types.append('ликёр')
     result=[]
     for row in tqdm(items.iterrows()):
         try:
             result.append(None)
     items['new_type']=result
+    items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
 def normalize_name(name):
     """
+    Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
+    приводит к нижнему регистру.
     """
     try:
         if detect_language(name) == 'ru':
 def prepare_groups_with_ids(items_df):
     """
+    Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
+    с учетом нормализованного названия.
+    Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
+    :param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
+    :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
     items_df['norm_name'] = items_df['name'].apply(normalize_name)
 def prepare_groups_by_alternative_keys(items_df):
     """
+    Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
+    оригинального и нормализованного имени.
+    :param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
+    :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
     """
     items_df = items_df.copy()
     items_df['norm_name'] = items_df['name'].apply(normalize_name)
 def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
     """
+    Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
+    нормализованные группы.
+    Производится два прохода:
+    - Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
+    - Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
+      исключая итемы с исходным брендом.
+    Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
+    :param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
+    :param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
+    :param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
+    :param name_threshold: Порог сходства для fuzzy matching.
+    :return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
     """
     results = []
+    no_match_products = []  # Список для хранения продуктов без совпадения в исходной группе
+    # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
     for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
         product_brand = product['brand']
         product_type = product['type']
         key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
         items_data = items_groups.get(key, [])
         if items_data:
+            # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
             items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour = zip(*items_data)
         else:
             items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour = ([], [], [], [], [], [])
         results.append({
             'product_id': product['id'],
             'matched_items': matched_items,
+            'alternative': []  # Заполняется во втором проходе
         })
+    # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
     groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
+    # Второй проход: для продуктов без совпадений ищем по альтернативным группам
     for idx, product in tqdm(no_match_products):
         product_brand = product['brand']
         product_type_wine = product['new_type_wine']
         alt_key = (product_type_wine, product_type, product_volume, product_sour)
         type_items = groups_by_alternative_keys.get(alt_key, [])
+        # Фильтруем, исключая итемы с исходным брендом
         filtered_items = [item for item in type_items if item[1] != product_brand]
         if filtered_items:
             alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour = zip(*filtered_items)
 def contains_full_word(word, text, case_sensitive=True):
     """
+    Проверяет, содержится ли слово word в строке text как отдельное слово.
+    Параметр case_sensitive задаёт, учитывать ли регистр.
     """
     flags = 0 if case_sensitive else re.IGNORECASE
     pattern = r'\b' + re.escape(word) + r'\b'
         for j in new_brands:
             if contains_full_word(i, j, case_sensitive=False):
                 if i != j:
+                    #if len(i)>1:#i != 'А' and i  != "Я":
                         res[j]=i
     return res