Spaces:
Sleeping
Sleeping
Upload 20 files
#3
by
Gainward777 - opened
- .gitattributes +35 -35
- README.md +12 -12
- api.py +189 -0
- app.py +29 -31
- preprocess/preprocess.py +224 -224
- preprocess/utils/common/utils.py +149 -137
- processor/matching.py +158 -158
- processor/processor.py +28 -32
- requirements.txt +6 -6
- search/search_by_id.py +52 -23
- tmp/prod.csv +1 -1
- tmp/service/prod.csv +1 -1
- tmp/utils.py +48 -37
- ui/gradio_ui.py +169 -120
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Product Matching
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.19.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Product Matching
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.19.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
api.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import datetime
|
| 4 |
+
|
| 5 |
+
from processor.processor import Processor
|
| 6 |
+
from constants.constants import *
|
| 7 |
+
from search.search_by_id import Searcher
|
| 8 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 9 |
+
import uvicorn
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from tmp.utils import update_products_csv
|
| 13 |
+
|
| 14 |
+
processor=Processor(LONG_TYPES_LIST,
|
| 15 |
+
SHORT_TYPES_LIST,
|
| 16 |
+
SOUR,
|
| 17 |
+
WINE_TYPES,
|
| 18 |
+
GBS,
|
| 19 |
+
COLORS_FOR_TRIM,
|
| 20 |
+
GRAPES,
|
| 21 |
+
OTHER_WORDS,
|
| 22 |
+
SOUR_MERGE_DICT,
|
| 23 |
+
TYPES_WINES_DICT,
|
| 24 |
+
COLOR_MERGE_DICT)
|
| 25 |
+
|
| 26 |
+
searcher=Searcher()
|
| 27 |
+
|
| 28 |
+
class item_by_id(BaseModel):
|
| 29 |
+
result_file: str
|
| 30 |
+
id: str
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class match_request(BaseModel):
|
| 34 |
+
items: str
|
| 35 |
+
threshold: int
|
| 36 |
+
items_first: int
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_data_dir():
|
| 40 |
+
return "/home/user/app/_data/"
|
| 41 |
+
|
| 42 |
+
def get_products_dir():
|
| 43 |
+
return os.path.join(get_data_dir(), "products")
|
| 44 |
+
|
| 45 |
+
def get_items_dir():
|
| 46 |
+
return os.path.join(get_data_dir(), "items")
|
| 47 |
+
|
| 48 |
+
def get_results_dir():
|
| 49 |
+
return os.path.join(get_data_dir(), "results")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
app = FastAPI()
|
| 53 |
+
|
| 54 |
+
@app.get("/api/get_result_csv")
|
| 55 |
+
async def get_result_csv():
|
| 56 |
+
results = []
|
| 57 |
+
for file in os.listdir(get_results_dir()):
|
| 58 |
+
if file.endswith(".csv"):
|
| 59 |
+
results.append(file)
|
| 60 |
+
|
| 61 |
+
results_json = json.dumps(results)
|
| 62 |
+
return results_json
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@app.post("/api/upload_result_csv")
|
| 66 |
+
async def upload_result_csv(file: UploadFile = File(...)):
|
| 67 |
+
try:
|
| 68 |
+
contents = file.file.read()
|
| 69 |
+
|
| 70 |
+
with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
|
| 71 |
+
f.write(contents)
|
| 72 |
+
except Exception:
|
| 73 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 74 |
+
finally:
|
| 75 |
+
file.file.close()
|
| 76 |
+
|
| 77 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@app.post("/api/upload_products_csv")
|
| 81 |
+
async def upload_products_csv(file: UploadFile, overwrite_existing: int):
|
| 82 |
+
try:
|
| 83 |
+
datadir = get_products_dir()
|
| 84 |
+
if not os.path.exists(datadir):
|
| 85 |
+
os.makedirs(datadir)
|
| 86 |
+
|
| 87 |
+
tempfile = os.path.join(datadir, "products.csv_upload")
|
| 88 |
+
|
| 89 |
+
contents = file.file.read()
|
| 90 |
+
|
| 91 |
+
with open(tempfile, 'wb') as f:
|
| 92 |
+
f.write(contents)
|
| 93 |
+
|
| 94 |
+
fullfn = os.path.join(datadir, "products.csv")
|
| 95 |
+
update_products_csv(tempfile, fullfn, overwrite_existing)
|
| 96 |
+
|
| 97 |
+
except Exception:
|
| 98 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 99 |
+
finally:
|
| 100 |
+
file.file.close()
|
| 101 |
+
|
| 102 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.post("/api/upload_items_csv")
|
| 106 |
+
async def upload_items_csv(file: UploadFile = File(...)):
|
| 107 |
+
try:
|
| 108 |
+
itemsdir = get_items_dir()
|
| 109 |
+
|
| 110 |
+
if not os.path.exists(itemsdir):
|
| 111 |
+
os.makedirs(itemsdir)
|
| 112 |
+
|
| 113 |
+
contents = file.file.read()
|
| 114 |
+
|
| 115 |
+
with open(os.path.join(itemsdir, file.filename), 'wb') as f:
|
| 116 |
+
f.write(contents)
|
| 117 |
+
except Exception:
|
| 118 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 119 |
+
finally:
|
| 120 |
+
file.file.close()
|
| 121 |
+
|
| 122 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@app.get("/api/get_items_csv")
|
| 126 |
+
async def get_items_csv():
|
| 127 |
+
itemsdir = get_items_dir()
|
| 128 |
+
|
| 129 |
+
results = []
|
| 130 |
+
for file in os.listdir(itemsdir):
|
| 131 |
+
if file.endswith(".csv"):
|
| 132 |
+
results.append(file)
|
| 133 |
+
|
| 134 |
+
results_json = json.dumps(results)
|
| 135 |
+
return results_json
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@app.post("/api/match")
|
| 139 |
+
async def match(r: match_request):
|
| 140 |
+
prods_file = os.path.join(get_products_dir(), "products.csv")
|
| 141 |
+
if not os.path.isfile(prods_file):
|
| 142 |
+
return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
|
| 143 |
+
|
| 144 |
+
if len(r.items) == 0:
|
| 145 |
+
return {"Status": "Error", "ErrorDesc": "Items file not specified"}
|
| 146 |
+
|
| 147 |
+
if not r.threshold:
|
| 148 |
+
r.threshold = 50
|
| 149 |
+
|
| 150 |
+
items_fn = os.path.join(get_items_dir(), r.items)
|
| 151 |
+
if not os.path.isfile(items_fn):
|
| 152 |
+
return {"Status": "Error", "ErrorDesc": "Items file not found"}
|
| 153 |
+
|
| 154 |
+
row_items = pd.read_csv(items_fn, sep='\t')
|
| 155 |
+
row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
df, items, products = processor.process(row_products, row_items, r.items_first, r.threshold)
|
| 159 |
+
|
| 160 |
+
results_dir = get_results_dir()
|
| 161 |
+
if not os.path.exists(results_dir):
|
| 162 |
+
os.makedirs(results_dir)
|
| 163 |
+
|
| 164 |
+
output_csv = "m1-" + str(r.threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
|
| 165 |
+
df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
|
| 166 |
+
|
| 167 |
+
return {"Status": "Success", "result_file" : output_csv}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@app.get("/api/get_matched_by_id")
|
| 171 |
+
async def get_matched_by_id(item: item_by_id):
|
| 172 |
+
fullfn = os.path.join(get_results_dir(), item.result_file)
|
| 173 |
+
if not os.path.isfile(fullfn):
|
| 174 |
+
return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}
|
| 175 |
+
|
| 176 |
+
(df, is_alternative) = searcher.search(fullfn, int(item.id))
|
| 177 |
+
if df.empty:
|
| 178 |
+
return {"Status": "Success", "IsAlternative": False, "Data": ""}
|
| 179 |
+
|
| 180 |
+
return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
uvicorn.run(
|
| 185 |
+
app,
|
| 186 |
+
host="0.0.0.0",
|
| 187 |
+
port=8000,
|
| 188 |
+
log_level="debug"
|
| 189 |
+
)
|
app.py
CHANGED
|
@@ -1,31 +1,29 @@
|
|
| 1 |
-
from processor.processor import Processor
|
| 2 |
-
from constants.constants import *
|
| 3 |
-
from ui.gradio_ui import GradioUI
|
| 4 |
-
from search.search_by_id import Searcher
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 1 |
+
from processor.processor import Processor
|
| 2 |
+
from constants.constants import *
|
| 3 |
+
from ui.gradio_ui import GradioUI
|
| 4 |
+
from search.search_by_id import Searcher
|
| 5 |
+
|
| 6 |
+
processor=Processor(LONG_TYPES_LIST,
|
| 7 |
+
SHORT_TYPES_LIST,
|
| 8 |
+
SOUR,
|
| 9 |
+
WINE_TYPES,
|
| 10 |
+
GBS,
|
| 11 |
+
COLORS_FOR_TRIM,
|
| 12 |
+
GRAPES,
|
| 13 |
+
OTHER_WORDS,
|
| 14 |
+
SOUR_MERGE_DICT,
|
| 15 |
+
TYPES_WINES_DICT,
|
| 16 |
+
COLOR_MERGE_DICT)
|
| 17 |
+
|
| 18 |
+
searcher=Searcher()
|
| 19 |
+
|
| 20 |
+
ui=GradioUI(processor, searcher, "/home/user/app/_data/")
|
| 21 |
+
ui.run_ui()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
|
|
|
|
|
preprocess/preprocess.py
CHANGED
|
@@ -1,224 +1,224 @@
|
|
| 1 |
-
import json
|
| 2 |
-
from tqdm import tqdm
|
| 3 |
-
from preprocess.utils.items.attrs import *
|
| 4 |
-
from preprocess.utils.common.extracters import *
|
| 5 |
-
from preprocess.utils.common.brand_matching import *
|
| 6 |
-
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
-
from preprocess.utils.common.utils import *
|
| 8 |
-
from preprocess.utils.common.top_inserts import *
|
| 9 |
-
import pandas as pd
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class Preprocessor():
|
| 14 |
-
|
| 15 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
-
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 18 |
-
|
| 19 |
-
self.long_types_list=long_types_list
|
| 20 |
-
self.short_types_list=short_types_list
|
| 21 |
-
self.sour=sour_list
|
| 22 |
-
self.type_wine=type_wine
|
| 23 |
-
self.gbs=gbs
|
| 24 |
-
self.colors_ft=colors_for_trim
|
| 25 |
-
self.grapes=grapes
|
| 26 |
-
self.other_words=other_words
|
| 27 |
-
self.types_n_others=long_types_list+other_words
|
| 28 |
-
self.sour_dict=sour_merge_dict
|
| 29 |
-
self.type_dict=type_merge_dict
|
| 30 |
-
self.color_merge_dict=color_merge_dict
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def process_items(self, df):
|
| 34 |
-
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 35 |
-
#counter=0
|
| 36 |
-
for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
|
| 37 |
-
|
| 38 |
-
try:
|
| 39 |
-
i=json.loads(i)
|
| 40 |
-
result['id'].append(idf)
|
| 41 |
-
if 'brand' in i.keys():
|
| 42 |
-
result['brand'].append(i['brand'])
|
| 43 |
-
else: result['brand'].append(None)
|
| 44 |
-
result['name'].append(i['name'])
|
| 45 |
-
drink_type=get_type(i, self.long_types_list)
|
| 46 |
-
if drink_type is None:
|
| 47 |
-
drink_type=check_spark(i)
|
| 48 |
-
if drink_type is None:
|
| 49 |
-
drink_type=check_color_and_sour(i)
|
| 50 |
-
if drink_type is None:
|
| 51 |
-
drink_type=check_spark(i, col_name='type_wine')
|
| 52 |
-
if drink_type is None:
|
| 53 |
-
drink_type=check_color_and_sour(i, types=self.sour)
|
| 54 |
-
#if 'type' in i.keys():
|
| 55 |
-
result['type'].append(drink_type)#i['type'])
|
| 56 |
-
#else: dd['type'].append(None)
|
| 57 |
-
if 'volume' in i.keys():
|
| 58 |
-
result['volume'].append(i['volume'])
|
| 59 |
-
else:
|
| 60 |
-
vol=extract_volume_or_number(i['name'])
|
| 61 |
-
result['volume'].append(vol)
|
| 62 |
-
if 'year' in i.keys():
|
| 63 |
-
result['year'].append(i['year'])
|
| 64 |
-
else:
|
| 65 |
-
year=extract_production_year(i['name'])
|
| 66 |
-
result['year'].append(year)
|
| 67 |
-
alco=extract_alcohol_content(i['name'])
|
| 68 |
-
if 'type_wine' in i.keys():
|
| 69 |
-
result['type_wine'].append(i['type_wine'])
|
| 70 |
-
else: result['type_wine'].append(None)
|
| 71 |
-
#f alco is not None:
|
| 72 |
-
result['alco'].append(alco)
|
| 73 |
-
#else: dd['type_wine'].append(None)
|
| 74 |
-
except Exception as ex:
|
| 75 |
-
print(idf, ex)
|
| 76 |
-
return pd.DataFrame(result)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def process_products(self, products):
|
| 80 |
-
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 81 |
-
for idx, row in tqdm(products.iterrows()):
|
| 82 |
-
try:
|
| 83 |
-
result['id'].append(row['id'])
|
| 84 |
-
result['brand'].append(row['brand'])
|
| 85 |
-
result['type_wine'].append(row['category'])
|
| 86 |
-
result['type'].append(row['product_type'])
|
| 87 |
-
result['name'].append(row['name_long'])
|
| 88 |
-
vol=extract_volume_or_number(row['name'])
|
| 89 |
-
result['volume'].append(vol)
|
| 90 |
-
#year=extract_production_year(row['name'])
|
| 91 |
-
year=extract_production_year(str(row['name_postfix']))
|
| 92 |
-
result['year'].append(year)
|
| 93 |
-
#rr['year'].append(row['name_postfix'])
|
| 94 |
-
alco=extract_alcohol_content(row['name'])
|
| 95 |
-
#f alco is not None:
|
| 96 |
-
result['alco'].append(alco)
|
| 97 |
-
except Exception as ex:
|
| 98 |
-
print(ex)
|
| 99 |
-
return pd.DataFrame(result)
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
def prcess_text(self, text):
|
| 103 |
-
#text=''+origin
|
| 104 |
-
#text=str(split_russian_and_english(text))
|
| 105 |
-
gb=find_full_word(text, self.gbs)#get_GB(text)
|
| 106 |
-
if gb is not None:
|
| 107 |
-
text=text.replace(str(gb), '')
|
| 108 |
-
|
| 109 |
-
alcohol = extract_alcohol_content(text)
|
| 110 |
-
if alcohol is not None:
|
| 111 |
-
alco_w_comma=alcohol.replace('.', ',')
|
| 112 |
-
text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
|
| 113 |
-
volume_or_number = extract_volume_or_number(text)
|
| 114 |
-
if volume_or_number is not None:
|
| 115 |
-
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 116 |
-
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 117 |
-
test=clean_wine_name(text) #remove_l(text)
|
| 118 |
-
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 119 |
-
# else:
|
| 120 |
-
# volume_or_number=re_extract_volume(text)
|
| 121 |
-
# if volume_or_number is not None:
|
| 122 |
-
# volume_with_comma=volume_or_number.replace('.', ',')
|
| 123 |
-
# text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 124 |
-
years = extract_years(text)
|
| 125 |
-
if years is not None:
|
| 126 |
-
text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
|
| 127 |
-
production_year = extract_production_year(text)
|
| 128 |
-
if production_year is not None:
|
| 129 |
-
text=text.replace(str(production_year), '')
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
color=find_full_word(text, self.colors_ft)
|
| 133 |
-
if color is not None:
|
| 134 |
-
text=text.replace(str(color), '')
|
| 135 |
-
sour=find_full_word(text, self.sour) #get_sour(text)
|
| 136 |
-
if sour is not None:
|
| 137 |
-
text=text.replace(str(sour), '')
|
| 138 |
-
# re_extracted_volume=re_extract_volume(text)
|
| 139 |
-
# if re_extracted_volume is not None:
|
| 140 |
-
# volume_with_comma=re_extracted_volume.replace('.', ',')
|
| 141 |
-
# text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
|
| 142 |
-
|
| 143 |
-
# else:
|
| 144 |
-
# re_extracted_volume=re_extract_volume(str(volume_or_number))
|
| 145 |
-
# volume_or_number=re_extracted_volume
|
| 146 |
-
|
| 147 |
-
return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def process(self, products, items):
|
| 151 |
-
|
| 152 |
-
print('------*-----Prepare items catalogue-----*-----')
|
| 153 |
-
items=self.process_items(items.copy())
|
| 154 |
-
print('-----*-----Prepare products catalogue-----*-----')
|
| 155 |
-
products=self.process_products(products.copy())
|
| 156 |
-
|
| 157 |
-
items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
|
| 158 |
-
products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
|
| 159 |
-
|
| 160 |
-
print('-----*-----Split n match-----*-----')
|
| 161 |
-
splited=split_n_match(products, items)
|
| 162 |
-
items["brand"] = items["brand"].replace(splited)
|
| 163 |
-
|
| 164 |
-
print('-----*-----Fill brands in items-----*-----')
|
| 165 |
-
fill_brands_in_dataframe(products['brand'].unique(), items)
|
| 166 |
-
|
| 167 |
-
print('-----*-----Brand matching-----*-----')
|
| 168 |
-
comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
|
| 169 |
-
out_prods=list(set(prod_brand_list)-set(comp_list))
|
| 170 |
-
out_items=list(set(items_brand_list)-set(comp_list))
|
| 171 |
-
brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
|
| 172 |
-
items["new_brand"] = items["new_brand"].replace(brand_map_improved)
|
| 173 |
-
|
| 174 |
-
items['type']=items['type'].replace(self.type_dict)
|
| 175 |
-
|
| 176 |
-
print('-----*-----Unwrap brend cats step 1-----*-----')
|
| 177 |
-
unwrap_b_match=unwrap_brands(products)
|
| 178 |
-
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 179 |
-
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 180 |
-
|
| 181 |
-
print('-----*-----Unwrap brend cats step 2-----*-----')
|
| 182 |
-
unwrap_b_match=unwrap_brands(products)
|
| 183 |
-
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 184 |
-
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 185 |
-
|
| 186 |
-
print('-----*-----Finding brands in names-----*-----')
|
| 187 |
-
items['new_brand']=items['new_brand'].replace('none', None)
|
| 188 |
-
i_brands=items[items['new_brand'].isna()]['name'].values
|
| 189 |
-
p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
|
| 190 |
-
new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
|
| 191 |
-
items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
|
| 192 |
-
|
| 193 |
-
print('-----*-----Top inserts-----*-----')
|
| 194 |
-
process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
|
| 195 |
-
self.grapes, self.other_words)
|
| 196 |
-
|
| 197 |
-
print('-----*-----Adding service categories-----*-----')
|
| 198 |
-
merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 199 |
-
merge_types(items, products)
|
| 200 |
-
merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 201 |
-
merge_types(products, products)
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
print('-----*-----Name trimming-----*-----')
|
| 205 |
-
item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
|
| 206 |
-
#items['name']=items['id'].replace(item_timed_names)
|
| 207 |
-
items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
|
| 208 |
-
items['gb']=gb
|
| 209 |
-
items['sour']=sour
|
| 210 |
-
items['sour']=items['sour'].replace(self.sour_dict)
|
| 211 |
-
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 212 |
-
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 213 |
-
products['gb']=gb
|
| 214 |
-
products['sour']=sour
|
| 215 |
-
products['sour']=products['sour'].replace(self.sour_dict)
|
| 216 |
-
|
| 217 |
-
print('-----*-----Replacing product types-----*-----')
|
| 218 |
-
products['type']=products['type'].replace(self.type_dict)
|
| 219 |
-
|
| 220 |
-
return items, products
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
from preprocess.utils.items.attrs import *
|
| 4 |
+
from preprocess.utils.common.extracters import *
|
| 5 |
+
from preprocess.utils.common.brand_matching import *
|
| 6 |
+
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
+
from preprocess.utils.common.utils import *
|
| 8 |
+
from preprocess.utils.common.top_inserts import *
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Preprocessor():
|
| 14 |
+
|
| 15 |
+
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
+
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
+
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 18 |
+
|
| 19 |
+
self.long_types_list=long_types_list
|
| 20 |
+
self.short_types_list=short_types_list
|
| 21 |
+
self.sour=sour_list
|
| 22 |
+
self.type_wine=type_wine
|
| 23 |
+
self.gbs=gbs
|
| 24 |
+
self.colors_ft=colors_for_trim
|
| 25 |
+
self.grapes=grapes
|
| 26 |
+
self.other_words=other_words
|
| 27 |
+
self.types_n_others=long_types_list+other_words
|
| 28 |
+
self.sour_dict=sour_merge_dict
|
| 29 |
+
self.type_dict=type_merge_dict
|
| 30 |
+
self.color_merge_dict=color_merge_dict
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def process_items(self, df):
|
| 34 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 35 |
+
#counter=0
|
| 36 |
+
for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
i=json.loads(i)
|
| 40 |
+
result['id'].append(idf)
|
| 41 |
+
if 'brand' in i.keys():
|
| 42 |
+
result['brand'].append(i['brand'])
|
| 43 |
+
else: result['brand'].append(None)
|
| 44 |
+
result['name'].append(i['name'])
|
| 45 |
+
drink_type=get_type(i, self.long_types_list)
|
| 46 |
+
if drink_type is None:
|
| 47 |
+
drink_type=check_spark(i)
|
| 48 |
+
if drink_type is None:
|
| 49 |
+
drink_type=check_color_and_sour(i)
|
| 50 |
+
if drink_type is None:
|
| 51 |
+
drink_type=check_spark(i, col_name='type_wine')
|
| 52 |
+
if drink_type is None:
|
| 53 |
+
drink_type=check_color_and_sour(i, types=self.sour)
|
| 54 |
+
#if 'type' in i.keys():
|
| 55 |
+
result['type'].append(drink_type)#i['type'])
|
| 56 |
+
#else: dd['type'].append(None)
|
| 57 |
+
if 'volume' in i.keys():
|
| 58 |
+
result['volume'].append(i['volume'])
|
| 59 |
+
else:
|
| 60 |
+
vol=extract_volume_or_number(i['name'])
|
| 61 |
+
result['volume'].append(vol)
|
| 62 |
+
if 'year' in i.keys():
|
| 63 |
+
result['year'].append(i['year'])
|
| 64 |
+
else:
|
| 65 |
+
year=extract_production_year(i['name'])
|
| 66 |
+
result['year'].append(year)
|
| 67 |
+
alco=extract_alcohol_content(i['name'])
|
| 68 |
+
if 'type_wine' in i.keys():
|
| 69 |
+
result['type_wine'].append(i['type_wine'])
|
| 70 |
+
else: result['type_wine'].append(None)
|
| 71 |
+
#f alco is not None:
|
| 72 |
+
result['alco'].append(alco)
|
| 73 |
+
#else: dd['type_wine'].append(None)
|
| 74 |
+
except Exception as ex:
|
| 75 |
+
print(idf, ex)
|
| 76 |
+
return pd.DataFrame(result)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def process_products(self, products):
|
| 80 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 81 |
+
for idx, row in tqdm(products.iterrows()):
|
| 82 |
+
try:
|
| 83 |
+
result['id'].append(row['id'])
|
| 84 |
+
result['brand'].append(row['brand'])
|
| 85 |
+
result['type_wine'].append(row['category'])
|
| 86 |
+
result['type'].append(row['product_type'])
|
| 87 |
+
result['name'].append(row['name_long'])
|
| 88 |
+
vol=extract_volume_or_number(row['name'])
|
| 89 |
+
result['volume'].append(vol)
|
| 90 |
+
#year=extract_production_year(row['name'])
|
| 91 |
+
year=extract_production_year(str(row['name_postfix']))
|
| 92 |
+
result['year'].append(year)
|
| 93 |
+
#rr['year'].append(row['name_postfix'])
|
| 94 |
+
alco=extract_alcohol_content(row['name'])
|
| 95 |
+
#f alco is not None:
|
| 96 |
+
result['alco'].append(alco)
|
| 97 |
+
except Exception as ex:
|
| 98 |
+
print(ex)
|
| 99 |
+
return pd.DataFrame(result)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def prcess_text(self, text):
|
| 103 |
+
#text=''+origin
|
| 104 |
+
#text=str(split_russian_and_english(text))
|
| 105 |
+
gb=find_full_word(text, self.gbs)#get_GB(text)
|
| 106 |
+
if gb is not None:
|
| 107 |
+
text=text.replace(str(gb), '')
|
| 108 |
+
|
| 109 |
+
alcohol = extract_alcohol_content(text)
|
| 110 |
+
if alcohol is not None:
|
| 111 |
+
alco_w_comma=alcohol.replace('.', ',')
|
| 112 |
+
text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
|
| 113 |
+
volume_or_number = extract_volume_or_number(text)
|
| 114 |
+
if volume_or_number is not None:
|
| 115 |
+
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 116 |
+
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 117 |
+
test=clean_wine_name(text) #remove_l(text)
|
| 118 |
+
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 119 |
+
# else:
|
| 120 |
+
# volume_or_number=re_extract_volume(text)
|
| 121 |
+
# if volume_or_number is not None:
|
| 122 |
+
# volume_with_comma=volume_or_number.replace('.', ',')
|
| 123 |
+
# text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 124 |
+
years = extract_years(text)
|
| 125 |
+
if years is not None:
|
| 126 |
+
text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
|
| 127 |
+
production_year = extract_production_year(text)
|
| 128 |
+
if production_year is not None:
|
| 129 |
+
text=text.replace(str(production_year), '')
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
color=find_full_word(text, self.colors_ft)
|
| 133 |
+
if color is not None:
|
| 134 |
+
text=text.replace(str(color), '')
|
| 135 |
+
sour=find_full_word(text, self.sour) #get_sour(text)
|
| 136 |
+
if sour is not None:
|
| 137 |
+
text=text.replace(str(sour), '')
|
| 138 |
+
# re_extracted_volume=re_extract_volume(text)
|
| 139 |
+
# if re_extracted_volume is not None:
|
| 140 |
+
# volume_with_comma=re_extracted_volume.replace('.', ',')
|
| 141 |
+
# text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
|
| 142 |
+
|
| 143 |
+
# else:
|
| 144 |
+
# re_extracted_volume=re_extract_volume(str(volume_or_number))
|
| 145 |
+
# volume_or_number=re_extracted_volume
|
| 146 |
+
|
| 147 |
+
return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
    def process(self, products, items):
        """Run the full preprocessing pipeline over both catalogues.

        Mutates and returns copies of *items* and *products*: cleans them,
        reconciles brands between the two frames, derives service columns
        (wine colour, type, gb, sour) and trims the display names.

        :param products: products DataFrame (needs 'brand', 'type', 'name', 'id').
        :param items: items DataFrame (needs 'brand', 'type', 'name', 'id').
        :return: (items, products) — preprocessed frames.
        """

        # Per-catalogue cleaning (copies, so callers' frames are untouched).
        print('------*-----Prepare items catalogue-----*-----')
        items=self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products=self.process_products(products.copy())

        # Normalise brand spelling on both sides before any matching.
        items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())

        # Map item brands onto product brands via token-split matching.
        print('-----*-----Split n match-----*-----')
        splited=split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products['brand'].unique(), items)

        # Brands present on only one side get a second, fuzzier matching pass.
        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
        out_prods=list(set(prod_brand_list)-set(comp_list))
        out_items=list(set(items_brand_list)-set(comp_list))
        brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type']=items['type'].replace(self.type_dict)

        # Two unwrap passes: the second catches mappings only exposed by the
        # first pass's replacements.
        print('-----*-----Unwrap brend cats step 1-----*-----')
        unwrap_b_match=unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Unwrap brend cats step 2-----*-----')
        unwrap_b_match=unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        # For items still without a brand, scan their names for known product
        # brands (only brands longer than 3 chars to avoid false hits).
        print('-----*-----Finding brands in names-----*-----')
        items['new_brand']=items['new_brand'].replace('none', None)
        i_brands=items[items['new_brand'].isna()]['name'].values
        p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
        new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
                                self.grapes, self.other_words)

        # Derive service columns on both frames. NOTE(review):
        # merge_types(products, products) passes the same frame twice —
        # presumably intentional (products supply the type vocabulary);
        # confirm before changing.
        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products)

        # Strip extracted attributes (volume, year, colour, ...) from names
        # and record the per-row 'gb'/'sour' values alongside.
        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
        #items['name']=items['id'].replace(item_timed_names)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb']=gb
        items['sour']=sour
        items['sour']=items['sour'].replace(self.sour_dict)
        products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb']=gb
        products['sour']=sour
        products['sour']=products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type']=products['type'].replace(self.type_dict)

        return items, products
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
preprocess/utils/common/utils.py
CHANGED
|
@@ -1,138 +1,150 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def
|
| 26 |
-
"
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
"""
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
result.append(
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return result, gbs, sours
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
|
| 5 |
+
'''def get_delimiter(file_path):
|
| 6 |
+
with open(file_path, 'r') as f:
|
| 7 |
+
sample = f.read(1024) # читаем часть файла для анализа
|
| 8 |
+
dialect = csv.Sniffer().sniff(sample)
|
| 9 |
+
return dialect.delimiter'''
|
| 10 |
+
|
| 11 |
+
def get_delimiter(file_path):
    """Detect the column delimiter of a CSV-like file from its header line.

    Candidates are tried in a fixed priority order: ',', ';', tab, '|'.

    :param file_path: path of the file to inspect.
    :return: the first candidate found in the first line.
    :raises ValueError: when none of the candidates occurs in the header.
    """
    with open(file_path, 'r', encoding="utf-8") as handle:
        header = handle.readline()
        for candidate in (',', ';', '\t', '|'):
            if candidate in header:
                return candidate
        raise ValueError(None, "Error parsing CSV file. Cannot detect delimiter")
|
| 24 |
+
|
| 25 |
+
def remove_quotes(text):
    """Return *text* with every double and single quote character removed."""
    return text.replace('"', '').replace("'", '')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def remove_l(text):
    """Drop the standalone litre marker 'л' from *text* (case-insensitive).

    Only whole-word occurrences are removed; any double spaces produced by
    the removal are collapsed and the result is stripped.
    """
    without_marker = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
    collapsed = re.sub(r'\s{2,}', ' ', without_marker)
    return collapsed.strip()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def clean_wine_name(name):
    """Strip a lone one-letter word (Latin or Cyrillic) at the end of *name*.

    For example "токай л" becomes "токай". Single letters that are part of
    longer words are untouched because of the word boundaries.
    """
    # \s+        leading whitespace before the stray letter
    # \b...\b    exactly one Latin or Cyrillic letter as its own word
    # \s*$       optional trailing whitespace up to end of string
    trailing_letter = r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$'
    return re.sub(trailing_letter, '', name)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def find_full_word(text, word_list):
    """Return the first entry of *word_list* occurring in *text* as a whole word.

    Matching is case-insensitive and word-bounded; returns None when nothing
    from the list is found.
    """
    return next(
        (
            candidate
            for candidate in word_list
            if re.search(r'\b' + re.escape(candidate) + r'\b', text, re.IGNORECASE)
        ),
        None,
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """Derive a 'new_type_wine' column for *items*.

    For every row a colour word is looked up first in 'type_wine' (when that
    value is not None) and, failing that, in 'name'. Rows where neither
    lookup hits get None. Any per-row error is printed and the row also gets
    None. The finished column is normalised through *color_merge_dict*.

    :param items: DataFrame with 'type_wine' and 'name' columns (mutated).
    :param colors: list of colour words for find_full_word.
    :param color_merge_dict: mapping applied to the new column via replace().
    """
    detected = []
    for _, row in tqdm(items.iterrows()):
        try:
            color = None
            if row['type_wine'] is not None:
                color = find_full_word(row['type_wine'], colors)
            if color is None:
                color = find_full_word(row['name'], colors)
            detected.append(color)
        except Exception as ex:
            print(ex)
            detected.append(None)

    items['new_type_wine'] = detected
    items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def merge_types(items, products):
    """Build a 'new_type' column for *items* from the product type vocabulary.

    The vocabulary is every (stripped, lowercased) unique product type plus
    the extra spelling 'ликёр'. Per row, priority is: a whole-word vocabulary
    hit in 'name', then one in 'type', then the raw 'type' value, else None.
    Errors are printed and yield None. Finally 'ликёр' is merged into
    'ликер' and missing values are labelled 'unmatched'.

    :param items: DataFrame with 'name' and 'type' columns (mutated).
    :param products: DataFrame whose 'type' column supplies the vocabulary.
    """
    vocabulary = [t.strip().lower() for t in products['type'].unique()]
    vocabulary.append('ликёр')

    merged = []
    for _, row in tqdm(items.iterrows()):
        try:
            from_name = find_full_word(row['name'], vocabulary)
            if from_name is not None:
                merged.append(from_name)
                continue
            raw_type = row['type']
            if raw_type is None:
                merged.append(None)
                continue
            from_type = find_full_word(raw_type, vocabulary)
            merged.append(from_type if from_type is not None else raw_type)
        except Exception as ex:
            print(ex)
            merged.append(None)

    items['new_type'] = merged
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def trim_name(text, words_to_remove):
    """Delete every whole-word occurrence of *words_to_remove* from *text*.

    Matching is case-insensitive and word-bounded, so substrings embedded in
    longer words survive. Whitespace left behind by the removals is collapsed
    to single spaces and the result is stripped.

    :param text: source string.
    :param words_to_remove: iterable of words to drop.
    :return: the cleaned string.
    """
    # One alternation of all escaped words, anchored on word boundaries.
    alternatives = '|'.join(re.escape(word) for word in words_to_remove)
    pattern = r'\b(?:' + alternatives + r')\b'

    stripped = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', stripped).strip()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def name_trimmer(df, prcess_text, types_and_others):
    """Trim every 'name' in *df* and collect the extracted gb/sour values.

    Each name is first run through *prcess_text* (which strips alcohol,
    volume, year etc. and reports gb/sour), then cleaned of the words in
    *types_and_others* and of commas/periods.

    :param df: DataFrame with 'id' and 'name' columns.
    :param prcess_text: callable returning (text, alcohol, volume, years,
        production_year, gb, color, sour) for a raw name.
    :param types_and_others: word list removed from every name.
    :return: ({row id: trimmed lowercase name}, [gb per row], [sour per row]).
    """
    trimmed = {}
    gbs = []
    sours = []
    for _, row in tqdm(df.iterrows()):
        text, _alco, _vol, _years, _prod_year, gb, _color, sour = prcess_text(str(row['name']))
        cleaned = trim_name(text, types_and_others).replace(',', '').replace('.', '')
        trimmed[row['id']] = cleaned.lower().strip()

        gbs.append(gb)
        sours.append(sour)
    return trimmed, gbs, sours
|
processor/matching.py
CHANGED
|
@@ -1,159 +1,159 @@
|
|
| 1 |
-
from tqdm import tqdm
|
| 2 |
-
from transliterate import translit, detect_language
|
| 3 |
-
import pandas as pd
|
| 4 |
-
from rapidfuzz import fuzz, process
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def normalize_name(name):
|
| 8 |
-
"""
|
| 9 |
-
Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
|
| 10 |
-
приводит к нижнему регистру.
|
| 11 |
-
"""
|
| 12 |
-
try:
|
| 13 |
-
if detect_language(name) == 'ru':
|
| 14 |
-
return translit(name, 'ru', reversed=True).lower()
|
| 15 |
-
except Exception:
|
| 16 |
-
pass
|
| 17 |
-
return name.lower()
|
| 18 |
-
|
| 19 |
-
def prepare_groups_with_ids(items_df):
|
| 20 |
-
"""
|
| 21 |
-
Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
|
| 22 |
-
с учетом нормализованного названия.
|
| 23 |
-
|
| 24 |
-
Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
|
| 25 |
-
|
| 26 |
-
:param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
|
| 27 |
-
:return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
|
| 28 |
-
"""
|
| 29 |
-
items_df = items_df.copy()
|
| 30 |
-
items_df['norm_name'] = items_df['name'].apply(normalize_name)
|
| 31 |
-
|
| 32 |
-
grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
|
| 33 |
-
lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 34 |
-
).to_dict()
|
| 35 |
-
return grouped
|
| 36 |
-
|
| 37 |
-
def prepare_groups_by_alternative_keys(items_df):
|
| 38 |
-
"""
|
| 39 |
-
Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
|
| 40 |
-
оригинального и нормализованного имени.
|
| 41 |
-
|
| 42 |
-
:param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
|
| 43 |
-
:return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
|
| 44 |
-
"""
|
| 45 |
-
items_df = items_df.copy()
|
| 46 |
-
items_df['norm_name'] = items_df['name'].apply(normalize_name)
|
| 47 |
-
|
| 48 |
-
grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
|
| 49 |
-
lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 50 |
-
).to_dict()
|
| 51 |
-
return grouped
|
| 52 |
-
|
| 53 |
-
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
|
| 54 |
-
"""
|
| 55 |
-
Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
|
| 56 |
-
нормализованные группы.
|
| 57 |
-
|
| 58 |
-
Производится два прохода:
|
| 59 |
-
- Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
|
| 60 |
-
- Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
|
| 61 |
-
исключая итемы с исходным брендом.
|
| 62 |
-
|
| 63 |
-
Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
|
| 64 |
-
|
| 65 |
-
:param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
|
| 66 |
-
:param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
|
| 67 |
-
:param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
|
| 68 |
-
:param name_threshold: Порог сходства для fuzzy matching.
|
| 69 |
-
:return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
|
| 70 |
-
"""
|
| 71 |
-
results = []
|
| 72 |
-
no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
|
| 73 |
-
|
| 74 |
-
# Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
|
| 75 |
-
for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
|
| 76 |
-
product_brand = product['brand']
|
| 77 |
-
product_type = product['type']
|
| 78 |
-
product_name = product['name']
|
| 79 |
-
product_volume = product['volume']
|
| 80 |
-
product_type_wine = product['new_type_wine']
|
| 81 |
-
product_sour = product['sour']
|
| 82 |
-
|
| 83 |
-
key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
|
| 84 |
-
items_data = items_groups.get(key, [])
|
| 85 |
-
if items_data:
|
| 86 |
-
# Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
|
| 87 |
-
items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
|
| 88 |
-
else:
|
| 89 |
-
items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
|
| 90 |
-
|
| 91 |
-
norm_product_name = normalize_name(product_name)
|
| 92 |
-
matches = process.extract(
|
| 93 |
-
norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
|
| 94 |
-
)
|
| 95 |
-
matched_items = [
|
| 96 |
-
{
|
| 97 |
-
'item_id': items_ids[idx_candidate],
|
| 98 |
-
'item_name': items_names[idx_candidate],
|
| 99 |
-
'score': score,
|
| 100 |
-
'volume': items_volumes[idx_candidate],
|
| 101 |
-
'color': item_type_wine[idx_candidate],
|
| 102 |
-
'sour': items_sour[idx_candidate],
|
| 103 |
-
'year': items_year[idx_candidate],
|
| 104 |
-
}
|
| 105 |
-
for match, score, idx_candidate in matches
|
| 106 |
-
]
|
| 107 |
-
|
| 108 |
-
if not matched_items:
|
| 109 |
-
no_match_products.append((idx, product))
|
| 110 |
-
|
| 111 |
-
results.append({
|
| 112 |
-
'product_id': product['id'],
|
| 113 |
-
'matched_items': matched_items,
|
| 114 |
-
'alternative': [] # Заполняется во втором проходе
|
| 115 |
-
})
|
| 116 |
-
|
| 117 |
-
# Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
|
| 118 |
-
groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
|
| 119 |
-
|
| 120 |
-
# Второй проход: для продуктов без совпадений ищем по альтернативным группам
|
| 121 |
-
for idx, product in tqdm(no_match_products):
|
| 122 |
-
product_brand = product['brand']
|
| 123 |
-
product_type_wine = product['new_type_wine']
|
| 124 |
-
product_type = product['new_type']
|
| 125 |
-
product_volume = product['volume']
|
| 126 |
-
product_name = product['name']
|
| 127 |
-
product_sour = product['sour']
|
| 128 |
-
|
| 129 |
-
alt_key = (product_type_wine, product_type, product_volume, product_sour)
|
| 130 |
-
type_items = groups_by_alternative_keys.get(alt_key, [])
|
| 131 |
-
# Фильтруем, исключая итемы с исходным брендом
|
| 132 |
-
filtered_items = [item for item in type_items if item[1] != product_brand]
|
| 133 |
-
if filtered_items:
|
| 134 |
-
alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
|
| 135 |
-
else:
|
| 136 |
-
alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
|
| 137 |
-
|
| 138 |
-
norm_product_name = normalize_name(product_name)
|
| 139 |
-
alt_matches = process.extract(
|
| 140 |
-
norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
|
| 141 |
-
)
|
| 142 |
-
alt_matched_items = [
|
| 143 |
-
{
|
| 144 |
-
'item_id': alt_ids[idx_candidate],
|
| 145 |
-
'item_name': alt_names[idx_candidate],
|
| 146 |
-
'score': score,
|
| 147 |
-
'volume': alt_volumes[idx_candidate],
|
| 148 |
-
'color': alt_type_wine[idx_candidate],
|
| 149 |
-
'sour': alt_sour[idx_candidate],
|
| 150 |
-
'year': alt_year[idx_candidate],
|
| 151 |
-
}
|
| 152 |
-
for match, score, idx_candidate in alt_matches
|
| 153 |
-
]
|
| 154 |
-
|
| 155 |
-
results[idx]['alternative'] = alt_matched_items
|
| 156 |
-
|
| 157 |
-
results_df = pd.DataFrame(results)
|
| 158 |
-
merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
|
| 159 |
return merged_df
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
from transliterate import translit, detect_language
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def normalize_name(name):
    """Lower-case *name*; Russian text is first transliterated to Latin.

    Detection/transliteration failures are swallowed and the plain
    lower-cased input is returned instead.
    """
    try:
        if detect_language(name) == 'ru':
            name = translit(name, 'ru', reversed=True)
    except Exception:
        pass
    return name.lower()
|
| 18 |
+
|
| 19 |
+
def prepare_groups_with_ids(items_df):
    """Group items by (new_brand, type, volume, new_type_wine, sour).

    A 'norm_name' column is computed once up front so the fuzzy matcher does
    not re-normalise names on every comparison. The input frame is copied,
    not mutated.

    :param items_df: DataFrame with 'new_brand', 'type', 'name', 'id',
        'volume', 'new_type_wine', 'sour' and 'year' columns.
    :return: {(new_brand, type, volume, new_type_wine, sour):
              [(id, name, norm_name, volume, new_type_wine, sour, year), ...]}
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _pack(group):
        # Per-group list of per-row tuples, in a fixed column order.
        return list(zip(group['id'], group['name'], group['norm_name'],
                        group['volume'], group['new_type_wine'],
                        group['sour'], group['year']))

    return frame.groupby(
        ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    ).apply(_pack).to_dict()
|
| 36 |
+
|
| 37 |
+
def prepare_groups_by_alternative_keys(items_df):
    """Group items by (new_type_wine, new_type, volume, sour).

    This is the fallback grouping used when the brand-keyed lookup finds
    nothing; it additionally carries 'new_brand' so callers can exclude a
    product's own brand. The input frame is copied, not mutated.

    :param items_df: DataFrame with 'new_brand', 'new_type_wine', 'new_type',
        'volume', 'name', 'id', 'sour' and 'year' columns.
    :return: {(new_type_wine, new_type, volume, sour):
              [(id, new_brand, name, norm_name, volume, new_type_wine, sour, year), ...]}
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _pack(group):
        # Per-group list of per-row tuples, in a fixed column order.
        return list(zip(group['id'], group['new_brand'], group['name'],
                        group['norm_name'], group['volume'],
                        group['new_type_wine'], group['sour'], group['year']))

    return frame.groupby(
        ['new_type_wine', 'new_type', 'volume', 'sour']
    ).apply(_pack).to_dict()
|
| 52 |
+
|
| 53 |
+
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """Match every product against pre-grouped items, keeping item ids.

    Two passes are made:
      1. look up the product's exact group (brand, type, volume,
         new_type_wine, sour) and fuzzy-match normalised names inside it;
      2. for products with no first-pass hit, retry against the alternative
         grouping (new_type_wine, new_type, volume, sour), excluding items
         with the product's own brand.

    Comparison uses pre-computed 'norm_name' values; the original 'name' is
    what appears in the output dicts.

    :param products_df: DataFrame with 'id', 'brand', 'type', 'name',
        'volume', 'new_type_wine', 'sour', 'new_type' columns.
    :param items_groups: mapping built by prepare_groups_with_ids.
    :param items_df: items DataFrame used to build the fallback grouping.
    :param name_threshold: rapidfuzz score cutoff for a name match.
    :return: products_df merged with 'matched_items' and 'alternative'
        columns (lists of match dicts).
    """

    def _build_matches(matches, ids, names, volumes, colors, sours, years):
        # Convert rapidfuzz (match, score, index) triples into result dicts.
        return [
            {
                'item_id': ids[i],
                'item_name': names[i],
                'score': score,
                'volume': volumes[i],
                'color': colors[i],
                'sour': sours[i],
                'year': years[i],
            }
            for _match, score, i in matches
        ]

    results = []
    no_match_products = []  # (position in `results`, product row) pairs

    # First pass: exact (brand, type, volume, new_type_wine, sour) groups.
    for _, product in tqdm(products_df.iterrows(), total=len(products_df)):
        key = (product['brand'], product['type'], product['volume'],
               product['new_type_wine'], product['sour'])
        items_data = items_groups.get(key, [])
        if items_data:
            # Unpack: id, original name, normalised name, volume, colour, sour, year
            ids, names, norm_names, volumes, colors, sours, years = zip(*items_data)
        else:
            ids = names = norm_names = volumes = colors = sours = years = ()

        matches = process.extract(
            normalize_name(product['name']), list(norm_names),
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        matched_items = _build_matches(matches, ids, names, volumes, colors, sours, years)

        if not matched_items:
            # BUG FIX: the original stored the DataFrame index label from
            # iterrows() and later used it as a *list* position into
            # `results`, which is wrong (or raises) for any frame without a
            # default RangeIndex. Store the actual list position instead.
            no_match_products.append((len(results), product))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': [],  # filled during the second pass
        })

    # Fallback grouping on (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: products without matches, excluding their own brand.
    for pos, product in tqdm(no_match_products):
        alt_key = (product['new_type_wine'], product['new_type'],
                   product['volume'], product['sour'])
        candidates = [item for item in groups_by_alternative_keys.get(alt_key, [])
                      if item[1] != product['brand']]
        if candidates:
            ids, _brands, names, norm_names, volumes, colors, sours, years = zip(*candidates)
        else:
            ids = names = norm_names = volumes = colors = sours = years = ()

        alt_matches = process.extract(
            normalize_name(product['name']), list(norm_names),
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        results[pos]['alternative'] = _build_matches(
            alt_matches, ids, names, volumes, colors, sours, years)

    results_df = pd.DataFrame(results)
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df
|
processor/processor.py
CHANGED
|
@@ -1,32 +1,28 @@
|
|
| 1 |
-
from preprocess.preprocess import Preprocessor
|
| 2 |
-
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Processor():
|
| 6 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 7 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 8 |
-
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 9 |
-
|
| 10 |
-
self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
|
| 11 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 12 |
-
sour_merge_dict, type_merge_dict, color_merge_dict)
|
| 13 |
-
|
| 14 |
-
def process(self, products, items, is_items_first=False, th=65):
|
| 15 |
-
items, products=self.preprocessor.process(products, items)
|
| 16 |
-
|
| 17 |
-
print('-----*-----Matching-----*-----')
|
| 18 |
-
|
| 19 |
-
if is_items_first:
|
| 20 |
-
products['new_brand']=products['brand']
|
| 21 |
-
items['brand']=items['new_brand']
|
| 22 |
-
products_groups = prepare_groups_with_ids(products)
|
| 23 |
-
res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
|
| 24 |
-
else:
|
| 25 |
-
items_groups = prepare_groups_with_ids(items)
|
| 26 |
-
res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
|
| 27 |
-
|
| 28 |
-
return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 1 |
+
from preprocess.preprocess import Preprocessor
|
| 2 |
+
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Processor():
    """End-to-end pipeline: preprocess both catalogues, then fuzzy-match them."""

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):

        # All vocabulary lists and merge dictionaries are forwarded verbatim
        # to the Preprocessor, which owns the actual cleaning logic.
        self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
                                       type_wine, gbs, colors_for_trim, grapes, other_words,
                                       sour_merge_dict, type_merge_dict, color_merge_dict)

    def process(self, products, items, is_items_first=False, th=65):
        """Preprocess and match the two catalogues.

        :param products: products DataFrame.
        :param items: items DataFrame.
        :param is_items_first: when True the roles are swapped — items are
            matched against product groups instead of the default direction.
        :param th: fuzzy-matching score threshold passed to the matcher.
        :return: (match result frame with service columns dropped,
                  preprocessed items, preprocessed products).
        """
        items, products=self.preprocessor.process(products, items)

        print('-----*-----Matching-----*-----')

        if is_items_first:
            # Align brand column names so both frames expose the keys the
            # grouping code expects ('brand' on the query side,
            # 'new_brand' on the grouped side).
            products['new_brand']=products['brand']
            items['brand']=items['new_brand']
            products_groups = prepare_groups_with_ids(products)
            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)

        # Drop intermediate service columns before returning the result.
        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
python-Levenshtein
|
| 2 |
-
transliterate
|
| 3 |
-
rapidfuzz
|
| 4 |
-
pyahocorasick
|
| 5 |
-
unidecode
|
| 6 |
-
pqdm
|
| 7 |
tqdm
|
|
|
|
| 1 |
+
python-Levenshtein
|
| 2 |
+
transliterate
|
| 3 |
+
rapidfuzz
|
| 4 |
+
pyahocorasick
|
| 5 |
+
unidecode
|
| 6 |
+
pqdm
|
| 7 |
tqdm
|
search/search_by_id.py
CHANGED
|
@@ -1,24 +1,53 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import ast
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class Searcher():
|
| 7 |
-
def __init__(self):
|
| 8 |
-
self.df = None
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
return result
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import ast
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Searcher():
    """Looks up match results for a product id in a saved matching table."""

    def __init__(self):
        # Result DataFrame; attached lazily via set_df().
        self.df = None

    def set_df(self, df):
        """Attach a result frame, parsing the stringified match columns.

        'matched_items' and 'alternative' are stored in the CSV as
        Python-literal strings; convert them back to lists of dicts.
        Parsing errors are printed and leave the columns untouched.
        """
        self.df = df
        try:
            self.df['matched_items'] = self.df['matched_items'].apply(
                lambda x: ast.literal_eval(x) if pd.notna(x) else x)
            self.df['alternative'] = self.df['alternative'].apply(
                lambda x: ast.literal_eval(x) if pd.notna(x) else x)
        except Exception as e:
            print(e)

    def _matched_items_frame(self, query):
        """Return the 'matched_items' list for id *query* as a DataFrame.

        BUG FIX: this was originally a second ``def search(self, query)``
        that was silently shadowed by the two-argument ``search`` below, so
        ``search_in_uploaded_file`` crashed with a TypeError when calling it.
        Renamed to a private helper and wired back in.
        """
        data = json.loads(json.dumps(self.df[self.df['id'] == query]['matched_items'].values[0]))
        return pd.DataFrame(data)

    def search(self, resultfn, query):
        """Load the TSV at *resultfn* and return matches for id *query*.

        Falls back to the 'alternative' matches when 'matched_items' is
        empty for that row.

        :return: (DataFrame of matches — possibly empty,
                  True when the matches came from 'alternative').
        """
        is_alternative_items = False
        df_matched_items = pd.DataFrame()

        matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)

        items = self.df[self.df['id'] == query]
        matched_items = items['matched_items']
        if (len(matched_items) != 0) and (len(matched_items.values[0])):
            data = json.loads(json.dumps(matched_items.values[0]))
            df_matched_items = pd.DataFrame(data)
            is_alternative_items = False
        else:
            alter_items = items['alternative']

            if (len(alter_items) != 0) and (len(alter_items.values[0])):
                data = json.loads(json.dumps(alter_items.values[0]))
                df_matched_items = pd.DataFrame(data)
                is_alternative_items = True

        return (df_matched_items, is_alternative_items)

    def search_in_uploaded_file(self, path, query):
        """Load the TSV at *path* and return only the matched-items frame.

        Kept for callers that expect the original single-frame return shape
        (no alternative fallback, no flag).
        """
        matching_result = pd.read_csv(path, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)
        result = self._matched_items_frame(query)
        return result
|
tmp/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/service/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/utils.py
CHANGED
|
@@ -1,37 +1,48 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
-
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
+
import shutil
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def update_products_csv(new_csv_path, prods_file, overwrite_existing):
    """Merge or replace the master products CSV with an uploaded one.

    Args:
        new_csv_path: path of the freshly uploaded CSV.
        prods_file: path of the persistent master products CSV.
        overwrite_existing: when True (or when no master file exists yet)
            the upload replaces the master outright; otherwise the two are
            merged, with uploaded rows winning on duplicate 'id'.

    Returns:
        prods_file — the path of the updated master CSV.

    Raises:
        Exception: if the uploaded CSV looks like an *items* file
            (i.e. it contains an 'attrs' column).
    """
    # Read the upload once, up front, with its own sniffed delimiter.
    new_sep = get_delimiter(new_csv_path)
    new_csv = pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")

    # CONSISTENCY FIX: this validation previously ran only on the merge
    # path, so an invalid file could silently replace the master on the
    # overwrite path. 'attrs' is an items-file column; its presence means
    # the wrong file was uploaded.
    if 'attrs' in new_csv.columns.values:
        raise Exception("Uploaded Products CSV does not seem to be valid")

    if os.path.isfile(prods_file) and not overwrite_existing:
        main_sep = get_delimiter(prods_file)
        main_csv = pd.read_csv(prods_file, sep=main_sep, on_bad_lines="warn")

        # keep='last' makes rows from the upload win on duplicate ids.
        result = pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
        result.to_csv(prods_file, sep=main_sep, index=False)
    else:
        # No master yet, or caller asked to overwrite: write the upload
        # as-is, preserving its own delimiter.
        new_csv.to_csv(prods_file, sep=new_sep, index=False)

    return prods_file
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# NOTE(review): dead code preserved as a module-level string literal (a
# harmless no-op statement). ui/gradio_ui.py still does
# `from tmp.utils import remover, update_products_csv`, but `remover` only
# exists inside this commented-out block — that import will fail at module
# load; confirm and either restore `remover` or drop it from the import.
'''def is_csv_exist(path):
    file_list=glob(path+'/*.csv')
    if len(file_list)>0:
        return file_list[0]
    else:
        None


def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
    main_path=is_csv_exist(main_dir)
    if main_path==None:
        new_path = shutil.move(new_path, main_dir)
        return new_path
    else:
        update_products_csv(main_path, new_path)
        return main_path

def remover(data_path):
    #path=is_csv_exist('/home/user/app/tmp/prod.csv')
    #if path!=None:
    os.remove(os.getcwd()+'/tmp/prod.csv')
    shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''
|
ui/gradio_ui.py
CHANGED
|
@@ -1,121 +1,170 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
from
|
| 7 |
-
import os
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
self.
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
#
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
demo.launch()
|
|
|
|
| 1 |
+
from argparse import ArgumentError
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 6 |
+
from tmp.utils import remover, update_products_csv
|
| 7 |
+
import os
|
| 8 |
+
import datetime, time
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class GradioUI():
    """Gradio front-end for the product/items matching pipeline.

    Wires four tabs (upload products, process supplier catalog, search in
    the last result, search in an uploaded result) to a `processor`
    (matching pipeline) and a `searcher` (result lookup). All persisted
    files live under `data_path`.
    """

    def __init__(self, processor, searcher, data_path):
        self.processor=processor
        self.searcher=searcher
        self.data_path = data_path

    def get_data_dir(self):
        # Base directory for all persisted files.
        return self.data_path

    def get_products_dir(self):
        return os.path.join(self.get_data_dir(), "products")

    def get_items_dir(self):
        return os.path.join(self.get_data_dir(), "items")

    def get_results_dir(self):
        return os.path.join(self.get_data_dir(), "results")

    def get_products_file_date(self):
        """Return the master products.csv mtime as a human-readable string,
        or a (Russian) "file not found" message."""
        fullfn = os.path.join(self.data_path, "products", "products.csv")
        if not os.path.isfile(fullfn):
            return "Файл Products не найден"

        stinfo = os.stat(fullfn)
        return time.ctime(stinfo.st_mtime)

    def upload_products_file(self, prods_file, overwrite_existing):
        """Gradio callback: merge/replace the master products.csv with the
        uploaded file; failures surface as a gr.Error popup."""
        try:
            if not os.path.exists(self.get_products_dir()):
                os.makedirs(self.get_products_dir())

            fullfn = os.path.join(self.get_products_dir(), "products.csv")

            # prods_file is None when the user clicks without selecting a file.
            if prods_file != None:
                update_products_csv(prods_file, fullfn, overwrite_existing)

            gr.Info("Файл Products успешно загружен")
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def process_items(self, items_file, is_items_first, threshold): #, q_id):
        """Gradio callback: match the uploaded items CSV against the stored
        products.csv and return the path of the result CSV for download.

        Also primes self.searcher with the fresh result so the "search in
        processed csv" tab works immediately.
        """
        try:
            prods_file = os.path.join(self.get_products_dir(), "products.csv")
            if not os.path.isfile(prods_file):
                raise Exception("Файл Products не найден")

            # Nothing happens (returns None) if no items file was selected.
            if items_file != None:
                items_delimiter=get_delimiter(items_file)
                print('items delimiter: '+items_delimiter)
                row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
                # Items files must carry an 'attrs' column; reject otherwise.
                if not 'attrs' in row_items.columns.values:
                    raise Exception("Uploaded Items CSV does not seem to be valid")

                products_delimiter=get_delimiter(prods_file)
                print('products delimiter: '+products_delimiter)
                row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')

                # if q_id in row_products['id'].unique():
                #    row_products=row_products[row_products['id']==q_id]

                #print("product id: " + str(q_id))

                df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold)

                # Copy so later in-place parsing in the searcher cannot
                # affect the frame written to disk below.
                self.searcher.set_df(df.copy())
                #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                #    output_csv = tmp.name
                results_path = self.get_results_dir()
                if not os.path.exists(results_path):
                    os.makedirs(results_path)

                # Result name encodes threshold and a timestamp, e.g. m1-50-240131-120000.csv
                output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
                output_csv = os.path.join(results_path, output_csv)
                df.to_csv(output_csv, sep='\t', index=False)
                return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)

    def on_page_load(self, r: gr.Request):
        # Refresh both Markdown widgets with the products-file date; note
        # this replaces the first tab's heading text with the date line.
        m_time = self.get_products_file_date()
        return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]


    def run_ui(self):
        """Build the Blocks layout, wire callbacks and launch the app."""
        with gr.Blocks() as demo:
            tabs = gr.Tabs()
            with tabs:

                # with gr.Row():
                #    file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                #    process_button = gr.Button("Обновить")

                # Tab 1: upload/refresh the master products file.
                with gr.TabItem("Загрузка файла Products"):
                    prod_file_info1 = gr.Markdown("## Загрузка файла Products")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
                    upload_button = gr.Button("Загрузить файл")
                    upload_button.click(
                        fn=self.upload_products_file,
                        inputs=[file_input1, toggle_input],
                        #outputs=output_file
                    )

                # Tab 2: process an uploaded supplier catalog (CSV files).
                with gr.TabItem("Обработка каталога поставщика"):
                    gr.Markdown("## Обработка каталога поставщика")

                    m_time = self.get_products_file_date()
                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
                    with gr.Row():
                        #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
                        #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    with gr.Row():
                        # NOTE(review): `toggle_input` reuses the name from tab 1;
                        # harmless since each is bound before reassignment.
                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
                        threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_items,
                        inputs=[file_items, toggle_input, threshold_input], #, search_number],
                        outputs=output_file
                    )

                # Tab 3: search inside the most recently processed result.
                with gr.TabItem("Поиск в обработанном csv"):
                    gr.Markdown("## Поиск")
                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search,
                        inputs=[search_number],
                        outputs=search_table
                    )

                # Tab 4: upload a previously saved result and search inside it.
                with gr.TabItem("Загрузка результат и поиск в нем"):
                    gr.Markdown("## Поиск")
                    with gr.Row():
                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                        search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search_in_uploaded_file,
                        inputs=[input_path, search_number],
                        outputs=search_table
                    )

                #with gr.TabItem("Удалить сохраненные продукты"):
                #    del_button = gr.Button("Удалить")
                #    process_button.click(fn=remover)

            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
        demo.launch()
|