Spaces:

Gainward777
/

Product_Matching

Sleeping

App Files Community

API and Changes

by j-s-v - opened Mar 18, 2025

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+822

-1499

This PR is in draft mode

Files changed (18) hide show

.gitattributes +35 -35
.gitignore +0 -3
README.md +12 -12
api.py +0 -205
app.py +31 -33
constants/constants.py +4 -31
preprocess/preprocess.py +224 -243
preprocess/utils/common/utils.py +137 -164
preprocess/utils/items/attrs.py +1 -1
processor/matching.py +158 -301
processor/processor.py +32 -30
requirements.txt +6 -6
search/matching_judge.py +0 -156
search/search_by_id.py +23 -52
tmp/prod.csv +1 -1
tmp/service/prod.csv +1 -1
tmp/utils.py +37 -48
ui/gradio_ui.py +120 -177

.gitattributes CHANGED Viewed

@@ -1,35 +1,35 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore DELETED Viewed

@@ -1,3 +0,0 @@
-*.pyc
-.idea/*
-_data/*

README.md CHANGED Viewed

@@ -1,12 +1,12 @@
----
-title: Product Matching
-emoji: 🏃
-colorFrom: gray
-colorTo: purple
-sdk: gradio
-sdk_version: 5.19.0
-app_file: app.py
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: Product Matching
+emoji: 🏃
+colorFrom: gray
+colorTo: purple
+sdk: gradio
+sdk_version: 5.19.0
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

api.py DELETED Viewed

@@ -1,205 +0,0 @@
-import csv
-import json
-import os
-import datetime
-from processor.processor import Processor
-from constants.constants import *
-from search.search_by_id import Searcher
-from fastapi import FastAPI, File, UploadFile, HTTPException
-import uvicorn
-from pydantic import BaseModel
-import pandas as pd
-from tmp.utils import update_products_csv
-from search.matching_judge import compare_matching_with_manual
-'''compare_matching_with_manual("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New5)\\products.csv",
-                             "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\ws-items-for-test.csv",
-                             "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\m1-50-250325-133739.csv",
-                             "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\matching-20250318.csv")'''
-processor=Processor(LONG_TYPES_LIST,
-                    SHORT_TYPES_LIST,
-                    SOUR,
-                    WINE_TYPES,
-                    GBS,
-                    COLORS_FOR_TRIM,
-                    GRAPES,
-                    OTHER_WORDS,
-                    SOUR_MERGE_DICT,
-                    TYPES_WINES_DICT,
-                    COLOR_MERGE_DICT)
-searcher=Searcher()
-class item_by_id(BaseModel):
-    result_file: str
-    id: str
-class match_request(BaseModel):
-    items: str
-    threshold: int
-    items_first: int
-def get_data_dir():
-    return "/home/user/app/_data/"
-    #return "_data"
-def get_products_dir():
-    return os.path.join(get_data_dir(), "products")
-def get_items_dir():
-    return os.path.join(get_data_dir(), "items")
-def get_results_dir():
-    return os.path.join(get_data_dir(), "results")
-app = FastAPI()
-@app.get("/api/get_result_csv")
-async def get_result_csv():
-    results = []
-    for file in os.listdir(get_results_dir()):
-        if file.endswith(".csv"):
-            results.append(file)
-    results_json = json.dumps(results)
-    return results_json
-@app.post("/api/upload_result_csv")
-async def upload_result_csv(file: UploadFile = File(...)):
-    try:
-        contents = file.file.read()
-        with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
-            f.write(contents)
-    except Exception:
-        raise HTTPException(status_code=500, detail='Something went wrong')
-    finally:
-        file.file.close()
-    return {"message": f"Successfully uploaded {file.filename}"}
-@app.post("/api/upload_products_csv")
-async def upload_products_csv(file: UploadFile, overwrite_existing: int):
-    try:
-        datadir = get_products_dir()
-        if not os.path.exists(datadir):
-            os.makedirs(datadir)
-        tempfile = os.path.join(datadir, "products.csv_upload")
-        contents = file.file.read()
-        with open(tempfile, 'wb') as f:
-            f.write(contents)
-        fullfn = os.path.join(datadir, "products.csv")
-        update_products_csv(tempfile, fullfn, overwrite_existing)
-        os.remove(tempfile)
-    except Exception:
-        raise HTTPException(status_code=500, detail='Something went wrong')
-    finally:
-        file.file.close()
-    return {"message": f"Successfully uploaded {file.filename}"}
-#@app.post("/api/upload_items_csv")
-def upload_items_csv(file: UploadFile):
-    try:
-        itemsdir = get_items_dir()
-        if not os.path.exists(itemsdir):
-            os.makedirs(itemsdir)
-        contents = file.file.read()
-        fullfn = os.path.join(itemsdir, file.filename)
-        with open(fullfn, 'wb') as f:
-            f.write(contents)
-    except Exception:
-        raise HTTPException(status_code=500, detail='Something went wrong')
-    finally:
-        file.file.close()
-    #return {"message": f"Successfully uploaded {file.filename}"}
-    return fullfn
-@app.get("/api/get_items_csv")
-async def get_items_csv():
-    itemsdir = get_items_dir()
-    results = []
-    for file in os.listdir(itemsdir):
-        if file.endswith(".csv"):
-            results.append(file)
-    results_json = json.dumps(results)
-    return results_json
-@app.post("/api/match")
-async def match(items_file: UploadFile, threshold: int, items_first: int):
-    prods_file = os.path.join(get_products_dir(), "products.csv")
-    if not os.path.isfile(prods_file):
-        return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
-    items_fn = upload_items_csv(items_file)
-    #if len(r.items) == 0:
-    #    return {"Status": "Error", "ErrorDesc": "Items file not specified"}
-    if not threshold:
-        threshold = 50
-    #items_fn = os.path.join(get_items_dir(), r.items)
-    #if not os.path.isfile(items_fn):
-    #    return {"Status": "Error", "ErrorDesc": "Items file not found"}
-    row_items = pd.read_csv(items_fn, sep='\t')
-    os.remove(items_fn)
-    row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
-    df, items, products = processor.process(row_products, row_items, items_first, threshold)
-    results_dir = get_results_dir()
-    if not os.path.exists(results_dir):
-        os.makedirs(results_dir)
-    output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
-    df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
-    return {"Status": "Success", "result_file" : output_csv}
-@app.get("/api/get_matched_by_id")
-async def get_matched_by_id(item: item_by_id):
-    fullfn = os.path.join(get_results_dir(), item.result_file)
-    if not os.path.isfile(fullfn):
-        return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}
-    (df, is_alternative) = searcher.search(fullfn, int(item.id))
-    if df.empty:
-        return {"Status": "Success", "IsAlternative": False, "Data": ""}
-    return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
-if __name__ == "__main__":
-    uvicorn.run(
-        app,
-        host="0.0.0.0",
-        port=8000,
-        log_level="debug"
-    )

app.py CHANGED Viewed

@@ -1,33 +1,31 @@
-from processor.processor import Processor
-from constants.constants import *
-from ui.gradio_ui import GradioUI
-from search.search_by_id import Searcher
-processor=Processor(LONG_TYPES_LIST,
-                    SHORT_TYPES_LIST,
-                    SOUR,
-                    WINE_TYPES,
-                    GBS,
-                    COLORS_FOR_TRIM,
-                    GRAPES,
-                    OTHER_WORDS,
-                    SOUR_MERGE_DICT,
-                    TYPES_WINES_DICT,
-                    COLOR_MERGE_DICT,
-                    COUNTRY_LIST,
-                    NORMALIZED_NAMES_ALTERNATIVES_DICT
-                    )
-searcher=Searcher()
-ui=GradioUI(processor, searcher, "/home/user/app/_data/")
-#ui=GradioUI(processor, searcher, "_data")
-ui.run_ui()

+from processor.processor import Processor
+from constants.constants import *
+from ui.gradio_ui import GradioUI
+from search.search_by_id import Searcher
+processor=Processor(LONG_TYPES_LIST,
+                    SHORT_TYPES_LIST,
+                    SOUR,
+                    WINE_TYPES,
+                    GBS,
+                    COLORS_FOR_TRIM,
+                    GRAPES,
+                    OTHER_WORDS,
+                    SOUR_MERGE_DICT,
+                    TYPES_WINES_DICT,
+                    COLOR_MERGE_DICT)
+searcher=Searcher()
+ui=GradioUI(processor, searcher)
+ui.run_ui()

constants/constants.py CHANGED Viewed

@@ -75,9 +75,7 @@ SOUR = [
         'п/сл',
         'п/с',
         'сл',
-        'сл.',
         'сух',
-        'сух.'
     ]
@@ -87,8 +85,7 @@ WINE_TYPES = [
     'розовое',
     'роз',
     'кр',
-    'крас',
-    'бел',
     'розе',
     'rosso',
     'roso',
@@ -154,13 +151,11 @@ GBS = [
 COLORS_FOR_TRIM = [
     'красное',
-    'крас',
-    'кр',
     'белое',
     'бел',
-    'розовое',
     'розе',
-    'rose',
     'rosso',
     'roso',
     'roseto',
@@ -212,8 +207,6 @@ GRAPES = [
 OTHER_WORDS=[
-    "Шампанское",
-    "Шампань",
     "Игристое",
     "Жемчужное",
     "Газированный",
@@ -234,7 +227,6 @@ OTHER_WORDS=[
     "Десертный",
     "Вкус",
     "Сорт",
-    "односолод."
     ]
@@ -244,14 +236,10 @@ SOUR_MERGE_DICT={
     'sweet':'сладкое',
     'сухое':'сухое',
     'п/сух':'полусухое',
-    'п/сух.':'полусухое',
     'п/сл':'полусладкое',
-    'п/сл.':'полусладкое',
     'п/с':'полусухое',
     'сл':'сладкое',
-    'сл.':'сладкое',
     'сух':'сухое',
-    'сух.':'сухое',
     None: 'unmatched',
     }
@@ -265,8 +253,7 @@ TYPES_WINES_DICT={
     'Сироп':'Сиропы',
     'Арманьяк':'Коньяк',
     'Бренди':'Коньяк',
-    'Ликер':'Ликер',
-    'Ликёр': 'Ликер',
     'Граппа':'Водка',
     'Настойка':'Водка',
     'Конфеты':'Сладости',
@@ -276,13 +263,11 @@ TYPES_WINES_DICT={
     'Винный напиток': "Вино",
     "Игристое вино":'Шампанское',
     "Самогон": "Водка",
-    None: 'unmatched'
     }
 COLOR_MERGE_DICT={
     "кр":'красное',
-    "крас":'красное',
     "red":"красное",
     "бел":"белое",
     "white":"белое",
@@ -298,15 +283,3 @@ COLOR_MERGE_DICT={
     None: 'unmatched'
     }
-COUNTRY_LIST=[
-    "Франция",
-    "Испания",
-    "Италия",
-    "Шотландия",
-]
-NORMALIZED_NAMES_ALTERNATIVES_DICT={
-    "M&H" : ["em end ejch"],
-    "peats beast" : ["pits bist"],
-    "xo": ["ho"]
-}

         'п/сл',
         'п/с',
         'сл',
         'сух',
     ]
     'розовое',
     'роз',
     'кр',
+    'бел',
     'розе',
     'rosso',
     'roso',
 COLORS_FOR_TRIM = [
     'красное',
     'белое',
+    'розовое'
+    'кр',
     'бел',
     'розе',
     'rosso',
     'roso',
     'roseto',
 OTHER_WORDS=[
     "Игристое",
     "Жемчужное",
     "Газированный",
     "Десертный",
     "Вкус",
     "Сорт",
     ]
     'sweet':'сладкое',
     'сухое':'сухое',
     'п/сух':'полусухое',
     'п/сл':'полусладкое',
     'п/с':'полусухое',
     'сл':'сладкое',
     'сух':'сухое',
     None: 'unmatched',
     }
     'Сироп':'Сиропы',
     'Арманьяк':'Коньяк',
     'Бренди':'Коньяк',
+    'Ликер':'Ликеры',
     'Граппа':'Водка',
     'Настойка':'Водка',
     'Конфеты':'Сладости',
     'Винный напиток': "Вино",
     "Игристое вино":'Шампанское',
     "Самогон": "Водка",
     }
 COLOR_MERGE_DICT={
     "кр":'красное',
     "red":"красное",
     "бел":"белое",
     "white":"белое",
     None: 'unmatched'
     }

preprocess/preprocess.py CHANGED Viewed

@@ -1,243 +1,224 @@
-import json
-from tqdm import tqdm
-from preprocess.utils.items.attrs import *
-from preprocess.utils.common.extracters import *
-from preprocess.utils.common.brand_matching import *
-from preprocess.utils.common.parallel_brand_matching import *
-from preprocess.utils.common.utils import *
-from preprocess.utils.common.top_inserts import *
-import pandas as pd
-class Preprocessor():
-    def __init__(self, long_types_list, short_types_list, sour_list,
-                 type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict,
-                 country_list, normalized_names_dict):
-        self.long_types_list=long_types_list
-        self.short_types_list=short_types_list
-        self.sour=sour_list
-        self.type_wine=type_wine
-        self.gbs=gbs
-        self.colors_ft=colors_for_trim
-        self.grapes=grapes
-        self.other_words=other_words
-        self.types_n_others=long_types_list+other_words+sour_list+country_list
-        self.types_n_others.remove("Шерри")
-        self.sour_dict=sour_merge_dict
-        self.type_dict=type_merge_dict
-        self.color_merge_dict=color_merge_dict
-        self.country_list = country_list
-        self.normalized_names_dict=normalized_names_dict
-    def preprocess_name(self, name):
-        return name.replace("\n", " ")
-    def process_items(self, df):
-        result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
-    #counter=0
-        for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
-            try:
-                i=json.loads(i)
-                result['id'].append(idf)
-                if 'brand' in i.keys():
-                    result['brand'].append(i['brand'])
-                else: result['brand'].append(None)
-                name = self.preprocess_name(i['name'])
-                result['name'].append(name)
-                result['fullname'].append(name)
-                drink_type=get_type(i, self.long_types_list)
-                if drink_type is None:
-                    drink_type=check_spark(i)
-                if drink_type is None:
-                    drink_type=check_color_and_sour(i)
-                if drink_type is None:
-                    drink_type=check_spark(i, col_name='type_wine')
-                if drink_type is None:
-                    drink_type=check_color_and_sour(i, types=self.sour)
-                if drink_type is None:
-                    drink_type=check_color_and_sour(i, col_name='name')
-                #if 'type' in i.keys():
-                result['type'].append(drink_type)#i['type'])
-                #else: dd['type'].append(None)
-                if 'volume' in i.keys():
-                    result['volume'].append(i['volume'])
-                else:
-                    vol=extract_volume_or_number(i['name'])
-                    result['volume'].append(vol)
-                if 'year' in i.keys():
-                    result['year'].append(i['year'])
-                else:
-                    year=extract_production_year(i['name'])
-                    result['year'].append(year)
-                alco=extract_alcohol_content(i['name'])
-                if 'type_wine' in i.keys():
-                    result['type_wine'].append(i['type_wine'])
-                else: result['type_wine'].append(None)
-                #f alco is not None:
-                result['alco'].append(alco)
-                #else: dd['type_wine'].append(None)
-            except Exception as ex:
-                print(idf, ex)
-        return pd.DataFrame(result)
-    def process_products(self, products):
-        result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
-        for idx, row in tqdm(products.iterrows()):
-            try:
-                result['id'].append(row['id'])
-                result['brand'].append(row['brand'])
-                result['type_wine'].append(row['category'])
-                result['type'].append(row['product_type'])
-                result['name'].append(row['name_long'])
-                result['fullname'].append(row['name_long'])
-                vol=extract_volume_or_number(row['name'])
-                result['volume'].append(vol)
-                #year=extract_production_year(row['name'])
-                year=extract_production_year(str(row['name_postfix']))
-                result['year'].append(year)
-                #rr['year'].append(row['name_postfix'])
-                alco=extract_alcohol_content(row['name'])
-                #f alco is not None:
-                result['alco'].append(alco)
-            except Exception as ex:
-                print(ex)
-        return pd.DataFrame(result)
-    def prcess_text(self, text):
-        #text=''+origin
-        #text=str(split_russian_and_english(text))
-        gb=find_full_word(text, self.gbs)#get_GB(text)
-        if gb is not None:
-            text=text.replace(str(gb), '')
-        alcohol = extract_alcohol_content(text)
-        if alcohol is not None:
-            alco_w_comma=alcohol.replace('.', ',')
-            text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
-        volume_or_number = extract_volume_or_number(text)
-        if volume_or_number is not None:
-            volume_with_comma=str(volume_or_number).replace('.', ',')
-            text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
-            text = re.sub(r'\s+\b[лЛlL].\b', '', text)
-            text = re.sub(r'\s+\b[лЛlL]\b', '', text)
-            test=clean_wine_name(text) #remove_l(text)
-            #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
-        # else:
-        #     volume_or_number=re_extract_volume(text)
-        #     if volume_or_number is not None:
-        #         volume_with_comma=volume_or_number.replace('.', ',')
-        #         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
-        years = extract_years(text)
-        if years is not None:
-            text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
-        production_year = extract_production_year(text)
-        if production_year is not None:
-            text=text.replace(str(production_year), '')
-        color=find_full_word(text, self.colors_ft)
-        if color is not None:
-            text=text.replace(str(color), '')
-        sour=find_full_word(text, self.sour) #get_sour(text)
-        if sour is not None:
-            text=text.replace(str(sour), '')
-        # re_extracted_volume=re_extract_volume(text)
-        # if re_extracted_volume is not None:
-        #     volume_with_comma=re_extracted_volume.replace('.', ',')
-        #     text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
-        # else:
-        #     re_extracted_volume=re_extract_volume(str(volume_or_number))
-        # volume_or_number=re_extracted_volume
-        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
-    def process(self, products, items):
-        print('------*-----Prepare items catalogue-----*-----')
-        items=self.process_items(items.copy())
-        print('-----*-----Prepare products catalogue-----*-----')
-        products=self.process_products(products.copy())
-        items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
-        products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
-        print('-----*-----Split n match-----*-----')
-        splited=split_n_match(products, items)
-        items["brand"] = items["brand"].replace(splited)
-        print('-----*-----Fill brands in items-----*-----')
-        fill_brands_in_dataframe(products['brand'].unique(), items)
-        print('-----*-----Brand matching-----*-----')
-        comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
-        out_prods=list(set(prod_brand_list)-set(comp_list))
-        out_items=list(set(items_brand_list)-set(comp_list))
-        brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
-        items["new_brand"] = items["new_brand"].replace(brand_map_improved)
-        items['type']=items['type'].replace(self.type_dict)
-        print('-----*-----Unwrap brand cats step 1-----*-----')
-        unwrap_b_match=unwrap_brands(products)
-        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
-        products["brand"] = products["brand"].replace(unwrap_b_match)
-        print('-----*-----Unwrap brand cats step 2-----*-----')
-        unwrap_b_match=unwrap_brands(products)
-        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
-        products["brand"] = products["brand"].replace(unwrap_b_match)
-        print('-----*-----Finding brands in names-----*-----')
-        items['new_brand']=items['new_brand'].replace('none', None)
-        i_brands=items[items['new_brand'].isna()]['name'].values
-        p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
-        new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
-        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
-        print('-----*-----Top inserts-----*-----')
-        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
-                                self.grapes, self.other_words)
-        print('-----*-----Adding service categories-----*-----')
-        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
-        merge_types(items, products, type_merge_dict=self.type_dict)
-        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
-        merge_types(products, products, type_merge_dict=self.type_dict)
-        print('-----*-----Name trimming-----*-----')
-        item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
-        #items['name']=items['id'].replace(item_timed_names)
-        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
-        items['gb']=gb
-        items['sour']=sour
-        items['sour']=items['sour'].replace(self.sour_dict)
-        products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
-        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
-        products['gb']=gb
-        products['sour']=sour
-        products['sour']=products['sour'].replace(self.sour_dict)
-        print('-----*-----Replacing product types-----*-----')
-        products['type']=products['type'].replace(self.type_dict)
-        return items, products

+import json
+from tqdm import tqdm
+from preprocess.utils.items.attrs import *
+from preprocess.utils.common.extracters import *
+from preprocess.utils.common.brand_matching import *
+from preprocess.utils.common.parallel_brand_matching import *
+from preprocess.utils.common.utils import *
+from preprocess.utils.common.top_inserts import *
+import pandas as pd
+class Preprocessor():
+    def __init__(self, long_types_list, short_types_list, sour_list,
+                 type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict):
+        self.long_types_list=long_types_list
+        self.short_types_list=short_types_list
+        self.sour=sour_list
+        self.type_wine=type_wine
+        self.gbs=gbs
+        self.colors_ft=colors_for_trim
+        self.grapes=grapes
+        self.other_words=other_words
+        self.types_n_others=long_types_list+other_words
+        self.sour_dict=sour_merge_dict
+        self.type_dict=type_merge_dict
+        self.color_merge_dict=color_merge_dict
+    def process_items(self, df):
+        result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
+    #counter=0
+        for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
+            try:
+                i=json.loads(i)
+                result['id'].append(idf)
+                if 'brand' in i.keys():
+                    result['brand'].append(i['brand'])
+                else: result['brand'].append(None)
+                result['name'].append(i['name'])
+                drink_type=get_type(i, self.long_types_list)
+                if drink_type is None:
+                    drink_type=check_spark(i)
+                if drink_type is None:
+                    drink_type=check_color_and_sour(i)
+                if drink_type is None:
+                    drink_type=check_spark(i, col_name='type_wine')
+                if drink_type is None:
+                    drink_type=check_color_and_sour(i, types=self.sour)
+                #if 'type' in i.keys():
+                result['type'].append(drink_type)#i['type'])
+                #else: dd['type'].append(None)
+                if 'volume' in i.keys():
+                    result['volume'].append(i['volume'])
+                else:
+                    vol=extract_volume_or_number(i['name'])
+                    result['volume'].append(vol)
+                if 'year' in i.keys():
+                    result['year'].append(i['year'])
+                else:
+                    year=extract_production_year(i['name'])
+                    result['year'].append(year)
+                alco=extract_alcohol_content(i['name'])
+                if 'type_wine' in i.keys():
+                    result['type_wine'].append(i['type_wine'])
+                else: result['type_wine'].append(None)
+                #f alco is not None:
+                result['alco'].append(alco)
+                #else: dd['type_wine'].append(None)
+            except Exception as ex:
+                print(idf, ex)
+        return pd.DataFrame(result)
+    def process_products(self, products):
+        result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
+        for idx, row in tqdm(products.iterrows()):
+            try:
+                result['id'].append(row['id'])
+                result['brand'].append(row['brand'])
+                result['type_wine'].append(row['category'])
+                result['type'].append(row['product_type'])
+                result['name'].append(row['name_long'])
+                vol=extract_volume_or_number(row['name'])
+                result['volume'].append(vol)
+                #year=extract_production_year(row['name'])
+                year=extract_production_year(str(row['name_postfix']))
+                result['year'].append(year)
+                #rr['year'].append(row['name_postfix'])
+                alco=extract_alcohol_content(row['name'])
+                #f alco is not None:
+                result['alco'].append(alco)
+            except Exception as ex:
+                print(ex)
+        return pd.DataFrame(result)
+    def prcess_text(self, text):
+        """Strip known attributes out of a product name and return them separately.
+
+        Each step looks for one attribute in the current ``text`` and, when it is
+        found, deletes its string form from ``text`` before the next step runs,
+        so the order of the steps matters.
+
+        :param text: product/item name; must be a ``str`` (``str.replace`` is called on it).
+        :return: tuple ``(name, alcohol, volume_or_number, years, production_year,
+                 gb, color, sour)``. ``name`` is the remaining text passed through
+                 ``remove_quotes``; every other element is whatever its extractor
+                 returned (the code treats ``None`` as "not found").
+        """
+        #text=''+origin
+        #text=str(split_russian_and_english(text))
+        # Whole-word lookup against self.gbs.
+        # NOTE(review): if this is the find_full_word from
+        # preprocess/utils/common/utils.py, it matches case-insensitively but returns
+        # the list entry, while str.replace below is case-sensitive - a hit written in
+        # a different case is reported but not removed from text. Confirm.
+        gb=find_full_word(text, self.gbs)#get_GB(text)
+        if gb is not None:
+            text=text.replace(str(gb), '')
+        alcohol = extract_alcohol_content(text)
+        if alcohol is not None:
+            # Remove both the dot-decimal and the comma-decimal spelling.
+            # (alcohol.replace implies the extractor returns a str.)
+            alco_w_comma=alcohol.replace('.', ',')
+            text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
+        volume_or_number = extract_volume_or_number(text)
+        if volume_or_number is not None:
+            volume_with_comma=str(volume_or_number).replace('.', ',')
+            text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
+            # NOTE(review): the cleaned name is assigned to `test`, which is never
+            # read, so `text` is left unchanged by this line. Possibly a typo for
+            # `text` - confirm the intent before changing it.
+            test=clean_wine_name(text) #remove_l(text)
+            #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
+        # else:
+        #     volume_or_number=re_extract_volume(text)
+        #     if volume_or_number is not None:
+        #         volume_with_comma=volume_or_number.replace('.', ',')
+        #         text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
+        years = extract_years(text)
+        if years is not None:
+            # Along with the value, drop the aging keywords in both languages.
+            text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
+        production_year = extract_production_year(text)
+        if production_year is not None:
+            text=text.replace(str(production_year), '')
+        # Whole-word lookups against self.colors_ft and self.sour; the same
+        # case-sensitivity caveat as for `gb` above applies to the removals.
+        color=find_full_word(text, self.colors_ft)
+        if color is not None:
+            text=text.replace(str(color), '')
+        sour=find_full_word(text, self.sour) #get_sour(text)
+        if sour is not None:
+            text=text.replace(str(sour), '')
+        # re_extracted_volume=re_extract_volume(text)
+        # if re_extracted_volume is not None:
+        #     volume_with_comma=re_extracted_volume.replace('.', ',')
+        #     text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
+        # else:
+        #     re_extracted_volume=re_extract_volume(str(volume_or_number))
+        # volume_or_number=re_extracted_volume
+        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
+    def process(self, products, items):
+        """Run the full preprocessing pipeline on both catalogues.
+
+        Works on copies of the two input frames (``.copy()`` is taken before the
+        first step), then normalises brands, derives ``new_brand``,
+        ``new_type_wine`` and ``new_type`` columns, trims the names and adds
+        ``gb`` / ``sour`` columns. Progress banners are printed between stages.
+
+        :param products: products catalogue DataFrame.
+        :param items: items catalogue DataFrame.
+        :return: ``(items, products)`` - note the order is the reverse of the
+                 parameter order.
+        """
+        print('------*-----Prepare items catalogue-----*-----')
+        items=self.process_items(items.copy())
+        print('-----*-----Prepare products catalogue-----*-----')
+        products=self.process_products(products.copy())
+        # str() turns a missing brand into a literal string (None -> 'none' after
+        # lower()), which is what the 'none' replacement further down relies on.
+        items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
+        products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
+        print('-----*-----Split n match-----*-----')
+        splited=split_n_match(products, items)
+        items["brand"] = items["brand"].replace(splited)
+        print('-----*-----Fill brands in items-----*-----')
+        # Return value is not used; presumably this mutates `items` in place and
+        # creates the 'new_brand' column read below - TODO confirm.
+        fill_brands_in_dataframe(products['brand'].unique(), items)
+        print('-----*-----Brand matching-----*-----')
+        comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
+        # NOTE(review): out_prods is computed but never read again in this method.
+        out_prods=list(set(prod_brand_list)-set(comp_list))
+        out_items=list(set(items_brand_list)-set(comp_list))
+        brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
+        items["new_brand"] = items["new_brand"].replace(brand_map_improved)
+        items['type']=items['type'].replace(self.type_dict)
+        print('-----*-----Unwrap brend cats step 1-----*-----')
+        unwrap_b_match=unwrap_brands(products)
+        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
+        products["brand"] = products["brand"].replace(unwrap_b_match)
+        print('-----*-----Unwrap brend cats step 2-----*-----')
+        # Second pass: the mapping is recomputed from the already-rewritten
+        # products brands and applied again.
+        unwrap_b_match=unwrap_brands(products)
+        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
+        products["brand"] = products["brand"].replace(unwrap_b_match)
+        print('-----*-----Finding brands in names-----*-----')
+        # NOTE(review): Series.replace(scalar, None) has behaved differently across
+        # pandas versions (older releases pad-fill instead of inserting None) -
+        # verify against the pinned pandas version.
+        items['new_brand']=items['new_brand'].replace('none', None)
+        i_brands=items[items['new_brand'].isna()]['name'].values
+        # Only brands longer than 3 characters are searched for inside names.
+        p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
+        new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
+        # Overwrite new_brand only for rows whose name is a key of the result mapping.
+        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
+        print('-----*-----Top inserts-----*-----')
+        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
+                                self.grapes, self.other_words)
+        print('-----*-----Adding service categories-----*-----')
+        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
+        merge_types(items, products)
+        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
+        # Products are classified against their own list of types.
+        merge_types(products, products)
+        print('-----*-----Name trimming-----*-----')
+        item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
+        #items['name']=items['id'].replace(item_timed_names)
+        # Replace each name with its trimmed version, looked up by row id.
+        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
+        items['gb']=gb
+        items['sour']=sour
+        items['sour']=items['sour'].replace(self.sour_dict)
+        products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
+        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
+        products['gb']=gb
+        products['sour']=sour
+        products['sour']=products['sour'].replace(self.sour_dict)
+        print('-----*-----Replacing product types-----*-----')
+        # products['type'] is remapped only here, after merge_types has already
+        # read the original values.
+        products['type']=products['type'].replace(self.type_dict)
+        return items, products

preprocess/utils/common/utils.py CHANGED Viewed

@@ -1,165 +1,138 @@
-import re
-from tqdm import tqdm
-'''def get_delimiter(file_path):
-    with open(file_path, 'r') as f:
-        sample = f.read(1024)  # читаем часть файла для анализа
-        dialect = csv.Sniffer().sniff(sample)
-    return dialect.delimiter'''
-def get_delimiter(file_path):
-    with open(file_path, 'r', encoding="utf-8") as f:
-        ln = f.readline()
-        if ',' in ln:
-            return ','
-        if ';' in ln:
-            return ';'
-        if '\t' in ln:
-            return '\t'
-        if '|' in ln:
-            return '|'
-    raise ValueError(None, "Error parsing CSV file. Cannot detect delimiter")
-def remove_quotes(text):
-    return re.sub(r'["\']', '', text)
-def remove_l(text):
-    result = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
-  # Убираем возможные лишние пробелы, возникающие после удаления
-    result = re.sub(r'\s{2,}', ' ', result).strip()
-    return result
-def clean_wine_name(name):
-    """
-    Удаляет в конце строки отдельно стоящие буквы (однобуквенные слова), не входящие в состав других слов.
-    Например, "токай   л" превратится в "токай".
-    """
-    # Регулярное выражение ищет:
-    # \s+        – один или несколько пробельных символов;
-    # \b         – граница слова;
-    # [A-Za-zА-ЯЁа-яё] – ровно одна буква (латинская или кириллическая);
-    # \b         – граница слова;
-    # \s*$       – любые пробелы до конца строки.
-    return re.sub(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$', '', name)
-def find_full_word(text, word_list):
-    """
-    Ищет первое полное вхождение слова из word_list в строке text.
-    Возвращает найденное слово или None, если совпадение не найдено.
-    """
-    for word in word_list:
-        pattern = r'\b' + re.escape(word) + r'\b'
-        if re.search(pattern, text, re.IGNORECASE):
-            return word
-    return None
-def merge_wine_type(items, colors=None, color_merge_dict=None):
-    result=[]
-    for row in tqdm(items.iterrows()):
-        try:
-            #print("merge_wine_type:" + str(row))
-            if row[1]['type_wine'] is not None:
-                color=find_full_word(row[1]['type_wine'], colors)
-                if color is not None:
-                    result.append(color)
-                else:
-                    color=find_full_word(row[1]['name'], colors)
-                    if color is not None:
-                        result.append(color)
-                    else:
-                        result.append(None)
-            else:
-                color=find_full_word(row[1]['name'], colors)
-                if color is not None:
-                    result.append(color)
-                else:
-                    result.append(None)
-        except Exception as ex:
-            print(ex)
-            result.append(None)
-    items['new_type_wine']=result
-    items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
-def merge_types(items, products, type_merge_dict={}, sub_alco_types=["Бренди", "Шампань", "Шампанское"]):
-    alco_types=[i.strip().lower() for i in products['type'].unique()]
-    alco_types.append('ликёр')
-    result=[]
-    for row in tqdm(items.iterrows()):
-        try:
-            # Parameter 'sub_alco_types' specifies specific alcohol types that usually specified
-            # in product / item name along with "parent" type and in this case this subtype should have priority
-            # For example, "Вино Шампано Ле Брён де Нёвиль", or "Бренди де Херес"
-            if sub_alco_types:
-                type_in_name=find_full_word(row[1]['name'], sub_alco_types)
-                if type_in_name is not None:
-                    result.append(type_in_name)
-                    continue
-            type_in_name=find_full_word(row[1]['name'], alco_types)
-            if type_in_name is not None:
-                result.append(type_in_name)
-                continue
-            if row[1]['type'] is not None:
-                type_in_type=find_full_word(row[1]['type'], alco_types)
-                if type_in_type is not None:
-                    result.append(type_in_type)
-                else:
-                    result.append(row[1]['type'])
-            else:
-                result.append(None)
-        except Exception as ex:
-            print(ex)
-            result.append(None)
-    items['new_type']=result
-    #items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
-    items['new_type'] = items['new_type'].replace(type_merge_dict)
-def trim_name(text, words_to_remove):
-    """
-    Удаляет из текста только те слова, которые полностью совпадают с элементами списка words_to_remove.
-    :param text: Исходная строка.
-    :param words_to_remove: Список слов, которые необходимо удалить.
-    :return: Обновлённая строка с удалёнными словами.
-    """
-    # Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
-    # Используем re.escape, чтобы экранировать спецсимволы в словах.
-    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
-    #print("Pattern: " + pattern)
-    # Заменяем найденные полные слова на пустую строку.
-    new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
-    # Убираем лишние пробелы, возникающие после удаления слов.
-    new_text = re.sub(r'\s+', ' ', new_text).strip()
-    return new_text
-def name_trimmer(df, prcess_text, types_and_others):
-    result={}
-    gbs=[]
-    sours=[]
-    for idx, row in tqdm(df.iterrows()):
-        #print("Name1: " + str(row['name']))
-        text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
-        #print("Name2: " + text)
-        text=trim_name(text, types_and_others).replace(',','').replace('.','')
-        #print("Name3: " + text)
-        result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
-        gbs.append(gb)
-        sours.append(sour)
     return result, gbs, sours

+import re
+from tqdm import tqdm
+import csv
+def get_delimiter(file_path):
+    """Detect the field delimiter of a CSV file.
+
+    Reads the first 1024 characters of the file and lets ``csv.Sniffer`` guess
+    the dialect.
+
+    :param file_path: path of the CSV file to inspect.
+    :return: the ``delimiter`` attribute of the sniffed dialect.
+    :raises csv.Error: propagated from ``csv.Sniffer.sniff`` when no delimiter
+                       can be determined from the sample.
+    """
+    # NOTE(review): no ``encoding`` is passed, so the platform default applies;
+    # the version removed by this diff opened the file with encoding="utf-8".
+    # Confirm the change is intended.
+    with open(file_path, 'r') as f:
+        sample = f.read(1024)  # read only the beginning of the file for analysis
+        dialect = csv.Sniffer().sniff(sample)
+    return dialect.delimiter
+def remove_quotes(text):
+    """Return *text* with every double-quote and single-quote character removed."""
+    return re.sub(r'["\']', '', text)
+def remove_l(text):
+    """Remove the stand-alone letter "л" from *text* and tidy the whitespace.
+
+    The letter is matched case-insensitively and only as a whole word; runs of
+    two or more whitespace characters are then collapsed to a single space and
+    the result is stripped.
+    """
+    result = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
+    # Collapse the extra spaces left behind by the removal
+    result = re.sub(r'\s{2,}', ' ', result).strip()
+    return result
+def clean_wine_name(name):
+    """
+    Remove a stand-alone single letter (a one-letter word that is not part of
+    another word) from the end of the string.
+    For example, "токай   л" becomes "токай".
+    """
+    # The regular expression looks for:
+    # \s+        - one or more whitespace characters;
+    # \b         - a word boundary;
+    # [A-Za-zА-ЯЁа-яё] - exactly one letter (Latin or Cyrillic);
+    # \b         - a word boundary;
+    # \s*$       - any whitespace up to the end of the string.
+    return re.sub(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$', '', name)
+def find_full_word(text, word_list):
+    """
+    Find the first entry of word_list that occurs in text as a whole word.
+
+    Entries are tried in list order and matched case-insensitively between
+    word boundaries. The entry itself (in the spelling used in word_list, not
+    the spelling found in text) is returned, or None when nothing matches.
+    """
+    for word in word_list:
+        # re.escape keeps special characters in the entry from being read as regex syntax
+        pattern = r'\b' + re.escape(word) + r'\b'
+        if re.search(pattern, text, re.IGNORECASE):
+            return word
+    return None
+def merge_wine_type(items, colors=None, color_merge_dict=None):
+    """Add a 'new_type_wine' column to *items* (modified in place, returns None).
+
+    For every row the first entry of *colors* found as a whole word is taken
+    from the 'type_wine' value, falling back to the 'name' value; rows with no
+    hit get None. The finished column is then passed through
+    ``replace(color_merge_dict)``.
+
+    :param items: DataFrame with 'type_wine' and 'name' columns.
+    :param colors: list of words to look for. With the default None every
+                   lookup raises inside the ``try`` and the row gets None.
+    :param color_merge_dict: mapping handed to ``Series.replace``.
+    """
+    result=[]
+    for row in tqdm(items.iterrows()):
+        try:
+            # row is an (index, Series) pair, hence row[1]
+            if row[1]['type_wine'] is not None:
+                color=find_full_word(row[1]['type_wine'], colors)
+                if color is not None:
+                    result.append(color)
+                else:
+                    # nothing in type_wine - fall back to the name
+                    color=find_full_word(row[1]['name'], colors)
+                    if color is not None:
+                        result.append(color)
+                    else:
+                        result.append(None)
+            else:
+                color=find_full_word(row[1]['name'], colors)
+                if color is not None:
+                    result.append(color)
+                else:
+                    result.append(None)
+        except Exception as ex:
+            # Any failure (e.g. a non-string cell) is printed and the row gets
+            # None, which keeps `result` aligned with the rows. Note that an
+            # error on the type_wine lookup skips the name fallback.
+            print(ex)
+            result.append(None)
+    items['new_type_wine']=result
+    items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
+def merge_types(items, products):
+    """Add a 'new_type' column to *items* (modified in place, returns None).
+
+    The vocabulary is the stripped, lower-cased unique values of
+    ``products['type']`` plus 'ликёр'. For every row of *items* the result is,
+    in order of preference: a vocabulary word found in 'name'; a vocabulary
+    word found in 'type'; the raw 'type' value; None. Finally 'ликёр' is
+    mapped to 'ликер' and None to 'unmatched'.
+
+    :param items: DataFrame with 'name' and 'type' columns.
+    :param products: DataFrame whose 'type' column supplies the vocabulary.
+    """
+    # NOTE(review): this runs outside the try below, so a non-string value in
+    # products['type'] (no .strip()) would raise here.
+    alco_types=[i.strip().lower() for i in products['type'].unique()]
+    alco_types.append('ликёр')
+    result=[]
+    for row in tqdm(items.iterrows()):
+        try:
+            # A type mentioned in the name takes priority over the 'type' column
+            type_in_name=find_full_word(row[1]['name'], alco_types)
+            if type_in_name is not None:
+                result.append(type_in_name)
+                continue
+            if row[1]['type'] is not None:
+                type_in_type=find_full_word(row[1]['type'], alco_types)
+                if type_in_type is not None:
+                    result.append(type_in_type)
+                else:
+                    # unknown type: keep the raw value
+                    result.append(row[1]['type'])
+            else:
+                result.append(None)
+        except Exception as ex:
+            # Failures are printed and recorded as None so `result` stays
+            # aligned with the rows.
+            print(ex)
+            result.append(None)
+    items['new_type']=result
+    items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
+def trim_name(text, words_to_remove):
+    """
+    Remove from the text only those words that fully match an element of words_to_remove.
+    :param text: Source string.
+    :param words_to_remove: List of words to remove.
+    :return: The updated string with the words removed.
+    """
+    # Build a regular expression that matches any of the given words as a separate word.
+    # re.escape is used to escape special characters in the words.
+    pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
+    #print(pattern)
+    # Replace the whole words that were found with an empty string (case-insensitive).
+    new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
+    # Collapse the extra whitespace left behind by the removals.
+    new_text = re.sub(r'\s+', ' ', new_text).strip()
+    return new_text
+def name_trimmer(df, prcess_text, types_and_others):
+    """Build trimmed names for every row of *df*.
+
+    Each 'name' is converted to ``str`` and passed to *prcess_text*; the
+    returned text then has the words in *types_and_others* removed, commas and
+    periods deleted, and is lower-cased and stripped.
+
+    :param df: DataFrame with 'name' and 'id' columns.
+    :param prcess_text: callable returning an 8-tuple
+                        ``(text, alcohol, volume_or_number, years,
+                        production_year, gb, color, sour)``.
+    :param types_and_others: words handed to ``trim_name`` for removal.
+    :return: ``(result, gbs, sours)`` - a dict mapping row 'id' to the trimmed
+             name, and two lists with the ``gb`` and ``sour`` values in row
+             order. Rows sharing an 'id' overwrite each other in the dict but
+             still add one entry each to the lists.
+    """
+    result={}
+    gbs=[]
+    sours=[]
+    for idx, row in tqdm(df.iterrows()):
+        # only text, gb and sour are used from the returned tuple
+        text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
+        text=trim_name(text, types_and_others).replace(',','').replace('.','')
+        result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
+        gbs.append(gb)
+        sours.append(sour)
     return result, gbs, sours

preprocess/utils/items/attrs.py CHANGED Viewed

@@ -6,7 +6,7 @@ def check_spark(row, col_name='name', types=['Игристое', 'игр']):
         return None
-def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное', 'крас.', 'бел.']):
     if col_name in row.keys():
         for t in types:
             if t.lower() in row[col_name].lower():

         return None
+def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
     if col_name in row.keys():
         for t in types:
             if t.lower() in row[col_name].lower():

processor/matching.py CHANGED Viewed

@@ -1,302 +1,159 @@
-import json
-from constants.constants import *
-from tqdm import tqdm
-from transliterate import translit, detect_language
-import pandas as pd
-from rapidfuzz import fuzz, process
-import numpy as np
-from math import isnan
-from preprocess.utils.common.utils import *
-def normalize_name(name):
-    """
-    Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
-    приводит к нижнему регистру.
-    """
-    try:
-        if detect_language(name) == 'ru':
-            return translit(name, 'ru', reversed=True).lower()
-    except Exception:
-        pass
-    return name.lower()
-def normalize_name_ex(name):
-    name = normalize_name(name)
-    for nnk in NORMALIZED_NAMES_ALTERNATIVES_DICT:
-        word = find_full_word(name, NORMALIZED_NAMES_ALTERNATIVES_DICT[nnk])
-        if word:
-            name = name.replace(word, nnk)
-    return name
-def compare_names(name1, name2, scorer=fuzz.ratio, score_cutoff=50):
-    print("Scoring: " + name1 + " vs " + name2)
-    words1 = name1.split(" ")
-    words2 = name2.split(" ")
-    score = 0
-    for w1 in words1:
-        for w2 in words2:
-            r = scorer(w1, w2)
-            print("\t " + w1 + " - " + w2 + " ; " + str(r))
-            if r >= score_cutoff:
-                score = score + r
-    print("Score result: " + str(score / (100*len(words1))))
-    return score / (100*len(words1))
-def compare_name_with_list(name, names_list, scorer=fuzz.ratio, score_cutoff=50):
-    result = []
-    index = 0
-    for name2 in names_list:
-        result.append((name2, compare_names(name, name2, scorer, score_cutoff), index))
-        index = index + 1
-    return result
-def prepare_groups_with_ids(items_df):
-    """
-    Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
-    с учетом нормализованного названия.
-    Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
-    :param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
-    :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
-    """
-    items_df = items_df.copy()
-    items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
-    grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
-        lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
-    ).to_dict()
-    #print(grouped)
-    return grouped
-def prepare_groups_by_alternative_keys(items_df):
-    """
-    Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
-    оригинального и нормализованного имени.
-    :param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
-    :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
-    """
-    items_df = items_df.copy()
-    items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
-    #grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume' ''', 'sour''''']).apply(
-    grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume']).apply(
-        lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
-    ).to_dict()
-    return grouped
-def parse_year(year):
-    if not year:
-        return False
-    elif isinstance(year, str):
-        return int(year)
-    elif isinstance(year, (int, float)) and not isnan(year):
-        return int(year)
-    return False
-def order_by_best_year(matched_items, year):
-    best_matched_items = []
-    max_year_matched_items = []
-    other_matched_items = []
-    max_year = 0
-    year = parse_year(year)
-    for mi in matched_items:
-        # Если в оригинале указан год, то ищем точное совпадение, иначе сортируем по году в обратном порядке
-        try:
-            if isinstance(mi['year'], (int, float, str)):
-                mi_year = int(mi['year'])
-            else:
-                mi_year = False
-            if year and mi_year and (mi_year == year):
-                best_matched_items.append(mi['item_id'])
-            elif mi_year:
-                if mi_year > max_year:
-                    max_year_matched_items = [mi]
-                    max_year = mi_year
-                elif mi_year == max_year:
-                    max_year_matched_items.append(mi)
-                else:
-                    other_matched_items.append(mi['item_id'])
-            else:
-                other_matched_items.append(mi['item_id'])
-        except Exception as ex:
-            print("Error processing best year for item " + str(mi["item_id"]) + " value " + str(mi['year']) + ": " + str(ex))
-    if len(best_matched_items) > 0:
-        for m in matched_items:
-            if not m['item_id'] in best_matched_items:
-                m['score'] = m['score']*0.8
-    return matched_items
-def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
-    """
-    Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
-    нормализованные группы.
-    Производится два прохода:
-    - Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
-    - Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
-      исключая итемы с исходным брендом.
-    Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
-    :param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
-    :param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
-    :param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
-    :param name_threshold: Порог сходства для fuzzy matching.
-    :return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
-    """
-    results = []
-    no_match_products = []  # Список для хранения продуктов без совпадения в исходной группе
-    if name_threshold < 50:
-        name_threshold = 50
-    # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
-    for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
-        product_brand = product['brand']
-        product_type = product['type']
-        product_name = product['name']
-        product_volume = product['volume']
-        product_type_wine = product['new_type_wine']
-        product_sour = product['sour']
-        key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
-        #print("Name: " + product_name)
-        #print("Key: " + str(key))
-        #print("Groups: " + str(items_groups))
-        items_data = items_groups.get(key, [])
-        if items_data:
-            # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
-            #print("Data: " + str(items_data))
-            items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
-        else:
-            #print("Data: No")
-            items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
-        norm_product_name = normalize_name_ex(product_name)
-        matches = process.extract(
-            norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
-        )
-        matched_items = [
-            {
-                'item_id': items_ids[idx_candidate],
-                'brand': product_brand,
-                'item_name': items_full_names[idx_candidate],
-                #'item_name': items_names[idx_candidate],
-                'score': score,
-                'volume': items_volumes[idx_candidate],
-                'color': item_type_wine[idx_candidate],
-                'sour': items_sour[idx_candidate],
-                'year': items_year[idx_candidate],
-            }
-            for match, score, idx_candidate in matches
-        ]
-        if matched_items:
-            matched_items = order_by_best_year(matched_items, product['year'])
-            matched_items = matched_items[:5]
-        else:
-            no_match_products.append((idx, product))
-        results.append({
-            'product_id': product['id'],
-            #"matched_top_id": top_matched_id,
-            'matched_items': matched_items,
-            #"alternative_top_id": "",
-            #'alternative': []  # Заполняется во втором проходе
-        })
-    if include_alternatives:
-        # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
-        groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
-        # Второй проход: для продуктов без совпадений ищем по альтернативным группам
-        for idx, product in tqdm(no_match_products):
-            #print("Product: " + str(product))
-            product_brand = product['brand']
-            product_type_wine = product['new_type_wine']
-            product_type = product['new_type']
-            product_volume = product['volume']
-            product_name = product['name']
-            product_sour = product['sour']
-            #alt_key = (product_type_wine, product_type, product_volume, product_sour)
-            alt_key = (product_type_wine, product_type, product_volume)
-            #print("AltName: " + str(product))
-            #print("AltKey: " + str(alt_key))
-            #print("AltGroups: " + str(groups_by_alternative_keys))
-            #print("AltGroups Keys: " + str(groups_by_alternative_keys.keys()))
-            type_items = groups_by_alternative_keys.get(alt_key, [])
-            #print("AltGroups2: " + str(type_items))
-            # Фильтруем, исключая итемы с исходным брендом
-            filtered_items = [item for item in type_items if item[1] != product_brand]
-            if filtered_items:
-                #print("AltData: " + str(filtered_items))
-                alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
-            else:
-                #print("AltData: No")
-                alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
-            norm_product_name = normalize_name_ex(product_name)
-            #print("norm_product_name: " + str(norm_product_name))
-            #print("alt_norm_names: " + str(alt_norm_names))
-            alt_matches = process.extract(
-                norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=50
-            )
-            #alt_matches = compare_name_with_list(
-            #    norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=70
-            #)
-            #print("alt_matches: " + str(alt_matches))
-            alt_matched_items = [
-                {
-                    'item_id': alt_ids[idx_candidate],
-                    'brand': alt_brands[idx_candidate],
-                    #'item_name': alt_names[idx_candidate],
-                    'item_name': alt_full_names[idx_candidate],
-                    'score': score / 2,
-                    'volume': alt_volumes[idx_candidate],
-                    'color': alt_type_wine[idx_candidate],
-                    'sour': alt_sour[idx_candidate],
-                    'year': alt_year[idx_candidate],
-                }
-                for match, score, idx_candidate in alt_matches
-            ]
-            alt_matched_items = order_by_best_year(alt_matched_items, product['year'])
-            alt_matched_items = alt_matched_items[:5]
-            results[idx]['matched_items'].extend(alt_matched_items)
-    for r in results:
-        r['matched_items'] = json.dumps(r['matched_items'], ensure_ascii=False)
-    #if alt_matched_items:
-        #    results[idx]['alternative_top_id'] = alt_matched_items[0]["item_id"]
-        #results[idx]['alternative'] = alt_matched_items
-    results_df = pd.DataFrame(results)
-    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
     return merged_df

+from tqdm import tqdm
+from transliterate import translit, detect_language
+import pandas as pd
+from rapidfuzz import fuzz, process
+def normalize_name(name):
+    """
+    Normalize a string: if Russian is detected, transliterate it to Latin
+    script; in every case return it lower-cased.
+    """
+    try:
+        if detect_language(name) == 'ru':
+            return translit(name, 'ru', reversed=True).lower()
+    except Exception:
+        # Best effort: any failure in detection or transliteration falls
+        # through to plain lower-casing below.
+        pass
+    return name.lower()
+def prepare_groups_with_ids(items_df):
+    """
+    Pre-group the items by (new_brand, type, volume, new_type_wine, sour),
+    carrying a normalized name along.
+    A 'norm_name' column is added so that each name is normalized only once, up front.
+    :param items_df: DataFrame with columns 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour', 'year'.
+    :return: Dict {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour, year)]}.
+    """
+    # Work on a copy so the caller's frame does not gain the helper column
+    items_df = items_df.copy()
+    items_df['norm_name'] = items_df['name'].apply(normalize_name)
+    # NOTE(review): groupby drops rows with NaN in a key column by default -
+    # confirm that such items are meant to be left out of the groups.
+    grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
+        lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
+    ).to_dict()
+    return grouped
+def prepare_groups_by_alternative_keys(items_df):
+    """
+    Group the items by (new_type_wine, new_type, volume, sour), keeping the id,
+    new_brand, original name and normalized name of every item.
+    :param items_df: DataFrame with columns 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour', 'year'.
+    :return: Dict {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour, year)]}.
+    """
+    # Work on a copy so the caller's frame does not gain the helper column
+    items_df = items_df.copy()
+    items_df['norm_name'] = items_df['name'].apply(normalize_name)
+    # NOTE(review): groupby drops rows with NaN in a key column by default -
+    # confirm that such items are meant to be left out of the groups.
+    grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
+        lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
+    ).to_dict()
+    return grouped
+def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
+    """
+    Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
+    нормализованные группы.
+    Производится два прохода:
+    - Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
+    - Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
+      исключая итемы с исходным брендом.
+    Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
+    :param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
+    :param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
+    :param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
+    :param name_threshold: Порог сходства для fuzzy matching.
+    :return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
+    """
+    results = []
+    no_match_products = []  # Список для хранения продуктов без совпадения в исходной группе
+    # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
+    for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
+        product_brand = product['brand']
+        product_type = product['type']
+        product_name = product['name']
+        product_volume = product['volume']
+        product_type_wine = product['new_type_wine']
+        product_sour = product['sour']
+        key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
+        items_data = items_groups.get(key, [])
+        if items_data:
+            # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
+            items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
+        else:
+            items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
+        norm_product_name = normalize_name(product_name)
+        matches = process.extract(
+            norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
+        )
+        matched_items = [
+            {
+                'item_id': items_ids[idx_candidate],
+                'item_name': items_names[idx_candidate],
+                'score': score,
+                'volume': items_volumes[idx_candidate],
+                'color': item_type_wine[idx_candidate],
+                'sour': items_sour[idx_candidate],
+                'year': items_year[idx_candidate],
+            }
+            for match, score, idx_candidate in matches
+        ]
+        if not matched_items:
+            no_match_products.append((idx, product))
+        results.append({
+            'product_id': product['id'],
+            'matched_items': matched_items,
+            'alternative': []  # Заполняется во втором проход��
+        })
+    # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
+    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
+    # Второй проход: для продуктов без совпадений ищем по альтернативным группам
+    for idx, product in tqdm(no_match_products):
+        product_brand = product['brand']
+        product_type_wine = product['new_type_wine']
+        product_type = product['new_type']
+        product_volume = product['volume']
+        product_name = product['name']
+        product_sour = product['sour']
+        alt_key = (product_type_wine, product_type, product_volume, product_sour)
+        type_items = groups_by_alternative_keys.get(alt_key, [])
+        # Фильтруем, исключая итемы с исходным брендом
+        filtered_items = [item for item in type_items if item[1] != product_brand]
+        if filtered_items:
+            alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
+        else:
+            alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
+        norm_product_name = normalize_name(product_name)
+        alt_matches = process.extract(
+            norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
+        )
+        alt_matched_items = [
+            {
+                'item_id': alt_ids[idx_candidate],
+                'item_name': alt_names[idx_candidate],
+                'score': score,
+                'volume': alt_volumes[idx_candidate],
+                'color': alt_type_wine[idx_candidate],
+                'sour': alt_sour[idx_candidate],
+                'year': alt_year[idx_candidate],
+            }
+            for match, score, idx_candidate in alt_matches
+        ]
+        results[idx]['alternative'] = alt_matched_items
+    results_df = pd.DataFrame(results)
+    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
     return merged_df

processor/processor.py CHANGED Viewed

@@ -1,30 +1,32 @@
-from preprocess.preprocess import Preprocessor
-from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
-class Processor():
-    def __init__(self, long_types_list, short_types_list, sour_list,
-                 type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict,
-                 country_list, normalized_names_dict):
-        self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
-                 type_wine, gbs, colors_for_trim, grapes, other_words,
-                 sour_merge_dict, type_merge_dict, color_merge_dict,
-                 country_list, normalized_names_dict)
-    def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
-        items, products=self.preprocessor.process(products, items)
-        print('-----*-----Matching-----*-----')
-        if is_items_first:
-            products['new_brand']=products['brand']
-            items['brand']=items['new_brand']
-            products_groups = prepare_groups_with_ids(products)
-            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th, include_alternatives=include_alternatives)
-        else:
-            items_groups = prepare_groups_with_ids(items)
-            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th, include_alternatives=include_alternatives)
-        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products

+from preprocess.preprocess import Preprocessor
+from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
+class Processor():
+    """Runs preprocessing and then fuzzy matching between products and items."""
+    def __init__(self, long_types_list, short_types_list, sour_list,
+                 type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict):
+        """Build the Preprocessor; every argument is forwarded to it unchanged."""
+        self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
+                 type_wine, gbs, colors_for_trim, grapes, other_words,
+                 sour_merge_dict, type_merge_dict, color_merge_dict)
+    def process(self, products, items, is_items_first=False, th=65):
+        """
+        Preprocess both tables and match one against the other.
+
+        :param products: products table handed to the preprocessor.
+        :param items: items table handed to the preprocessor.
+        :param is_items_first: when True the roles are swapped: items are the rows being
+            matched and products are the candidates.
+        :param th: name-similarity threshold passed on as ``name_threshold``.
+        :return: tuple (match result without the 'type', 'type_wine', 'alco', 'gb' columns,
+            preprocessed items, preprocessed products).
+        """
+        # Note the order: the preprocessor returns (items, products).
+        items, products=self.preprocessor.process(products, items)
+        print('-----*-----Matching-----*-----')
+        if is_items_first:
+            # Mirror the brand columns so each table exposes the column name the matcher
+            # reads for the role it now plays.
+            products['new_brand']=products['brand']
+            items['brand']=items['new_brand']
+            products_groups = prepare_groups_with_ids(products)
+            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
+        else:
+            items_groups = prepare_groups_with_ids(items)
+            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
+        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products #'year',

requirements.txt CHANGED Viewed

@@ -1,7 +1,7 @@
-python-Levenshtein
-transliterate
-rapidfuzz
-pyahocorasick
-unidecode
-pqdm
 tqdm

+python-Levenshtein
+transliterate
+rapidfuzz
+pyahocorasick
+unidecode
+pqdm
 tqdm

search/matching_judge.py DELETED Viewed

@@ -1,156 +0,0 @@
-import json
-import pandas as pd
-import ast
-import csv
-def verify_csv(csv_file):
-    lnnum = 1
-    w = open(csv_file + ".1", "w", encoding="utf-8")
-    with open(csv_file, "r", encoding="utf-8") as f:
-        while True:
-            ln = f.readline()
-            if lnnum == 1:
-                w.write(ln)
-            if len(ln) == 0:
-                break
-            if ln.count('"') % 2 == 1:
-                #raise Exception("Incorrect quotes at line " + str(lnnum) + " in file [" + csv_file + "]")
-                w.write(ln)
-            lnnum = lnnum + 1
-    w.close()
-    return True
-def compare_matching_with_manual(products_file, items_file, match_result_file, manual_result_file):
-    '''with open(products_file, mode="r", encoding="utf-8", newline='') as csvfile:
-        csvreader = csv.reader(csvfile, dialect="excel-tab")
-        for row in csvreader:
-            print(', '.join(row))'''
-    if not verify_csv(products_file):
-        raise Exception
-    products_df = pd.read_csv(products_file, sep="\t")
-    items_df = pd.read_csv(items_file, sep=";")
-    match_df = pd.read_csv(match_result_file, sep="\t")
-    manual_df = pd.read_csv(manual_result_file, sep="\t")
-    results = {
-        "item_count" : int(items_df.count()[0]),
-        "product_count" : int(products_df.count()[0]),
-        "match_count" : int(match_df.count()[0]),
-        "manual_count" : int(manual_df.count()[0]),
-    }
-    items_to_manual = {}
-    for index, row in items_df.iterrows():
-        x = manual_df[manual_df['item_id'] == row["id"]]['state']
-        if (len(x) > 0) and (x.values[0] == 1):
-            p = products_df[products_df["id"] == manual_df.iloc[int(x.index[0])]["product_id"]]
-            items_to_manual[row["id"]] = int(manual_df.iloc[int(x.index[0])]["product_id"])
-    '''items_to_auto = {}
-    for index, row in match_df.iterrows():
-        if row["matched_top_id"] > 0:
-            p = products_df[products_df["id"] == int(row["matched_top_id"])]
-            items_to_auto[row["id"]] = int(row["matched_top_id"])
-    results["items_to_manual_count"] = len(items_to_manual)
-    results["items_to_auto_count"] = len(items_to_auto)'''
-    result_list = []
-    for index, row in items_df.iterrows():
-        result_data = {}
-        result_data["id"] = row["id"]
-        result_data["match_side"] = "no_match"
-        result_data["auto_score"] = ""
-        result_data["manual_score"] = ""
-        result_data["discuss"] = ""
-        auto_match = match_df[match_df['id'] == row["id"]]["matched_items"].values[0]
-        '''if len(auto_match) > 2:
-            if auto_match.find("\\'") >= 0:
-                auto_match = auto_match
-            auto_match = auto_match.replace("\\'", "$$$$$$").replace(": None}", ": \"\"}").replace("'", '"').replace("$$$$$$", "\\'")
-        auto_match = json.loads(auto_match)'''
-        manual_match = None
-        manual = manual_df[manual_df['item_id'] == row["id"]]['state']
-        if (len(manual) > 0) and (manual.values[0] == 1):
-            p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
-            if len(p.values) > 0:
-                manual_match = p
-            else:
-                print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")
-        if (auto_match is not None) and len(auto_match) > 2 and (manual_match is not None):
-            result_data["match_side"] = "both"
-            manual_id = int(manual_match["id"].values[0])
-            auto_match_ns = auto_match.replace(" ", "")
-            i1 = auto_match_ns.find("'item_id':")
-            i2 = auto_match_ns.find("'item_id':" + str(manual_id) + ",")
-            if i1 == i2:
-                result_data["auto_score"] = 1
-                result_data["manual_score"] = 1
-            elif i2 >= 0:
-                result_data["auto_score"] = 0.5
-                result_data["manual_score"] = 0.5
-        elif (auto_match is not None) and len(auto_match) > 2:
-            result_data["match_side"] = "only_auto"
-        elif manual_match is not None:
-            result_data["match_side"] = "only_manual"
-        result_data["discuss"] = ""
-        result_data["item"] = row["attrs"]
-        result_data["auto_match"] = auto_match
-        manual_string = ""
-        if (manual_match is not None):
-            manual_string = '{' + \
-                        '"id": ' + str(manual_match["id"].values[0]) + ',' + \
-                        '"brand": "' + str(manual_match["brand"].values[0]) + '",' + \
-                        '"name": "' + str(manual_match["name_long"].values[0]) + '",' + \
-                        '"volume": ' + str(manual_match["volume"].values[0]) + '",' + \
-                        '"year": ' + str(manual_match["year"].values[0]) + '"}'
-        result_data["manual_match"] = manual_string
-        result_list.append(result_data)
-    results_df = pd.DataFrame(result_list)
-    results_df.to_csv("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\mjudge_new.csv")
-    '''common_match = {}
-    full_match = {}
-    for a_match in items_to_auto:
-        if a_match in items_to_manual:
-            common_match[a_match] = [items_to_auto[a_match], items_to_manual[a_match]]
-            if items_to_auto[a_match] == items_to_manual[a_match]:
-                full_match[a_match] = items_to_auto[a_match]'''
-    #results["items_to_manual"] = len(items_to_manual)
-    #results["items_to_auto"] = len(items_to_auto
-    print(results)
-    return results

search/search_by_id.py CHANGED Viewed

@@ -1,53 +1,24 @@
-import json
-import pandas as pd
-import ast
-class Searcher():
-    def __init__(self):
-        self.df = None
-    def set_df(self, df):
-        self.df = df
-        try:
-            self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
-            self.df['alternative'] = self.df['alternative'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
-        except Exception as e:
-            print(e)
-    def search(self, query):
-        data = json.loads(json.dumps(self.df[self.df['id']==query]['matched_items'].values[0]))
-        return pd.DataFrame(data)
-    def search(self, resultfn, query):
-        is_alternative_items = False
-        df_matched_items = pd.DataFrame()
-        matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
-        self.set_df(matching_result)
-        items = self.df[self.df['id']==query]
-        matched_items = items['matched_items']
-        if (len(matched_items) != 0) and (len(matched_items.values[0])):
-            data = json.loads(json.dumps(matched_items.values[0]))
-            df_matched_items = pd.DataFrame(data)
-            is_alternative_items = False
-        else:
-            alter_items = items['alternative']
-            if (len(alter_items) != 0) and (len(alter_items.values[0])):
-                data = json.loads(json.dumps(alter_items.values[0]))
-                df_matched_items = pd.DataFrame(data)
-                is_alternative_items = True
-        return (df_matched_items, is_alternative_items)
-    def search_in_uploaded_file(self, path, query):
-        matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
-        self.set_df(matching_result)
-        result=self.search(query)
         return result

+import json
+import pandas as pd
+import ast
+class Searcher():
+    """Looks up the matched items of a single id in a matching-result DataFrame."""
+    def __init__(self):
+        # Result table; stays None until set_df() is called.
+        self.df = None
+    def set_df(self, df):
+        """
+        Store the result table and try to parse its 'matched_items' column.
+
+        Non-null cells are parsed with ast.literal_eval. Parsing is best-effort: any
+        exception is printed and the table is kept as it was before the failed assignment.
+        """
+        self.df = df
+        try:
+            self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
+        except Exception as e:
+            print(e)
+        #self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
+    def search(self, query):
+        """
+        Return the matched items of the first row whose 'id' equals ``query`` as a DataFrame.
+
+        NOTE(review): ``.values[0]`` raises IndexError when no row has that id.
+        """
+        # The dumps/loads round trip turns the cell value into plain JSON-compatible objects.
+        data = json.loads(json.dumps(self.df[self.df['id']==query]['matched_items'].values[0]))
+        return pd.DataFrame(data)
+    def search_in_uploaded_file(self, path, query):
+        """Load a tab-separated result file (skipping malformed lines) and search it for ``query``."""
+        matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
+        self.set_df(matching_result)
+        result=self.search(query)
         return result

tmp/prod.csv CHANGED Viewed

	@@ -1 +1 @@
1	- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume


1	+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume

tmp/service/prod.csv CHANGED Viewed

	@@ -1 +1 @@
1	- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume


1	+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume

tmp/utils.py CHANGED Viewed

@@ -1,48 +1,37 @@
-import pandas as pd
-from preprocess.utils.common.utils import get_delimiter
-import shutil
-import os
-def update_products_csv(new_csv_path, prods_file, overwrite_existing):
-    if os.path.isfile(prods_file) and not overwrite_existing:
-        main_sep=get_delimiter(prods_file)
-        main_csv=pd.read_csv(prods_file, sep=main_sep, on_bad_lines="warn")
-        new_sep=get_delimiter(new_csv_path)
-        new_csv=pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")
-        if 'attrs' in new_csv.columns.values:
-            raise Exception("Uploaded Products CSV does not seem to be valid")
-        result=pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
-        result.to_csv(prods_file, sep=main_sep, index=False)
-    else:
-        new_sep=get_delimiter(new_csv_path)
-        new_csv=pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")
-        new_csv.to_csv(prods_file, sep=new_sep, index=False)
-    return prods_file
-'''def is_csv_exist(path):
-    file_list=glob(path+'/*.csv')
-    if len(file_list)>0:
-        return file_list[0]
-    else:
-        None
-def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
-    main_path=is_csv_exist(main_dir)
-    if main_path==None:
-        new_path = shutil.move(new_path, main_dir)
-        return new_path
-    else:
-        update_products_csv(main_path, new_path)
-        return main_path
-def remover(data_path):
-    #path=is_csv_exist('/home/user/app/tmp/prod.csv')
-    #if path!=None:
-    os.remove(os.getcwd()+'/tmp/prod.csv')
-    shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''

+import pandas as pd
+from preprocess.utils.common.utils import get_delimiter
+from glob import glob
+import shutil
+import os
+def update_products_csv(new_csv_path, main_csv_path='/home/user/app/tmp/prod.csv'):
+    """
+    Merge the rows of an uploaded products CSV into the main products CSV, in place.
+
+    :param new_csv_path: path of the uploaded CSV.
+    :param main_csv_path: path of the main CSV; it is read and then overwritten.
+    """
+    # Each file is read with its own detected delimiter.
+    main_sep=get_delimiter(main_csv_path)
+    main_csv=pd.read_csv(main_csv_path, sep=main_sep)
+    new_sep=get_delimiter(new_csv_path)
+    new_csv=pd.read_csv(new_csv_path, sep=new_sep)
+    # New rows come last in the concat, so keep='last' lets them replace main rows with the same id.
+    result=pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
+    # The merged table is written back using the main file's delimiter.
+    result.to_csv(main_csv_path, sep=main_sep, index=False)
+def is_csv_exist(path):
+    """Return the first '*.csv' path that glob finds directly under ``path``, or None if there is none."""
+    file_list=glob(path+'/*.csv')
+    if len(file_list)>0:
+        return file_list[0]
+    else:
+        # Bare expression, not a return statement; the function returns None implicitly here.
+        None
+def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
+    main_path=is_csv_exist(main_dir)
+    if main_path==None:
+        new_path = shutil.move(new_path, main_dir)
+        return new_path
+    else:
+        update_products_csv(main_path, new_path)
+        return main_path
+def remover():
+    """Delete the stored products CSV and replace it with the copy kept under tmp/service."""
+    #path=is_csv_exist('/home/user/app/tmp/prod.csv')
+    #if path!=None:
+    # NOTE(review): the file removed is relative to the current working directory, while the
+    # copy below uses absolute /home/user/app paths - these only agree when cwd is /home/user/app.
+    os.remove(os.getcwd()+'/tmp/prod.csv')
+    shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')

ui/gradio_ui.py CHANGED Viewed

@@ -1,178 +1,121 @@
-import gradio as gr
-import pandas as pd
-from preprocess.utils.common.utils import get_delimiter
-from tmp.utils import update_products_csv #remover,
-import os
-import csv
-import datetime, time
-class GradioUI():
-    def __init__(self, processor, searcher, data_path):
-        self.processor=processor
-        self.searcher=searcher
-        self.data_path = data_path
-        gr.set_static_paths(paths=[os.path.join(self.get_data_dir(), "products")])
-    def get_data_dir(self):
-        return self.data_path
-    def get_products_dir(self):
-        return os.path.join(self.get_data_dir(), "products")
-    def get_items_dir(self):
-        return os.path.join(self.get_data_dir(), "items")
-    def get_results_dir(self):
-        return os.path.join(self.get_data_dir(), "results")
-    def get_products_file_date(self):
-        fullfn = os.path.join(self.data_path, "products", "products.csv")
-        if not os.path.isfile(fullfn):
-            return "Файл Products не найден"
-        stinfo = os.stat(fullfn)
-        return time.ctime(stinfo.st_mtime)
-    def upload_products_file(self, prods_file, overwrite_existing):
-        try:
-            if not os.path.exists(self.get_products_dir()):
-                os.makedirs(self.get_products_dir())
-            fullfn = os.path.join(self.get_products_dir(), "products.csv")
-            if prods_file != None:
-                update_products_csv(prods_file, fullfn, overwrite_existing)
-            gr.Info("Файл Products успешно загружен")
-        except Exception as ex:
-            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)
-    def process_items(self, items_file, is_items_first, threshold, include_alternatives): #, q_id):
-        try:
-            prods_file = os.path.join(self.get_products_dir(), "products.csv")
-            if not os.path.isfile(prods_file):
-                raise Exception("Файл Products не найден")
-            if items_file != None:
-                items_delimiter=get_delimiter(items_file)
-                print('items delimiter: '+items_delimiter)
-                #row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
-                row_items = pd.read_csv(items_file, sep=items_delimiter)
-                if not 'attrs' in row_items.columns.values:
-                    raise Exception("Uploaded Items CSV does not seem to be valid")
-                products_delimiter=get_delimiter(prods_file)
-                print('products delimiter: '+products_delimiter)
-                #row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
-                row_products = pd.read_csv(prods_file, sep=products_delimiter)
-                # if q_id in row_products['id'].unique():
-                #     row_products=row_products[row_products['id']==q_id]
-                #print("product id: " + str(q_id))
-                df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold, include_alternatives)
-                self.searcher.set_df(df.copy())
-                #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
-                #    output_csv = tmp.name
-                results_path = self.get_results_dir()
-                if not os.path.exists(results_path):
-                    os.makedirs(results_path)
-                output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
-                output_csv = os.path.join(results_path, output_csv)
-                df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE, escapechar="@")
-                return output_csv
-        except Exception as ex:
-            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
-    def on_page_load(self, r: gr.Request):
-        m_time = self.get_products_file_date()
-        return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]
-    def run_ui(self):
-        with gr.Blocks() as demo:
-            tabs = gr.Tabs()
-            with tabs:
-                #     with gr.Row():
-                #         file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
-                #         process_button = gr.Button("Обновить")
-                # Вкладка для обработки CSV файлов
-                with gr.TabItem("Обработка каталога поставщика"):
-                    gr.Markdown("## Обработка каталога поставщика")
-                    m_time = self.get_products_file_date()
-                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
-                    with gr.Row():
-                        #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
-                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
-                    #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
-                    with gr.Row():
-                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=True)
-                        toggle_alternative = gr.Checkbox(label="Включать в результаты альтернативные варианты", value=True)
-                    threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
-                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
-                    output_file = gr.File(label="Скачать результат (CSV)")
-                    process_button.click(
-                        fn=self.process_items,
-                        inputs=[file_items, toggle_input, threshold_input, toggle_alternative], #, search_number],
-                        outputs=output_file
-                    )
-                with gr.TabItem("Загрузка файла Products"):
-                    with gr.Row():
-                        prod_file_info1 = gr.Markdown("## Загрузка файла Products")
-                        product_download_button = gr.DownloadButton(label="Скачать", value=os.path.join(self.get_products_dir(), "products.csv"), visible=True)
-                    with gr.Row():
-                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
-                    with gr.Row():
-                        toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
-                    upload_button = gr.Button("Загрузить файл")
-                    upload_button.click(
-                        fn=self.upload_products_file,
-                        inputs=[file_input1, toggle_input],
-                        #outputs=output_file
-                    )
-                # Вкладка для поиска
-                with gr.TabItem("Поиск в обработанном csv"):
-                    gr.Markdown("## Поиск")
-                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
-                    search_button = gr.Button("Найти")
-                    search_table = gr.Dataframe(label="Результаты поиска")
-                    search_button.click(
-                        fn=self.searcher.search,
-                        inputs=[search_number],
-                        outputs=search_table
-                    )
-                with gr.TabItem("Загрузка результат и поиск в нем"):
-                    gr.Markdown("## Поиск")
-                    with gr.Row():
-                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
-                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
-                    search_button = gr.Button("Найти")
-                    search_table = gr.Dataframe(label="Результаты поиска")
-                    search_button.click(
-                        fn=self.searcher.search_in_uploaded_file,
-                        inputs=[input_path, search_number],
-                        outputs=search_table
-                    )
-                #with gr.TabItem("Удалить сохраненные продукты"):
-                #    del_button = gr.Button("Удалить")
-                #    process_button.click(fn=remover)
-            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
             demo.launch()

+import gradio as gr
+import pandas as pd
+import tempfile
+from preprocess.utils.common.utils import get_delimiter
+from tmp.utils import uploader, remover, update_products_csv
+from glob import glob
+import os
+class GradioUI():
+    def __init__(self, processor, searcher=None):
+        self.processor=processor
+        self.searcher=searcher
+    def process_files(self, file1, file2, is_items_first, threshold): #, q_id):
+        try:
+            print(file1)
+            print()
+            print(os.getcwd())
+            print(os.path.dirname(os.path.abspath(__file__)))
+            print()
+            if file1!=None:
+                #file1=uploader(file1)
+                update_products_csv(file1)
+            #else:
+                #file1=glob('./home/user/app/tmp/*.csv')[0]
+            file1=os.getcwd()+'/tmp/prod.csv'
+            #print()
+            #print(file1)
+            #print()
+            if file2!=None:
+                items_delimiter=get_delimiter(file2)
+                print('items delimiter: '+items_delimiter)
+                row_items=pd.read_csv(file2, sep=items_delimiter, on_bad_lines='skip')
+                products_delimiter=get_delimiter(file1)
+                print('products delimiter: '+products_delimiter)
+                row_products=pd.read_csv(file1, sep=products_delimiter, on_bad_lines='skip')
+                # if q_id in row_products['id'].unique():
+                #     row_products=row_products[row_products['id']==q_id]
+                #print("product id: " + str(q_id))
+                df, items, products= self.processor.process(row_products, row_items, is_items_first, threshold)
+                # Создаём временный CSV файл для сохранения результата
+                self.searcher.set_df(df.copy())
+                with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
+                    output_csv = tmp.name
+                df.to_csv(output_csv, sep='\t', index=False)
+                return output_csv
+        except Exception as ex:
+            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
+            return None
+    def run_ui(self):
+        with gr.Blocks() as demo:
+            with gr.Tabs():
+                #     with gr.Row():
+                #         file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
+                #         process_button = gr.Button("Обновить")
+                # Вкладка для обработки CSV файлов
+                with gr.TabItem("Обработка CSV файлов"):
+                    gr.Markdown("## Обработка CSV файлов")
+                    with gr.Row():
+                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
+                        file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
+                    #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
+                    with gr.Row():
+                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
+                    threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
+                    process_button = gr.Button("Обработать файлы")
+                    output_file = gr.File(label="Скачать результат (CSV)")
+                    process_button.click(
+                        fn=self.process_files,
+                        inputs=[file_input1, file_input2, toggle_input, threshold_input], #, search_number],
+                        outputs=output_file
+                    )
+                # Вкладка для поиска
+                with gr.TabItem("Поиск в обработанном csv"):
+                    gr.Markdown("## Поиск")
+                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
+                    search_button = gr.Button("Найти")
+                    search_table = gr.Dataframe(label="Результаты поиска")
+                    search_button.click(
+                        fn=self.searcher.search,
+                        inputs=[search_number],
+                        outputs=search_table
+                    )
+                with gr.TabItem("Загрузка результат и поиск в нем"):
+                    gr.Markdown("## Поиск")
+                    with gr.Row():
+                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
+                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
+                    search_button = gr.Button("Найти")
+                    search_table = gr.Dataframe(label="Результаты поиска")
+                    search_button.click(
+                        fn=self.searcher.search_in_uploaded_file,
+                        inputs=[input_path, search_number],
+                        outputs=search_table
+                    )
+                with gr.TabItem("Удалить сохраненные продукты"):
+                    del_button = gr.Button("Удалить")
+                    process_button.click(fn=remover)
             demo.launch()