Gainward777 committed on
Commit
36a20b5
·
verified ·
1 Parent(s): 4b066a6

Upload 22 files

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pyc
2
+ .idea/*
3
+ _data/*
api.py CHANGED
@@ -10,6 +10,13 @@ import uvicorn
10
  from pydantic import BaseModel
11
  import pandas as pd
12
  from tmp.utils import update_products_csv
 
 
 
 
 
 
 
13
 
14
  processor=Processor(LONG_TYPES_LIST,
15
  SHORT_TYPES_LIST,
@@ -38,6 +45,7 @@ class match_request(BaseModel):
38
 
39
  def get_data_dir():
40
  return "/home/user/app/_data/"
 
41
 
42
  def get_products_dir():
43
  return os.path.join(get_data_dir(), "products")
@@ -94,6 +102,8 @@ async def upload_products_csv(file: UploadFile, overwrite_existing: int):
94
  fullfn = os.path.join(datadir, "products.csv")
95
  update_products_csv(tempfile, fullfn, overwrite_existing)
96
 
 
 
97
  except Exception:
98
  raise HTTPException(status_code=500, detail='Something went wrong')
99
  finally:
@@ -102,8 +112,8 @@ async def upload_products_csv(file: UploadFile, overwrite_existing: int):
102
  return {"message": f"Successfully uploaded {file.filename}"}
103
 
104
 
105
- @app.post("/api/upload_items_csv")
106
- async def upload_items_csv(file: UploadFile = File(...)):
107
  try:
108
  itemsdir = get_items_dir()
109
 
@@ -112,14 +122,16 @@ async def upload_items_csv(file: UploadFile = File(...)):
112
 
113
  contents = file.file.read()
114
 
115
- with open(os.path.join(itemsdir, file.filename), 'wb') as f:
 
116
  f.write(contents)
117
  except Exception:
118
  raise HTTPException(status_code=500, detail='Something went wrong')
119
  finally:
120
  file.file.close()
121
 
122
- return {"message": f"Successfully uploaded {file.filename}"}
 
123
 
124
 
125
  @app.get("/api/get_items_csv")
@@ -136,32 +148,35 @@ async def get_items_csv():
136
 
137
 
138
  @app.post("/api/match")
139
- async def match(r: match_request):
140
  prods_file = os.path.join(get_products_dir(), "products.csv")
141
  if not os.path.isfile(prods_file):
142
  return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
143
 
144
- if len(r.items) == 0:
145
- return {"Status": "Error", "ErrorDesc": "Items file not specified"}
 
146
 
147
- if not r.threshold:
148
- r.threshold = 50
149
 
150
- items_fn = os.path.join(get_items_dir(), r.items)
151
- if not os.path.isfile(items_fn):
152
- return {"Status": "Error", "ErrorDesc": "Items file not found"}
153
 
154
  row_items = pd.read_csv(items_fn, sep='\t')
 
 
155
  row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
156
 
157
 
158
- df, items, products = processor.process(row_products, row_items, r.items_first, r.threshold)
159
 
160
  results_dir = get_results_dir()
161
  if not os.path.exists(results_dir):
162
  os.makedirs(results_dir)
163
 
164
- output_csv = "m1-" + str(r.threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
165
  df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
166
 
167
  return {"Status": "Success", "result_file" : output_csv}
 
10
  from pydantic import BaseModel
11
  import pandas as pd
12
  from tmp.utils import update_products_csv
13
+ from search.matching_judge import compare_matching_with_manual
14
+
15
+ '''compare_matching_with_manual("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\products.csv",
16
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\ws-items-for-test.csv",
17
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\m1-50-250325-133739.csv",
18
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\matching-20250318.csv")'''
19
+
20
 
21
  processor=Processor(LONG_TYPES_LIST,
22
  SHORT_TYPES_LIST,
 
45
 
46
  def get_data_dir():
47
  return "/home/user/app/_data/"
48
+ #return "_data"
49
 
50
  def get_products_dir():
51
  return os.path.join(get_data_dir(), "products")
 
102
  fullfn = os.path.join(datadir, "products.csv")
103
  update_products_csv(tempfile, fullfn, overwrite_existing)
104
 
105
+ os.remove(tempfile)
106
+
107
  except Exception:
108
  raise HTTPException(status_code=500, detail='Something went wrong')
109
  finally:
 
112
  return {"message": f"Successfully uploaded {file.filename}"}
113
 
114
 
115
+ #@app.post("/api/upload_items_csv")
116
+ def upload_items_csv(file: UploadFile):
117
  try:
118
  itemsdir = get_items_dir()
119
 
 
122
 
123
  contents = file.file.read()
124
 
125
+ fullfn = os.path.join(itemsdir, file.filename)
126
+ with open(fullfn, 'wb') as f:
127
  f.write(contents)
128
  except Exception:
129
  raise HTTPException(status_code=500, detail='Something went wrong')
130
  finally:
131
  file.file.close()
132
 
133
+ #return {"message": f"Successfully uploaded {file.filename}"}
134
+ return fullfn
135
 
136
 
137
  @app.get("/api/get_items_csv")
 
148
 
149
 
150
@app.post("/api/match")
async def match(items_file: UploadFile, threshold: int, items_first: int):
    """Match an uploaded items CSV against the stored products catalogue.

    Args:
        items_file: tab-separated items CSV uploaded with the request.
        threshold: fuzzy-match score cutoff; any falsy value falls back to 50.
        items_first: match direction flag, passed through to the processor.

    Returns:
        {"Status": "Success", "result_file": <name>} with the result CSV name,
        or {"Status": "Error", "ErrorDesc": ...} when products.csv is missing.
    """
    prods_file = os.path.join(get_products_dir(), "products.csv")
    if not os.path.isfile(prods_file):
        return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}

    # Persist the upload into the items dir; returns its full path on disk.
    items_fn = upload_items_csv(items_file)

    if not threshold:
        threshold = 50

    # BUGFIX: remove the temp file even if parsing fails, so failed requests
    # don't leak uploaded files into the items directory.
    try:
        row_items = pd.read_csv(items_fn, sep='\t')
    finally:
        os.remove(items_fn)

    row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')

    df, items, products = processor.process(row_products, row_items, items_first, threshold)

    results_dir = get_results_dir()
    # exist_ok avoids the check-then-create race of the original idiom.
    os.makedirs(results_dir, exist_ok=True)

    # Result name encodes threshold and timestamp, e.g. m1-50-250325-133739.csv
    output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
    df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)

    return {"Status": "Success", "result_file": output_csv}
app.py CHANGED
@@ -18,6 +18,7 @@ processor=Processor(LONG_TYPES_LIST,
18
  searcher=Searcher()
19
 
20
  ui=GradioUI(processor, searcher, "/home/user/app/_data/")
 
21
  ui.run_ui()
22
 
23
 
 
18
  searcher=Searcher()
19
 
20
  ui=GradioUI(processor, searcher, "/home/user/app/_data/")
21
+ #ui=GradioUI(processor, searcher, "_data")
22
  ui.run_ui()
23
 
24
 
preprocess/preprocess.py CHANGED
@@ -31,7 +31,7 @@ class Preprocessor():
31
 
32
 
33
  def process_items(self, df):
34
- result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
35
  #counter=0
36
  for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
37
 
@@ -42,6 +42,7 @@ class Preprocessor():
42
  result['brand'].append(i['brand'])
43
  else: result['brand'].append(None)
44
  result['name'].append(i['name'])
 
45
  drink_type=get_type(i, self.long_types_list)
46
  if drink_type is None:
47
  drink_type=check_spark(i)
@@ -77,7 +78,7 @@ class Preprocessor():
77
 
78
 
79
  def process_products(self, products):
80
- result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
81
  for idx, row in tqdm(products.iterrows()):
82
  try:
83
  result['id'].append(row['id'])
@@ -85,6 +86,7 @@ class Preprocessor():
85
  result['type_wine'].append(row['category'])
86
  result['type'].append(row['product_type'])
87
  result['name'].append(row['name_long'])
 
88
  vol=extract_volume_or_number(row['name'])
89
  result['volume'].append(vol)
90
  #year=extract_production_year(row['name'])
 
31
 
32
 
33
  def process_items(self, df):
34
+ result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
35
  #counter=0
36
  for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
37
 
 
42
  result['brand'].append(i['brand'])
43
  else: result['brand'].append(None)
44
  result['name'].append(i['name'])
45
+ result['fullname'].append(i['name'])
46
  drink_type=get_type(i, self.long_types_list)
47
  if drink_type is None:
48
  drink_type=check_spark(i)
 
78
 
79
 
80
  def process_products(self, products):
81
+ result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
82
  for idx, row in tqdm(products.iterrows()):
83
  try:
84
  result['id'].append(row['id'])
 
86
  result['type_wine'].append(row['category'])
87
  result['type'].append(row['product_type'])
88
  result['name'].append(row['name_long'])
89
+ result['fullname'].append(row['name_long'])
90
  vol=extract_volume_or_number(row['name'])
91
  result['volume'].append(vol)
92
  #year=extract_production_year(row['name'])
processor/matching.py CHANGED
@@ -30,7 +30,7 @@ def prepare_groups_with_ids(items_df):
30
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
31
 
32
  grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
33
- lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
34
  ).to_dict()
35
  return grouped
36
 
@@ -46,11 +46,38 @@ def prepare_groups_by_alternative_keys(items_df):
46
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
47
 
48
  grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
49
- lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
50
  ).to_dict()
51
  return grouped
52
 
53
- def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
  Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
56
  нормализованные группы.
@@ -71,6 +98,9 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
71
  results = []
72
  no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
73
 
 
 
 
74
  # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
75
  for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
76
  product_brand = product['brand']
@@ -84,18 +114,21 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
84
  items_data = items_groups.get(key, [])
85
  if items_data:
86
  # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
87
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
88
  else:
89
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
90
 
91
  norm_product_name = normalize_name(product_name)
92
  matches = process.extract(
93
- norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
94
  )
 
95
  matched_items = [
96
  {
97
  'item_id': items_ids[idx_candidate],
98
- 'item_name': items_names[idx_candidate],
 
 
99
  'score': score,
100
  'volume': items_volumes[idx_candidate],
101
  'color': item_type_wine[idx_candidate],
@@ -105,54 +138,74 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
105
  for match, score, idx_candidate in matches
106
  ]
107
 
108
- if not matched_items:
 
 
 
 
109
  no_match_products.append((idx, product))
110
 
 
 
111
  results.append({
112
  'product_id': product['id'],
 
113
  'matched_items': matched_items,
114
- 'alternative': [] # Заполняется во втором проходе
 
115
  })
116
 
117
- # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
118
- groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
119
-
120
- # Второй проход: для продуктов без совпадений ищем по альтернативным группам
121
- for idx, product in tqdm(no_match_products):
122
- product_brand = product['brand']
123
- product_type_wine = product['new_type_wine']
124
- product_type = product['new_type']
125
- product_volume = product['volume']
126
- product_name = product['name']
127
- product_sour = product['sour']
128
-
129
- alt_key = (product_type_wine, product_type, product_volume, product_sour)
130
- type_items = groups_by_alternative_keys.get(alt_key, [])
131
- # Фильтруем, исключая итемы с исходным брендом
132
- filtered_items = [item for item in type_items if item[1] != product_brand]
133
- if filtered_items:
134
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
135
- else:
136
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
137
-
138
- norm_product_name = normalize_name(product_name)
139
- alt_matches = process.extract(
140
- norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
141
- )
142
- alt_matched_items = [
143
- {
144
- 'item_id': alt_ids[idx_candidate],
145
- 'item_name': alt_names[idx_candidate],
146
- 'score': score,
147
- 'volume': alt_volumes[idx_candidate],
148
- 'color': alt_type_wine[idx_candidate],
149
- 'sour': alt_sour[idx_candidate],
150
- 'year': alt_year[idx_candidate],
151
- }
152
- for match, score, idx_candidate in alt_matches
153
- ]
154
-
155
- results[idx]['alternative'] = alt_matched_items
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  results_df = pd.DataFrame(results)
158
  merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
 
30
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
31
 
32
  grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
33
+ lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
34
  ).to_dict()
35
  return grouped
36
 
 
46
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
47
 
48
  grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
49
+ lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
50
  ).to_dict()
51
  return grouped
52
 
53
+
54
def order_by_best_year(matched_items, year):
    """Reorder matched items so the most plausible vintage comes first.

    If *year* is given (non-zero), items whose year matches exactly are
    ranked first. Among the rest, items carrying the highest year come next
    (ties kept together), and everything else follows in original order.

    Args:
        matched_items: list of match dicts, each with a 'year' key
            (may be falsy / 0 when the vintage is unknown).
        year: vintage of the product being matched, or a falsy value.

    Returns:
        A new list containing all of *matched_items*, reordered.
    """
    exact_year_items = []
    best_year_items = []
    other_items = []
    best_year = 0

    for mi in matched_items:
        # Exact vintage match beats everything else.
        if year and (int(year) != 0) and (mi['year'] == year):
            exact_year_items.append(mi)
        elif mi['year'] and int(mi['year']) != 0:
            item_year = int(mi['year'])
            if item_year > best_year:
                # BUGFIX: the original discarded the previous best-year items
                # here, silently dropping them from the result. Demote them
                # to the tail instead.
                other_items.extend(best_year_items)
                best_year_items = [mi]
                best_year = item_year
            elif item_year == best_year:
                # BUGFIX: the original re-tested `> max_year` (unreachable);
                # ties with the current best year belong together.
                best_year_items.append(mi)
            else:
                other_items.append(mi)
        else:
            other_items.append(mi)

    exact_year_items.extend(best_year_items)
    exact_year_items.extend(other_items)
    return exact_year_items
78
+
79
+
80
+ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
81
  """
82
  Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
83
  нормализованные группы.
 
98
  results = []
99
  no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
100
 
101
+ if name_threshold < 50:
102
+ name_threshold = 50
103
+
104
  # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
105
  for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
106
  product_brand = product['brand']
 
114
  items_data = items_groups.get(key, [])
115
  if items_data:
116
  # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
117
+ items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
118
  else:
119
+ items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
120
 
121
  norm_product_name = normalize_name(product_name)
122
  matches = process.extract(
123
+ norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
124
  )
125
+
126
  matched_items = [
127
  {
128
  'item_id': items_ids[idx_candidate],
129
+ 'brand': product_brand,
130
+ 'item_name': items_full_names[idx_candidate],
131
+ #'item_name': items_names[idx_candidate],
132
  'score': score,
133
  'volume': items_volumes[idx_candidate],
134
  'color': item_type_wine[idx_candidate],
 
138
  for match, score, idx_candidate in matches
139
  ]
140
 
141
+
142
+ if matched_items:
143
+ matched_items = order_by_best_year(matched_items, product['year'])
144
+ matched_items = matched_items[:5]
145
+ else:
146
  no_match_products.append((idx, product))
147
 
148
+
149
+
150
  results.append({
151
  'product_id': product['id'],
152
+ #"matched_top_id": top_matched_id,
153
  'matched_items': matched_items,
154
+ #"alternative_top_id": "",
155
+ #'alternative': [] # Заполняется во втором проходе
156
  })
157
 
158
+ if include_alternatives:
159
+ # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
160
+ groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
161
+
162
+ # Второй проход: для продуктов без совпадений ищем по альтернативным группам
163
+ for idx, product in tqdm(no_match_products):
164
+ product_brand = product['brand']
165
+ product_type_wine = product['new_type_wine']
166
+ product_type = product['new_type']
167
+ product_volume = product['volume']
168
+ product_name = product['name']
169
+ product_sour = product['sour']
170
+
171
+ alt_key = (product_type_wine, product_type, product_volume, product_sour)
172
+ type_items = groups_by_alternative_keys.get(alt_key, [])
173
+ # Фильтруем, исключая итемы с исходным брендом
174
+ filtered_items = [item for item in type_items if item[1] != product_brand]
175
+ if filtered_items:
176
+ alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
177
+ else:
178
+ alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
179
+
180
+ norm_product_name = normalize_name(product_name)
181
+ alt_matches = process.extract(
182
+ norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
183
+ )
184
+ alt_matched_items = [
185
+ {
186
+ 'item_id': alt_ids[idx_candidate],
187
+ 'brand': alt_brands[idx_candidate],
188
+ #'item_name': alt_names[idx_candidate],
189
+ 'item_name': alt_full_names[idx_candidate],
190
+ 'score': score / 2,
191
+ 'volume': alt_volumes[idx_candidate],
192
+ 'color': alt_type_wine[idx_candidate],
193
+ 'sour': alt_sour[idx_candidate],
194
+ 'year': alt_year[idx_candidate],
195
+ }
196
+ for match, score, idx_candidate in alt_matches
197
+ ]
198
+
199
+ alt_matched_items = order_by_best_year(alt_matched_items, product['year'])
200
+ alt_matched_items = alt_matched_items[:5]
201
+
202
+ results[idx]['matched_items'].extend(alt_matched_items)
203
+
204
+
205
+ #if alt_matched_items:
206
+ # results[idx]['alternative_top_id'] = alt_matched_items[0]["item_id"]
207
+
208
+ #results[idx]['alternative'] = alt_matched_items
209
 
210
  results_df = pd.DataFrame(results)
211
  merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
processor/processor.py CHANGED
@@ -11,7 +11,7 @@ class Processor():
11
  type_wine, gbs, colors_for_trim, grapes, other_words,
12
  sour_merge_dict, type_merge_dict, color_merge_dict)
13
 
14
- def process(self, products, items, is_items_first=False, th=65):
15
  items, products=self.preprocessor.process(products, items)
16
 
17
  print('-----*-----Matching-----*-----')
@@ -20,9 +20,9 @@ class Processor():
20
  products['new_brand']=products['brand']
21
  items['brand']=items['new_brand']
22
  products_groups = prepare_groups_with_ids(products)
23
- res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
24
  else:
25
  items_groups = prepare_groups_with_ids(items)
26
- res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
27
 
28
  return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
 
11
  type_wine, gbs, colors_for_trim, grapes, other_words,
12
  sour_merge_dict, type_merge_dict, color_merge_dict)
13
 
14
+ def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
15
  items, products=self.preprocessor.process(products, items)
16
 
17
  print('-----*-----Matching-----*-----')
 
20
  products['new_brand']=products['brand']
21
  items['brand']=items['new_brand']
22
  products_groups = prepare_groups_with_ids(products)
23
+ res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th, include_alternatives=include_alternatives)
24
  else:
25
  items_groups = prepare_groups_with_ids(items)
26
+ res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th, include_alternatives=include_alternatives)
27
 
28
  return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
search/matching_judge.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import ast
4
+ import csv
5
+
6
+
7
+
8
def compare_matching_with_manual(products_file, items_file, match_result_file,
                                 manual_result_file,
                                 output_file="C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\mjudge_new.csv"):
    """Judge automatic matching quality against a manually curated matching.

    Reads four CSV files, scores every item, writes a per-item report CSV to
    *output_file* and returns summary counters.

    Args:
        products_file: tab-separated products CSV (columns incl. id, brand,
            name_long, volume, year).
        items_file: semicolon-separated items CSV (columns incl. id, attrs).
        match_result_file: tab-separated output of the automatic matcher
            (columns incl. id, matched_items — a stringified list of dicts).
        manual_result_file: tab-separated manual matching (columns incl.
            item_id, state, product_id); state == 1 marks a confirmed match.
        output_file: where the per-item judge report is written. Defaults to
            the historical hard-coded path for backward compatibility.

    Returns:
        dict with item/product/match/manual row counters.
    """
    products_df = pd.read_csv(products_file, sep="\t")
    items_df = pd.read_csv(items_file, sep=";")
    match_df = pd.read_csv(match_result_file, sep="\t")
    manual_df = pd.read_csv(manual_result_file, sep="\t")

    # .count() of the first column == non-null rows in it (kept from the
    # original; .iloc avoids the deprecated positional Series[0] lookup).
    results = {
        "item_count": int(items_df.count().iloc[0]),
        "product_count": int(products_df.count().iloc[0]),
        "match_count": int(match_df.count().iloc[0]),
        "manual_count": int(manual_df.count().iloc[0]),
    }

    result_list = []

    for _, row in items_df.iterrows():
        result_data = {
            "id": row["id"],
            "match_side": "no_match",
            "auto_score": "",
            "manual_score": "",
            "discuss": "",
        }

        # NOTE(review): assumes every item id is present in match_result_file;
        # .values[0] raises IndexError otherwise — confirm with the matcher.
        auto_match = match_df[match_df['id'] == row["id"]]["matched_items"].values[0]

        # Locate the manually confirmed product (state == 1) for this item.
        manual_match = None
        manual = manual_df[manual_df['item_id'] == row["id"]]['state']
        if (len(manual) > 0) and (manual.values[0] == 1):
            p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
            if len(p.values) > 0:
                manual_match = p
            else:
                print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")

        # A stringified empty list is "[]" (len 2), so len > 2 means non-empty.
        has_auto = (auto_match is not None) and len(auto_match) > 2

        if has_auto and (manual_match is not None):
            result_data["match_side"] = "both"

            manual_id = int(manual_match["id"].values[0])
            auto_match_ns = auto_match.replace(" ", "")
            i1 = auto_match_ns.find("'item_id':")
            i2 = auto_match_ns.find("'item_id':" + str(manual_id) + ",")

            if i1 == i2:
                # Manual product is the top automatic candidate.
                result_data["auto_score"] = 1
                result_data["manual_score"] = 1
            elif i2 >= 0:
                # Manual product present among candidates, but not first.
                result_data["auto_score"] = 0.5
                result_data["manual_score"] = 0.5
        elif has_auto:
            result_data["match_side"] = "only_auto"
        elif manual_match is not None:
            result_data["match_side"] = "only_manual"

        result_data["item"] = row["attrs"]
        result_data["auto_match"] = auto_match

        manual_string = ""
        if manual_match is not None:
            # BUGFIX: the original emitted malformed JSON — it closed quotes
            # on volume/year that were never opened. Quote values consistently.
            manual_string = ('{'
                             '"id": ' + str(manual_match["id"].values[0]) + ', '
                             '"brand": "' + str(manual_match["brand"].values[0]) + '", '
                             '"name": "' + str(manual_match["name_long"].values[0]) + '", '
                             '"volume": "' + str(manual_match["volume"].values[0]) + '", '
                             '"year": "' + str(manual_match["year"].values[0]) + '"}')
        result_data["manual_match"] = manual_string

        result_list.append(result_data)

    pd.DataFrame(result_list).to_csv(output_file)

    print(results)

    return results
+
ui/gradio_ui.py CHANGED
@@ -1,170 +1,175 @@
1
- from argparse import ArgumentError
2
-
3
- import gradio as gr
4
- import pandas as pd
5
- from preprocess.utils.common.utils import get_delimiter
6
- from tmp.utils import update_products_csv #remover,
7
- import os
8
- import datetime, time
9
-
10
-
11
- class GradioUI():
12
-
13
- def __init__(self, processor, searcher, data_path):
14
- self.processor=processor
15
- self.searcher=searcher
16
- self.data_path = data_path
17
-
18
- def get_data_dir(self):
19
- return self.data_path
20
-
21
- def get_products_dir(self):
22
- return os.path.join(self.get_data_dir(), "products")
23
-
24
- def get_items_dir(self):
25
- return os.path.join(self.get_data_dir(), "items")
26
-
27
- def get_results_dir(self):
28
- return os.path.join(self.get_data_dir(), "results")
29
-
30
- def get_products_file_date(self):
31
- fullfn = os.path.join(self.data_path, "products", "products.csv")
32
- if not os.path.isfile(fullfn):
33
- return "Файл Products не найден"
34
-
35
- stinfo = os.stat(fullfn)
36
- return time.ctime(stinfo.st_mtime)
37
-
38
-
39
- def upload_products_file(self, prods_file, overwrite_existing):
40
- try:
41
- if not os.path.exists(self.get_products_dir()):
42
- os.makedirs(self.get_products_dir())
43
-
44
- fullfn = os.path.join(self.get_products_dir(), "products.csv")
45
-
46
- if prods_file != None:
47
- update_products_csv(prods_file, fullfn, overwrite_existing)
48
-
49
- gr.Info("Файл Products успешно загружен")
50
- except Exception as ex:
51
- raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)
52
-
53
-
54
- def process_items(self, items_file, is_items_first, threshold): #, q_id):
55
- try:
56
- prods_file = os.path.join(self.get_products_dir(), "products.csv")
57
- if not os.path.isfile(prods_file):
58
- raise Exception("Файл Products не найден")
59
-
60
- if items_file != None:
61
- items_delimiter=get_delimiter(items_file)
62
- print('items delimiter: '+items_delimiter)
63
- row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
64
- if not 'attrs' in row_items.columns.values:
65
- raise Exception("Uploaded Items CSV does not seem to be valid")
66
-
67
- products_delimiter=get_delimiter(prods_file)
68
- print('products delimiter: '+products_delimiter)
69
- row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
70
-
71
- # if q_id in row_products['id'].unique():
72
- # row_products=row_products[row_products['id']==q_id]
73
-
74
- #print("product id: " + str(q_id))
75
-
76
- df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold)
77
-
78
- self.searcher.set_df(df.copy())
79
- #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
80
- # output_csv = tmp.name
81
- results_path = self.get_results_dir()
82
- if not os.path.exists(results_path):
83
- os.makedirs(results_path)
84
-
85
- output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
86
- output_csv = os.path.join(results_path, output_csv)
87
- df.to_csv(output_csv, sep='\t', index=False)
88
- return output_csv
89
- except Exception as ex:
90
- raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
91
-
92
- def on_page_load(self, r: gr.Request):
93
- m_time = self.get_products_file_date()
94
- return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]
95
-
96
-
97
- def run_ui(self):
98
- with gr.Blocks() as demo:
99
- tabs = gr.Tabs()
100
- with tabs:
101
-
102
- # with gr.Row():
103
- # file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
104
- # process_button = gr.Button("Обновить")
105
-
106
- with gr.TabItem("Загрузка файла Products"):
107
- prod_file_info1 = gr.Markdown("## Загрузка файла Products")
108
- with gr.Row():
109
- file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
110
- with gr.Row():
111
- toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
112
- upload_button = gr.Button("Загрузить файл")
113
- upload_button.click(
114
- fn=self.upload_products_file,
115
- inputs=[file_input1, toggle_input],
116
- #outputs=output_file
117
- )
118
-
119
- # Вкладка для обраб��тки CSV файлов
120
- with gr.TabItem("Обработка каталога поставщика"):
121
- gr.Markdown("## Обработка каталога поставщика")
122
-
123
- m_time = self.get_products_file_date()
124
- prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
125
- with gr.Row():
126
- #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
127
- file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
128
- #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
129
- with gr.Row():
130
- toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
131
- threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
132
- process_button = gr.Button("Загрузить файл с каталогом и сравнить")
133
- output_file = gr.File(label="Скачать результат (CSV)")
134
- process_button.click(
135
- fn=self.process_items,
136
- inputs=[file_items, toggle_input, threshold_input], #, search_number],
137
- outputs=output_file
138
- )
139
-
140
- # Вкладка для поиска
141
- with gr.TabItem("Поиск в обработанном csv"):
142
- gr.Markdown("## Поиск")
143
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
144
- search_button = gr.Button("Найти")
145
- search_table = gr.Dataframe(label="Результаты поиска")
146
- search_button.click(
147
- fn=self.searcher.search,
148
- inputs=[search_number],
149
- outputs=search_table
150
- )
151
-
152
- with gr.TabItem("Загрузка результат и поиск в нем"):
153
- gr.Markdown("## Поиск")
154
- with gr.Row():
155
- input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
156
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
157
- search_button = gr.Button("Найти")
158
- search_table = gr.Dataframe(label="Результаты поиска")
159
- search_button.click(
160
- fn=self.searcher.search_in_uploaded_file,
161
- inputs=[input_path, search_number],
162
- outputs=search_table
163
- )
164
-
165
- #with gr.TabItem("Удалить сохраненные продукты"):
166
- # del_button = gr.Button("Удалить")
167
- # process_button.click(fn=remover)
168
-
169
- demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
 
 
 
 
 
170
  demo.launch()
 
1
+ from argparse import ArgumentError
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from preprocess.utils.common.utils import get_delimiter
6
+ from tmp.utils import update_products_csv #remover,
7
+ import os
8
+ import datetime, time
9
+
10
+
11
class GradioUI():
    """Gradio web UI for maintaining a Products catalogue, matching supplier
    item catalogues against it, and searching the matching results.

    Collaborators (injected via the constructor):
      processor -- object exposing ``process(products_df, items_df,
                   is_items_first, threshold, include_alternatives)`` and
                   returning ``(result_df, items, products)``
      searcher  -- object exposing ``set_df``, ``search`` and
                   ``search_in_uploaded_file`` used by the search tabs
      data_path -- root directory containing the ``products``, ``items`` and
                   ``results`` sub-directories
    """

    def __init__(self, processor, searcher, data_path):
        self.processor = processor
        self.searcher = searcher
        self.data_path = data_path

    def get_data_dir(self):
        """Return the root data directory."""
        return self.data_path

    def get_products_dir(self):
        """Return the directory holding the canonical products.csv."""
        return os.path.join(self.get_data_dir(), "products")

    def get_items_dir(self):
        """Return the directory for uploaded supplier item files."""
        return os.path.join(self.get_data_dir(), "items")

    def get_results_dir(self):
        """Return the directory where matching results are written."""
        return os.path.join(self.get_data_dir(), "results")

    def get_products_file_date(self):
        """Return the modification time of products.csv as a human-readable
        string, or a (Russian) "file not found" message when it is missing."""
        fullfn = os.path.join(self.data_path, "products", "products.csv")
        if not os.path.isfile(fullfn):
            return "Файл Products не найден"
        return time.ctime(os.stat(fullfn).st_mtime)

    def upload_products_file(self, prods_file, overwrite_existing):
        """Merge (or overwrite) an uploaded CSV into products/products.csv.

        prods_file         -- path of the uploaded file, or None when nothing
                              was selected in the UI
        overwrite_existing -- truthy: replace the stored file instead of merging

        Raises gr.Error (shown as a toast) on any failure.
        """
        try:
            # exist_ok avoids the check-then-create race of the previous
            # os.path.exists() + makedirs() pair.
            os.makedirs(self.get_products_dir(), exist_ok=True)
            fullfn = os.path.join(self.get_products_dir(), "products.csv")

            if prods_file is not None:
                update_products_csv(prods_file, fullfn, overwrite_existing)
                # Success toast only after an actual upload; previously it was
                # shown even when no file had been selected.
                gr.Info("Файл Products успешно загружен")
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def process_items(self, items_file, is_items_first, threshold, include_alternatives):  # , q_id):
        """Match an uploaded supplier items CSV against the stored products
        file and return the path of the resulting tab-separated CSV.

        items_file           -- path of the uploaded items CSV
        is_items_first       -- invert the search direction (checkbox value)
        threshold            -- matching score threshold, 0..100
        include_alternatives -- include alternative match candidates in output

        Raises gr.Error (shown as a toast) on any failure.
        """
        try:
            prods_file = os.path.join(self.get_products_dir(), "products.csv")
            if not os.path.isfile(prods_file):
                raise Exception("Файл Products не найден")
            if items_file is None:
                # Previously a missing upload fell through and returned None,
                # leaving the output component empty with no explanation.
                raise Exception("Файл Items не найден")

            items_delimiter = get_delimiter(items_file)
            print('items delimiter: ' + items_delimiter)
            row_items = pd.read_csv(items_file, sep=items_delimiter)
            if 'attrs' not in row_items.columns.values:
                raise Exception("Uploaded Items CSV does not seem to be valid")

            products_delimiter = get_delimiter(prods_file)
            print('products delimiter: ' + products_delimiter)
            row_products = pd.read_csv(prods_file, sep=products_delimiter)

            df, items, products = self.processor.process(
                row_products, row_items, is_items_first, threshold, include_alternatives)

            # Keep a copy so the "search in processed csv" tab can query it.
            self.searcher.set_df(df.copy())

            results_path = self.get_results_dir()
            os.makedirs(results_path, exist_ok=True)

            output_csv = ("m1-" + str(threshold) + "-"
                          + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv")
            output_csv = os.path.join(results_path, output_csv)
            df.to_csv(output_csv, sep='\t', index=False)
            return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def on_page_load(self, r: gr.Request):
        """Refresh the 'last updated' labels on both tabs when the page loads."""
        m_time = self.get_products_file_date()
        label = f"Дата последнего обновления файла Products: {m_time}"
        return [label, label]

    def run_ui(self):
        """Build the Gradio Blocks layout and launch the app (blocking call)."""
        with gr.Blocks() as demo:
            tabs = gr.Tabs()
            with tabs:

                # Tab: process a supplier catalogue against the stored products.
                with gr.TabItem("Обработка каталога поставщика"):
                    gr.Markdown("## Обработка каталога поставщика")

                    m_time = self.get_products_file_date()
                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
                    with gr.Row():
                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_invert = gr.Checkbox(label="Инвертировать поиск", value=True)
                        toggle_alternative = gr.Checkbox(label="Включать в результаты альтернативные варианты", value=True)
                        threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_items,
                        inputs=[file_items, toggle_invert, threshold_input, toggle_alternative],
                        outputs=output_file
                    )

                # Tab: upload/merge the Products file.
                with gr.TabItem("Загрузка файла Products"):
                    prod_file_info1 = gr.Markdown("## Загрузка файла Products")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        # Renamed from toggle_input: no longer shadows the
                        # checkbox of the processing tab above.
                        toggle_overwrite = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
                    upload_button = gr.Button("Загрузить файл")
                    upload_button.click(
                        fn=self.upload_products_file,
                        inputs=[file_input1, toggle_overwrite],
                    )

                # Tab: search the most recently processed result by product id.
                with gr.TabItem("Поиск в обработанном csv"):
                    gr.Markdown("## Поиск")
                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search,
                        inputs=[search_number],
                        outputs=search_table
                    )

                # Tab: upload a previously produced result file and search it.
                with gr.TabItem("Загрузка результат и поиск в нем"):
                    gr.Markdown("## Поиск")
                    with gr.Row():
                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                    # Distinct names: do not shadow the widgets of the
                    # previous search tab.
                    file_search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    file_search_button = gr.Button("Найти")
                    file_search_table = gr.Dataframe(label="Результаты поиска")
                    file_search_button.click(
                        fn=self.searcher.search_in_uploaded_file,
                        inputs=[input_path, file_search_number],
                        outputs=file_search_table
                    )

            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
        demo.launch()