Gainward777 committed on
Commit
36a20b5
·
verified ·
1 Parent(s): 4b066a6

Upload 22 files

Browse files
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pyc
2
+ .idea/*
3
+ _data/*
api.py CHANGED
@@ -10,6 +10,13 @@ import uvicorn
10
  from pydantic import BaseModel
11
  import pandas as pd
12
  from tmp.utils import update_products_csv
 
 
 
 
 
 
 
13
 
14
  processor=Processor(LONG_TYPES_LIST,
15
  SHORT_TYPES_LIST,
@@ -38,6 +45,7 @@ class match_request(BaseModel):
38
 
39
  def get_data_dir():
40
  return "/home/user/app/_data/"
 
41
 
42
  def get_products_dir():
43
  return os.path.join(get_data_dir(), "products")
@@ -94,6 +102,8 @@ async def upload_products_csv(file: UploadFile, overwrite_existing: int):
94
  fullfn = os.path.join(datadir, "products.csv")
95
  update_products_csv(tempfile, fullfn, overwrite_existing)
96
 
 
 
97
  except Exception:
98
  raise HTTPException(status_code=500, detail='Something went wrong')
99
  finally:
@@ -102,8 +112,8 @@ async def upload_products_csv(file: UploadFile, overwrite_existing: int):
102
  return {"message": f"Successfully uploaded {file.filename}"}
103
 
104
 
105
- @app.post("/api/upload_items_csv")
106
- async def upload_items_csv(file: UploadFile = File(...)):
107
  try:
108
  itemsdir = get_items_dir()
109
 
@@ -112,14 +122,16 @@ async def upload_items_csv(file: UploadFile = File(...)):
112
 
113
  contents = file.file.read()
114
 
115
- with open(os.path.join(itemsdir, file.filename), 'wb') as f:
 
116
  f.write(contents)
117
  except Exception:
118
  raise HTTPException(status_code=500, detail='Something went wrong')
119
  finally:
120
  file.file.close()
121
 
122
- return {"message": f"Successfully uploaded {file.filename}"}
 
123
 
124
 
125
  @app.get("/api/get_items_csv")
@@ -136,32 +148,35 @@ async def get_items_csv():
136
 
137
 
138
  @app.post("/api/match")
139
- async def match(r: match_request):
140
  prods_file = os.path.join(get_products_dir(), "products.csv")
141
  if not os.path.isfile(prods_file):
142
  return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
143
 
144
- if len(r.items) == 0:
145
- return {"Status": "Error", "ErrorDesc": "Items file not specified"}
 
146
 
147
- if not r.threshold:
148
- r.threshold = 50
149
 
150
- items_fn = os.path.join(get_items_dir(), r.items)
151
- if not os.path.isfile(items_fn):
152
- return {"Status": "Error", "ErrorDesc": "Items file not found"}
153
 
154
  row_items = pd.read_csv(items_fn, sep='\t')
 
 
155
  row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
156
 
157
 
158
- df, items, products = processor.process(row_products, row_items, r.items_first, r.threshold)
159
 
160
  results_dir = get_results_dir()
161
  if not os.path.exists(results_dir):
162
  os.makedirs(results_dir)
163
 
164
- output_csv = "m1-" + str(r.threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
165
  df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
166
 
167
  return {"Status": "Success", "result_file" : output_csv}
 
10
  from pydantic import BaseModel
11
  import pandas as pd
12
  from tmp.utils import update_products_csv
13
+ from search.matching_judge import compare_matching_with_manual
14
+
15
+ '''compare_matching_with_manual("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\products.csv",
16
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\ws-items-for-test.csv",
17
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\m1-50-250325-133739.csv",
18
+ "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\matching-20250318.csv")'''
19
+
20
 
21
  processor=Processor(LONG_TYPES_LIST,
22
  SHORT_TYPES_LIST,
 
45
 
46
  def get_data_dir():
47
  return "/home/user/app/_data/"
48
+ #return "_data"
49
 
50
  def get_products_dir():
51
  return os.path.join(get_data_dir(), "products")
 
102
  fullfn = os.path.join(datadir, "products.csv")
103
  update_products_csv(tempfile, fullfn, overwrite_existing)
104
 
105
+ os.remove(tempfile)
106
+
107
  except Exception:
108
  raise HTTPException(status_code=500, detail='Something went wrong')
109
  finally:
 
112
  return {"message": f"Successfully uploaded {file.filename}"}
113
 
114
 
115
+ #@app.post("/api/upload_items_csv")
116
+ def upload_items_csv(file: UploadFile):
117
  try:
118
  itemsdir = get_items_dir()
119
 
 
122
 
123
  contents = file.file.read()
124
 
125
+ fullfn = os.path.join(itemsdir, file.filename)
126
+ with open(fullfn, 'wb') as f:
127
  f.write(contents)
128
  except Exception:
129
  raise HTTPException(status_code=500, detail='Something went wrong')
130
  finally:
131
  file.file.close()
132
 
133
+ #return {"message": f"Successfully uploaded {file.filename}"}
134
+ return fullfn
135
 
136
 
137
  @app.get("/api/get_items_csv")
 
148
 
149
 
150
@app.post("/api/match")
async def match(items_file: UploadFile, threshold: int, items_first: int):
    """Match an uploaded items CSV against the stored products catalogue.

    Args:
        items_file: tab-separated items CSV uploaded with the request.
        threshold: fuzzy-match score cutoff; any falsy value falls back to 50.
        items_first: match direction flag, passed through to the processor.

    Returns:
        {"Status": "Success", "result_file": <name>} with the result CSV name,
        or {"Status": "Error", "ErrorDesc": ...} when products.csv is missing.
    """
    prods_file = os.path.join(get_products_dir(), "products.csv")
    if not os.path.isfile(prods_file):
        return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}

    # Persist the upload into the items dir; returns its full path on disk.
    items_fn = upload_items_csv(items_file)

    if not threshold:
        threshold = 50

    # BUGFIX: remove the temp file even if parsing fails, so failed requests
    # don't leak uploaded files into the items directory.
    try:
        row_items = pd.read_csv(items_fn, sep='\t')
    finally:
        os.remove(items_fn)

    row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')

    df, items, products = processor.process(row_products, row_items, items_first, threshold)

    results_dir = get_results_dir()
    # exist_ok avoids the check-then-create race of the original idiom.
    os.makedirs(results_dir, exist_ok=True)

    # Result name encodes threshold and timestamp, e.g. m1-50-250325-133739.csv
    output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
    df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)

    return {"Status": "Success", "result_file": output_csv}
app.py CHANGED
@@ -18,6 +18,7 @@ processor=Processor(LONG_TYPES_LIST,
18
  searcher=Searcher()
19
 
20
  ui=GradioUI(processor, searcher, "/home/user/app/_data/")
 
21
  ui.run_ui()
22
 
23
 
 
18
  searcher=Searcher()
19
 
20
  ui=GradioUI(processor, searcher, "/home/user/app/_data/")
21
+ #ui=GradioUI(processor, searcher, "_data")
22
  ui.run_ui()
23
 
24
 
preprocess/preprocess.py CHANGED
@@ -31,7 +31,7 @@ class Preprocessor():
31
 
32
 
33
  def process_items(self, df):
34
- result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
35
  #counter=0
36
  for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
37
 
@@ -42,6 +42,7 @@ class Preprocessor():
42
  result['brand'].append(i['brand'])
43
  else: result['brand'].append(None)
44
  result['name'].append(i['name'])
 
45
  drink_type=get_type(i, self.long_types_list)
46
  if drink_type is None:
47
  drink_type=check_spark(i)
@@ -77,7 +78,7 @@ class Preprocessor():
77
 
78
 
79
  def process_products(self, products):
80
- result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
81
  for idx, row in tqdm(products.iterrows()):
82
  try:
83
  result['id'].append(row['id'])
@@ -85,6 +86,7 @@ class Preprocessor():
85
  result['type_wine'].append(row['category'])
86
  result['type'].append(row['product_type'])
87
  result['name'].append(row['name_long'])
 
88
  vol=extract_volume_or_number(row['name'])
89
  result['volume'].append(vol)
90
  #year=extract_production_year(row['name'])
 
31
 
32
 
33
  def process_items(self, df):
34
+ result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
35
  #counter=0
36
  for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
37
 
 
42
  result['brand'].append(i['brand'])
43
  else: result['brand'].append(None)
44
  result['name'].append(i['name'])
45
+ result['fullname'].append(i['name'])
46
  drink_type=get_type(i, self.long_types_list)
47
  if drink_type is None:
48
  drink_type=check_spark(i)
 
78
 
79
 
80
  def process_products(self, products):
81
+ result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
82
  for idx, row in tqdm(products.iterrows()):
83
  try:
84
  result['id'].append(row['id'])
 
86
  result['type_wine'].append(row['category'])
87
  result['type'].append(row['product_type'])
88
  result['name'].append(row['name_long'])
89
+ result['fullname'].append(row['name_long'])
90
  vol=extract_volume_or_number(row['name'])
91
  result['volume'].append(vol)
92
  #year=extract_production_year(row['name'])
processor/matching.py CHANGED
@@ -30,7 +30,7 @@ def prepare_groups_with_ids(items_df):
30
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
31
 
32
  grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
33
- lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
34
  ).to_dict()
35
  return grouped
36
 
@@ -46,11 +46,38 @@ def prepare_groups_by_alternative_keys(items_df):
46
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
47
 
48
  grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
49
- lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
50
  ).to_dict()
51
  return grouped
52
 
53
- def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
  Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
56
  нормализованные группы.
@@ -71,6 +98,9 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
71
  results = []
72
  no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
73
 
 
 
 
74
  # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
75
  for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
76
  product_brand = product['brand']
@@ -84,18 +114,21 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
84
  items_data = items_groups.get(key, [])
85
  if items_data:
86
  # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
87
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
88
  else:
89
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
90
 
91
  norm_product_name = normalize_name(product_name)
92
  matches = process.extract(
93
- norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
94
  )
 
95
  matched_items = [
96
  {
97
  'item_id': items_ids[idx_candidate],
98
- 'item_name': items_names[idx_candidate],
 
 
99
  'score': score,
100
  'volume': items_volumes[idx_candidate],
101
  'color': item_type_wine[idx_candidate],
@@ -105,54 +138,74 @@ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshol
105
  for match, score, idx_candidate in matches
106
  ]
107
 
108
- if not matched_items:
 
 
 
 
109
  no_match_products.append((idx, product))
110
 
 
 
111
  results.append({
112
  'product_id': product['id'],
 
113
  'matched_items': matched_items,
114
- 'alternative': [] # Заполняется во втором проходе
 
115
  })
116
 
117
- # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
118
- groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
119
-
120
- # Второй проход: для продуктов без совпадений ищем по альтернативным группам
121
- for idx, product in tqdm(no_match_products):
122
- product_brand = product['brand']
123
- product_type_wine = product['new_type_wine']
124
- product_type = product['new_type']
125
- product_volume = product['volume']
126
- product_name = product['name']
127
- product_sour = product['sour']
128
-
129
- alt_key = (product_type_wine, product_type, product_volume, product_sour)
130
- type_items = groups_by_alternative_keys.get(alt_key, [])
131
- # Фильтруем, исключая итемы с исходным брендом
132
- filtered_items = [item for item in type_items if item[1] != product_brand]
133
- if filtered_items:
134
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
135
- else:
136
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
137
-
138
- norm_product_name = normalize_name(product_name)
139
- alt_matches = process.extract(
140
- norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
141
- )
142
- alt_matched_items = [
143
- {
144
- 'item_id': alt_ids[idx_candidate],
145
- 'item_name': alt_names[idx_candidate],
146
- 'score': score,
147
- 'volume': alt_volumes[idx_candidate],
148
- 'color': alt_type_wine[idx_candidate],
149
- 'sour': alt_sour[idx_candidate],
150
- 'year': alt_year[idx_candidate],
151
- }
152
- for match, score, idx_candidate in alt_matches
153
- ]
154
-
155
- results[idx]['alternative'] = alt_matched_items
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
  results_df = pd.DataFrame(results)
158
  merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
 
30
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
31
 
32
  grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
33
+ lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
34
  ).to_dict()
35
  return grouped
36
 
 
46
  items_df['norm_name'] = items_df['name'].apply(normalize_name)
47
 
48
  grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
49
+ lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
50
  ).to_dict()
51
  return grouped
52
 
53
+
54
def order_by_best_year(matched_items, year):
    """Reorder matched items so the most plausible vintage comes first.

    If *year* is given (non-zero), items whose year matches exactly are
    ranked first. Among the rest, items carrying the highest year come next
    (ties kept together), and everything else follows in original order.

    Args:
        matched_items: list of match dicts, each with a 'year' key
            (may be falsy / 0 when the vintage is unknown).
        year: vintage of the product being matched, or a falsy value.

    Returns:
        A new list containing all of *matched_items*, reordered.
    """
    exact_year_items = []
    best_year_items = []
    other_items = []
    best_year = 0

    for mi in matched_items:
        # Exact vintage match beats everything else.
        if year and (int(year) != 0) and (mi['year'] == year):
            exact_year_items.append(mi)
        elif mi['year'] and int(mi['year']) != 0:
            item_year = int(mi['year'])
            if item_year > best_year:
                # BUGFIX: the original discarded the previous best-year items
                # here, silently dropping them from the result. Demote them
                # to the tail instead.
                other_items.extend(best_year_items)
                best_year_items = [mi]
                best_year = item_year
            elif item_year == best_year:
                # BUGFIX: the original re-tested `> max_year` (unreachable);
                # ties with the current best year belong together.
                best_year_items.append(mi)
            else:
                other_items.append(mi)
        else:
            other_items.append(mi)

    exact_year_items.extend(best_year_items)
    exact_year_items.extend(other_items)
    return exact_year_items
78
+
79
+
80
+ def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
81
  """
82
  Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
83
  нормализованные группы.
 
98
  results = []
99
  no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
100
 
101
+ if name_threshold < 50:
102
+ name_threshold = 50
103
+
104
  # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
105
  for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
106
  product_brand = product['brand']
 
114
  items_data = items_groups.get(key, [])
115
  if items_data:
116
  # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
117
+ items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
118
  else:
119
+ items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
120
 
121
  norm_product_name = normalize_name(product_name)
122
  matches = process.extract(
123
+ norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
124
  )
125
+
126
  matched_items = [
127
  {
128
  'item_id': items_ids[idx_candidate],
129
+ 'brand': product_brand,
130
+ 'item_name': items_full_names[idx_candidate],
131
+ #'item_name': items_names[idx_candidate],
132
  'score': score,
133
  'volume': items_volumes[idx_candidate],
134
  'color': item_type_wine[idx_candidate],
 
138
  for match, score, idx_candidate in matches
139
  ]
140
 
141
+
142
+ if matched_items:
143
+ matched_items = order_by_best_year(matched_items, product['year'])
144
+ matched_items = matched_items[:5]
145
+ else:
146
  no_match_products.append((idx, product))
147
 
148
+
149
+
150
  results.append({
151
  'product_id': product['id'],
152
+ #"matched_top_id": top_matched_id,
153
  'matched_items': matched_items,
154
+ #"alternative_top_id": "",
155
+ #'alternative': [] # Заполняется во втором проходе
156
  })
157
 
158
+ if include_alternatives:
159
+ # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
160
+ groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
161
+
162
+ # Второй проход: для продуктов без совпадений ищем по альтернативным группам
163
+ for idx, product in tqdm(no_match_products):
164
+ product_brand = product['brand']
165
+ product_type_wine = product['new_type_wine']
166
+ product_type = product['new_type']
167
+ product_volume = product['volume']
168
+ product_name = product['name']
169
+ product_sour = product['sour']
170
+
171
+ alt_key = (product_type_wine, product_type, product_volume, product_sour)
172
+ type_items = groups_by_alternative_keys.get(alt_key, [])
173
+ # Фильтруем, исключая итемы с исходным брендом
174
+ filtered_items = [item for item in type_items if item[1] != product_brand]
175
+ if filtered_items:
176
+ alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
177
+ else:
178
+ alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
179
+
180
+ norm_product_name = normalize_name(product_name)
181
+ alt_matches = process.extract(
182
+ norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
183
+ )
184
+ alt_matched_items = [
185
+ {
186
+ 'item_id': alt_ids[idx_candidate],
187
+ 'brand': alt_brands[idx_candidate],
188
+ #'item_name': alt_names[idx_candidate],
189
+ 'item_name': alt_full_names[idx_candidate],
190
+ 'score': score / 2,
191
+ 'volume': alt_volumes[idx_candidate],
192
+ 'color': alt_type_wine[idx_candidate],
193
+ 'sour': alt_sour[idx_candidate],
194
+ 'year': alt_year[idx_candidate],
195
+ }
196
+ for match, score, idx_candidate in alt_matches
197
+ ]
198
+
199
+ alt_matched_items = order_by_best_year(alt_matched_items, product['year'])
200
+ alt_matched_items = alt_matched_items[:5]
201
+
202
+ results[idx]['matched_items'].extend(alt_matched_items)
203
+
204
+
205
+ #if alt_matched_items:
206
+ # results[idx]['alternative_top_id'] = alt_matched_items[0]["item_id"]
207
+
208
+ #results[idx]['alternative'] = alt_matched_items
209
 
210
  results_df = pd.DataFrame(results)
211
  merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
processor/processor.py CHANGED
@@ -11,7 +11,7 @@ class Processor():
11
  type_wine, gbs, colors_for_trim, grapes, other_words,
12
  sour_merge_dict, type_merge_dict, color_merge_dict)
13
 
14
- def process(self, products, items, is_items_first=False, th=65):
15
  items, products=self.preprocessor.process(products, items)
16
 
17
  print('-----*-----Matching-----*-----')
@@ -20,9 +20,9 @@ class Processor():
20
  products['new_brand']=products['brand']
21
  items['brand']=items['new_brand']
22
  products_groups = prepare_groups_with_ids(products)
23
- res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
24
  else:
25
  items_groups = prepare_groups_with_ids(items)
26
- res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
27
 
28
  return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
 
11
  type_wine, gbs, colors_for_trim, grapes, other_words,
12
  sour_merge_dict, type_merge_dict, color_merge_dict)
13
 
14
+ def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
15
  items, products=self.preprocessor.process(products, items)
16
 
17
  print('-----*-----Matching-----*-----')
 
20
  products['new_brand']=products['brand']
21
  items['brand']=items['new_brand']
22
  products_groups = prepare_groups_with_ids(products)
23
+ res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th, include_alternatives=include_alternatives)
24
  else:
25
  items_groups = prepare_groups_with_ids(items)
26
+ res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th, include_alternatives=include_alternatives)
27
 
28
  return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
search/matching_judge.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import pandas as pd
3
+ import ast
4
+ import csv
5
+
6
+
7
+
8
def compare_matching_with_manual(products_file, items_file, match_result_file,
                                 manual_result_file,
                                 output_file="C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\mjudge_new.csv"):
    """Judge automatic matching quality against a manually curated matching.

    Reads four CSV files, scores every item, writes a per-item report CSV to
    *output_file* and returns summary counters.

    Args:
        products_file: tab-separated products CSV (columns incl. id, brand,
            name_long, volume, year).
        items_file: semicolon-separated items CSV (columns incl. id, attrs).
        match_result_file: tab-separated output of the automatic matcher
            (columns incl. id, matched_items — a stringified list of dicts).
        manual_result_file: tab-separated manual matching (columns incl.
            item_id, state, product_id); state == 1 marks a confirmed match.
        output_file: where the per-item judge report is written. Defaults to
            the historical hard-coded path for backward compatibility.

    Returns:
        dict with item/product/match/manual row counters.
    """
    products_df = pd.read_csv(products_file, sep="\t")
    items_df = pd.read_csv(items_file, sep=";")
    match_df = pd.read_csv(match_result_file, sep="\t")
    manual_df = pd.read_csv(manual_result_file, sep="\t")

    # .count() of the first column == non-null rows in it (kept from the
    # original; .iloc avoids the deprecated positional Series[0] lookup).
    results = {
        "item_count": int(items_df.count().iloc[0]),
        "product_count": int(products_df.count().iloc[0]),
        "match_count": int(match_df.count().iloc[0]),
        "manual_count": int(manual_df.count().iloc[0]),
    }

    result_list = []

    for _, row in items_df.iterrows():
        result_data = {
            "id": row["id"],
            "match_side": "no_match",
            "auto_score": "",
            "manual_score": "",
            "discuss": "",
        }

        # NOTE(review): assumes every item id is present in match_result_file;
        # .values[0] raises IndexError otherwise — confirm with the matcher.
        auto_match = match_df[match_df['id'] == row["id"]]["matched_items"].values[0]

        # Locate the manually confirmed product (state == 1) for this item.
        manual_match = None
        manual = manual_df[manual_df['item_id'] == row["id"]]['state']
        if (len(manual) > 0) and (manual.values[0] == 1):
            p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
            if len(p.values) > 0:
                manual_match = p
            else:
                print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")

        # A stringified empty list is "[]" (len 2), so len > 2 means non-empty.
        has_auto = (auto_match is not None) and len(auto_match) > 2

        if has_auto and (manual_match is not None):
            result_data["match_side"] = "both"

            manual_id = int(manual_match["id"].values[0])
            auto_match_ns = auto_match.replace(" ", "")
            i1 = auto_match_ns.find("'item_id':")
            i2 = auto_match_ns.find("'item_id':" + str(manual_id) + ",")

            if i1 == i2:
                # Manual product is the top automatic candidate.
                result_data["auto_score"] = 1
                result_data["manual_score"] = 1
            elif i2 >= 0:
                # Manual product present among candidates, but not first.
                result_data["auto_score"] = 0.5
                result_data["manual_score"] = 0.5
        elif has_auto:
            result_data["match_side"] = "only_auto"
        elif manual_match is not None:
            result_data["match_side"] = "only_manual"

        result_data["item"] = row["attrs"]
        result_data["auto_match"] = auto_match

        manual_string = ""
        if manual_match is not None:
            # BUGFIX: the original emitted malformed JSON — it closed quotes
            # on volume/year that were never opened. Quote values consistently.
            manual_string = ('{'
                             '"id": ' + str(manual_match["id"].values[0]) + ', '
                             '"brand": "' + str(manual_match["brand"].values[0]) + '", '
                             '"name": "' + str(manual_match["name_long"].values[0]) + '", '
                             '"volume": "' + str(manual_match["volume"].values[0]) + '", '
                             '"year": "' + str(manual_match["year"].values[0]) + '"}')
        result_data["manual_match"] = manual_string

        result_list.append(result_data)

    pd.DataFrame(result_list).to_csv(output_file)

    print(results)

    return results
+
ui/gradio_ui.py CHANGED
@@ -1,170 +1,175 @@
1
- from argparse import ArgumentError
2
-
3
- import gradio as gr
4
- import pandas as pd
5
- from preprocess.utils.common.utils import get_delimiter
6
- from tmp.utils import update_products_csv #remover,
7
- import os
8
- import datetime, time
9
-
10
-
11
- class GradioUI():
12
-
13
- def __init__(self, processor, searcher, data_path):
14
- self.processor=processor
15
- self.searcher=searcher
16
- self.data_path = data_path
17
-
18
- def get_data_dir(self):
19
- return self.data_path
20
-
21
- def get_products_dir(self):
22
- return os.path.join(self.get_data_dir(), "products")
23
-
24
- def get_items_dir(self):
25
- return os.path.join(self.get_data_dir(), "items")
26
-
27
- def get_results_dir(self):
28
- return os.path.join(self.get_data_dir(), "results")
29
-
30
- def get_products_file_date(self):
31
- fullfn = os.path.join(self.data_path, "products", "products.csv")
32
- if not os.path.isfile(fullfn):
33
- return "Файл Products не найден"
34
-
35
- stinfo = os.stat(fullfn)
36
- return time.ctime(stinfo.st_mtime)
37
-
38
-
39
- def upload_products_file(self, prods_file, overwrite_existing):
40
- try:
41
- if not os.path.exists(self.get_products_dir()):
42
- os.makedirs(self.get_products_dir())
43
-
44
- fullfn = os.path.join(self.get_products_dir(), "products.csv")
45
-
46
- if prods_file != None:
47
- update_products_csv(prods_file, fullfn, overwrite_existing)
48
-
49
- gr.Info("Файл Products успешно загружен")
50
- except Exception as ex:
51
- raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)
52
-
53
-
54
- def process_items(self, items_file, is_items_first, threshold): #, q_id):
55
- try:
56
- prods_file = os.path.join(self.get_products_dir(), "products.csv")
57
- if not os.path.isfile(prods_file):
58
- raise Exception("Файл Products не найден")
59
-
60
- if items_file != None:
61
- items_delimiter=get_delimiter(items_file)
62
- print('items delimiter: '+items_delimiter)
63
- row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
64
- if not 'attrs' in row_items.columns.values:
65
- raise Exception("Uploaded Items CSV does not seem to be valid")
66
-
67
- products_delimiter=get_delimiter(prods_file)
68
- print('products delimiter: '+products_delimiter)
69
- row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
70
-
71
- # if q_id in row_products['id'].unique():
72
- # row_products=row_products[row_products['id']==q_id]
73
-
74
- #print("product id: " + str(q_id))
75
-
76
- df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold)
77
-
78
- self.searcher.set_df(df.copy())
79
- #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
80
- # output_csv = tmp.name
81
- results_path = self.get_results_dir()
82
- if not os.path.exists(results_path):
83
- os.makedirs(results_path)
84
-
85
- output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
86
- output_csv = os.path.join(results_path, output_csv)
87
- df.to_csv(output_csv, sep='\t', index=False)
88
- return output_csv
89
- except Exception as ex:
90
- raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
91
-
92
- def on_page_load(self, r: gr.Request):
93
- m_time = self.get_products_file_date()
94
- return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]
95
-
96
-
97
- def run_ui(self):
98
- with gr.Blocks() as demo:
99
- tabs = gr.Tabs()
100
- with tabs:
101
-
102
- # with gr.Row():
103
- # file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
104
- # process_button = gr.Button("Обновить")
105
-
106
- with gr.TabItem("Загрузка файла Products"):
107
- prod_file_info1 = gr.Markdown("## Загрузка файла Products")
108
- with gr.Row():
109
- file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
110
- with gr.Row():
111
- toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
112
- upload_button = gr.Button("Загрузить файл")
113
- upload_button.click(
114
- fn=self.upload_products_file,
115
- inputs=[file_input1, toggle_input],
116
- #outputs=output_file
117
- )
118
-
119
- # Вкладка для обраб��тки CSV файлов
120
- with gr.TabItem("Обработка каталога поставщика"):
121
- gr.Markdown("## Обработка каталога поставщика")
122
-
123
- m_time = self.get_products_file_date()
124
- prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
125
- with gr.Row():
126
- #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
127
- file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
128
- #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
129
- with gr.Row():
130
- toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
131
- threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
132
- process_button = gr.Button("Загрузить файл с каталогом и сравнить")
133
- output_file = gr.File(label="Скачать результат (CSV)")
134
- process_button.click(
135
- fn=self.process_items,
136
- inputs=[file_items, toggle_input, threshold_input], #, search_number],
137
- outputs=output_file
138
- )
139
-
140
- # Вкладка для поиска
141
- with gr.TabItem("Поиск в обработанном csv"):
142
- gr.Markdown("## Поиск")
143
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
144
- search_button = gr.Button("Найти")
145
- search_table = gr.Dataframe(label="Результаты поиска")
146
- search_button.click(
147
- fn=self.searcher.search,
148
- inputs=[search_number],
149
- outputs=search_table
150
- )
151
-
152
- with gr.TabItem("Загрузка результат и поиск в нем"):
153
- gr.Markdown("## Поиск")
154
- with gr.Row():
155
- input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
156
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
157
- search_button = gr.Button("Найти")
158
- search_table = gr.Dataframe(label="Результаты поиска")
159
- search_button.click(
160
- fn=self.searcher.search_in_uploaded_file,
161
- inputs=[input_path, search_number],
162
- outputs=search_table
163
- )
164
-
165
- #with gr.TabItem("Удалить сохраненные продукты"):
166
- # del_button = gr.Button("Удалить")
167
- # process_button.click(fn=remover)
168
-
169
- demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
 
 
 
 
 
170
  demo.launch()
 
1
+ from argparse import ArgumentError
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from preprocess.utils.common.utils import get_delimiter
6
+ from tmp.utils import update_products_csv #remover,
7
+ import os
8
+ import datetime, time
9
+
10
+
11
class GradioUI():
    """Gradio web UI for maintaining a Products catalogue, matching supplier
    item catalogues against it, and searching the matching results.

    Collaborators (injected via the constructor):
      processor -- object exposing ``process(products_df, items_df,
                   is_items_first, threshold, include_alternatives)`` and
                   returning ``(result_df, items, products)``
      searcher  -- object exposing ``set_df``, ``search`` and
                   ``search_in_uploaded_file`` used by the search tabs
      data_path -- root directory containing the ``products``, ``items`` and
                   ``results`` sub-directories
    """

    def __init__(self, processor, searcher, data_path):
        self.processor = processor
        self.searcher = searcher
        self.data_path = data_path

    def get_data_dir(self):
        """Return the root data directory."""
        return self.data_path

    def get_products_dir(self):
        """Return the directory holding the canonical products.csv."""
        return os.path.join(self.get_data_dir(), "products")

    def get_items_dir(self):
        """Return the directory for uploaded supplier item files."""
        return os.path.join(self.get_data_dir(), "items")

    def get_results_dir(self):
        """Return the directory where matching results are written."""
        return os.path.join(self.get_data_dir(), "results")

    def get_products_file_date(self):
        """Return the modification time of products.csv as a human-readable
        string, or a (Russian) "file not found" message when it is missing."""
        fullfn = os.path.join(self.data_path, "products", "products.csv")
        if not os.path.isfile(fullfn):
            return "Файл Products не найден"
        return time.ctime(os.stat(fullfn).st_mtime)

    def upload_products_file(self, prods_file, overwrite_existing):
        """Merge (or overwrite) an uploaded CSV into products/products.csv.

        prods_file         -- path of the uploaded file, or None when nothing
                              was selected in the UI
        overwrite_existing -- truthy: replace the stored file instead of merging

        Raises gr.Error (shown as a toast) on any failure.
        """
        try:
            # exist_ok avoids the check-then-create race of the previous
            # os.path.exists() + makedirs() pair.
            os.makedirs(self.get_products_dir(), exist_ok=True)
            fullfn = os.path.join(self.get_products_dir(), "products.csv")

            if prods_file is not None:
                update_products_csv(prods_file, fullfn, overwrite_existing)
                # Success toast only after an actual upload; previously it was
                # shown even when no file had been selected.
                gr.Info("Файл Products успешно загружен")
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def process_items(self, items_file, is_items_first, threshold, include_alternatives):  # , q_id):
        """Match an uploaded supplier items CSV against the stored products
        file and return the path of the resulting tab-separated CSV.

        items_file           -- path of the uploaded items CSV
        is_items_first       -- invert the search direction (checkbox value)
        threshold            -- matching score threshold, 0..100
        include_alternatives -- include alternative match candidates in output

        Raises gr.Error (shown as a toast) on any failure.
        """
        try:
            prods_file = os.path.join(self.get_products_dir(), "products.csv")
            if not os.path.isfile(prods_file):
                raise Exception("Файл Products не найден")
            if items_file is None:
                # Previously a missing upload fell through and returned None,
                # leaving the output component empty with no explanation.
                raise Exception("Файл Items не найден")

            items_delimiter = get_delimiter(items_file)
            print('items delimiter: ' + items_delimiter)
            row_items = pd.read_csv(items_file, sep=items_delimiter)
            if 'attrs' not in row_items.columns.values:
                raise Exception("Uploaded Items CSV does not seem to be valid")

            products_delimiter = get_delimiter(prods_file)
            print('products delimiter: ' + products_delimiter)
            row_products = pd.read_csv(prods_file, sep=products_delimiter)

            df, items, products = self.processor.process(
                row_products, row_items, is_items_first, threshold, include_alternatives)

            # Keep a copy so the "search in processed csv" tab can query it.
            self.searcher.set_df(df.copy())

            results_path = self.get_results_dir()
            os.makedirs(results_path, exist_ok=True)

            output_csv = ("m1-" + str(threshold) + "-"
                          + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv")
            output_csv = os.path.join(results_path, output_csv)
            df.to_csv(output_csv, sep='\t', index=False)
            return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def on_page_load(self, r: gr.Request):
        """Refresh the 'last updated' labels on both tabs when the page loads."""
        m_time = self.get_products_file_date()
        label = f"Дата последнего обновления файла Products: {m_time}"
        return [label, label]

    def run_ui(self):
        """Build the Gradio Blocks layout and launch the app (blocking call)."""
        with gr.Blocks() as demo:
            tabs = gr.Tabs()
            with tabs:

                # Tab: process a supplier catalogue against the stored products.
                with gr.TabItem("Обработка каталога поставщика"):
                    gr.Markdown("## Обработка каталога поставщика")

                    m_time = self.get_products_file_date()
                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
                    with gr.Row():
                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_invert = gr.Checkbox(label="Инвертировать поиск", value=True)
                        toggle_alternative = gr.Checkbox(label="Включать в результаты альтернативные варианты", value=True)
                        threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_items,
                        inputs=[file_items, toggle_invert, threshold_input, toggle_alternative],
                        outputs=output_file
                    )

                # Tab: upload/merge the Products file.
                with gr.TabItem("Загрузка файла Products"):
                    prod_file_info1 = gr.Markdown("## Загрузка файла Products")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        # Renamed from toggle_input: no longer shadows the
                        # checkbox of the processing tab above.
                        toggle_overwrite = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
                    upload_button = gr.Button("Загрузить файл")
                    upload_button.click(
                        fn=self.upload_products_file,
                        inputs=[file_input1, toggle_overwrite],
                    )

                # Tab: search the most recently processed result by product id.
                with gr.TabItem("Поиск в обработанном csv"):
                    gr.Markdown("## Поиск")
                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search,
                        inputs=[search_number],
                        outputs=search_table
                    )

                # Tab: upload a previously produced result file and search it.
                with gr.TabItem("Загрузка результат и поиск в нем"):
                    gr.Markdown("## Поиск")
                    with gr.Row():
                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                    # Distinct names: do not shadow the widgets of the
                    # previous search tab.
                    file_search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    file_search_button = gr.Button("Найти")
                    file_search_table = gr.Dataframe(label="Результаты поиска")
                    file_search_button.click(
                        fn=self.searcher.search_in_uploaded_file,
                        inputs=[input_path, file_search_number],
                        outputs=file_search_table
                    )

            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
        demo.launch()