API and Changes

#2
by j-s-v - opened
.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore DELETED
@@ -1,3 +0,0 @@
1
- *.pyc
2
- .idea/*
3
- _data/*
 
 
 
 
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: Product Matching
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.19.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Product Matching
3
+ emoji: 🏃
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.19.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.py DELETED
@@ -1,205 +0,0 @@
1
- import csv
2
- import json
3
- import os
4
- import datetime
5
-
6
- from processor.processor import Processor
7
- from constants.constants import *
8
- from search.search_by_id import Searcher
9
- from fastapi import FastAPI, File, UploadFile, HTTPException
10
- import uvicorn
11
- from pydantic import BaseModel
12
- import pandas as pd
13
- from tmp.utils import update_products_csv
14
- from search.matching_judge import compare_matching_with_manual
15
-
16
- '''compare_matching_with_manual("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New5)\\products.csv",
17
- "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\ws-items-for-test.csv",
18
- "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\m1-50-250325-133739.csv",
19
- "C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\matching-20250318.csv")'''
20
-
21
-
22
- processor=Processor(LONG_TYPES_LIST,
23
- SHORT_TYPES_LIST,
24
- SOUR,
25
- WINE_TYPES,
26
- GBS,
27
- COLORS_FOR_TRIM,
28
- GRAPES,
29
- OTHER_WORDS,
30
- SOUR_MERGE_DICT,
31
- TYPES_WINES_DICT,
32
- COLOR_MERGE_DICT)
33
-
34
- searcher=Searcher()
35
-
36
- class item_by_id(BaseModel):
37
- result_file: str
38
- id: str
39
-
40
-
41
- class match_request(BaseModel):
42
- items: str
43
- threshold: int
44
- items_first: int
45
-
46
-
47
- def get_data_dir():
48
- return "/home/user/app/_data/"
49
- #return "_data"
50
-
51
- def get_products_dir():
52
- return os.path.join(get_data_dir(), "products")
53
-
54
- def get_items_dir():
55
- return os.path.join(get_data_dir(), "items")
56
-
57
- def get_results_dir():
58
- return os.path.join(get_data_dir(), "results")
59
-
60
-
61
- app = FastAPI()
62
-
63
- @app.get("/api/get_result_csv")
64
- async def get_result_csv():
65
- results = []
66
- for file in os.listdir(get_results_dir()):
67
- if file.endswith(".csv"):
68
- results.append(file)
69
-
70
- results_json = json.dumps(results)
71
- return results_json
72
-
73
-
74
- @app.post("/api/upload_result_csv")
75
- async def upload_result_csv(file: UploadFile = File(...)):
76
- try:
77
- contents = file.file.read()
78
-
79
- with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
80
- f.write(contents)
81
- except Exception:
82
- raise HTTPException(status_code=500, detail='Something went wrong')
83
- finally:
84
- file.file.close()
85
-
86
- return {"message": f"Successfully uploaded {file.filename}"}
87
-
88
-
89
- @app.post("/api/upload_products_csv")
90
- async def upload_products_csv(file: UploadFile, overwrite_existing: int):
91
- try:
92
- datadir = get_products_dir()
93
- if not os.path.exists(datadir):
94
- os.makedirs(datadir)
95
-
96
- tempfile = os.path.join(datadir, "products.csv_upload")
97
-
98
- contents = file.file.read()
99
-
100
- with open(tempfile, 'wb') as f:
101
- f.write(contents)
102
-
103
- fullfn = os.path.join(datadir, "products.csv")
104
- update_products_csv(tempfile, fullfn, overwrite_existing)
105
-
106
- os.remove(tempfile)
107
-
108
- except Exception:
109
- raise HTTPException(status_code=500, detail='Something went wrong')
110
- finally:
111
- file.file.close()
112
-
113
- return {"message": f"Successfully uploaded {file.filename}"}
114
-
115
-
116
- #@app.post("/api/upload_items_csv")
117
- def upload_items_csv(file: UploadFile):
118
- try:
119
- itemsdir = get_items_dir()
120
-
121
- if not os.path.exists(itemsdir):
122
- os.makedirs(itemsdir)
123
-
124
- contents = file.file.read()
125
-
126
- fullfn = os.path.join(itemsdir, file.filename)
127
- with open(fullfn, 'wb') as f:
128
- f.write(contents)
129
- except Exception:
130
- raise HTTPException(status_code=500, detail='Something went wrong')
131
- finally:
132
- file.file.close()
133
-
134
- #return {"message": f"Successfully uploaded {file.filename}"}
135
- return fullfn
136
-
137
-
138
- @app.get("/api/get_items_csv")
139
- async def get_items_csv():
140
- itemsdir = get_items_dir()
141
-
142
- results = []
143
- for file in os.listdir(itemsdir):
144
- if file.endswith(".csv"):
145
- results.append(file)
146
-
147
- results_json = json.dumps(results)
148
- return results_json
149
-
150
-
151
- @app.post("/api/match")
152
- async def match(items_file: UploadFile, threshold: int, items_first: int):
153
- prods_file = os.path.join(get_products_dir(), "products.csv")
154
- if not os.path.isfile(prods_file):
155
- return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
156
-
157
- items_fn = upload_items_csv(items_file)
158
- #if len(r.items) == 0:
159
- # return {"Status": "Error", "ErrorDesc": "Items file not specified"}
160
-
161
- if not threshold:
162
- threshold = 50
163
-
164
- #items_fn = os.path.join(get_items_dir(), r.items)
165
- #if not os.path.isfile(items_fn):
166
- # return {"Status": "Error", "ErrorDesc": "Items file not found"}
167
-
168
- row_items = pd.read_csv(items_fn, sep='\t')
169
- os.remove(items_fn)
170
-
171
- row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
172
-
173
-
174
- df, items, products = processor.process(row_products, row_items, items_first, threshold)
175
-
176
- results_dir = get_results_dir()
177
- if not os.path.exists(results_dir):
178
- os.makedirs(results_dir)
179
-
180
- output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
181
- df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
182
-
183
- return {"Status": "Success", "result_file" : output_csv}
184
-
185
-
186
- @app.get("/api/get_matched_by_id")
187
- async def get_matched_by_id(item: item_by_id):
188
- fullfn = os.path.join(get_results_dir(), item.result_file)
189
- if not os.path.isfile(fullfn):
190
- return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}
191
-
192
- (df, is_alternative) = searcher.search(fullfn, int(item.id))
193
- if df.empty:
194
- return {"Status": "Success", "IsAlternative": False, "Data": ""}
195
-
196
- return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
197
-
198
-
199
- if __name__ == "__main__":
200
- uvicorn.run(
201
- app,
202
- host="0.0.0.0",
203
- port=8000,
204
- log_level="debug"
205
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py CHANGED
@@ -1,33 +1,31 @@
1
- from processor.processor import Processor
2
- from constants.constants import *
3
- from ui.gradio_ui import GradioUI
4
- from search.search_by_id import Searcher
5
-
6
- processor=Processor(LONG_TYPES_LIST,
7
- SHORT_TYPES_LIST,
8
- SOUR,
9
- WINE_TYPES,
10
- GBS,
11
- COLORS_FOR_TRIM,
12
- GRAPES,
13
- OTHER_WORDS,
14
- SOUR_MERGE_DICT,
15
- TYPES_WINES_DICT,
16
- COLOR_MERGE_DICT,
17
- COUNTRY_LIST,
18
- NORMALIZED_NAMES_ALTERNATIVES_DICT
19
- )
20
-
21
- searcher=Searcher()
22
-
23
- ui=GradioUI(processor, searcher, "/home/user/app/_data/")
24
- #ui=GradioUI(processor, searcher, "_data")
25
- ui.run_ui()
26
-
27
-
28
-
29
-
30
-
31
-
32
-
33
-
 
1
+ from processor.processor import Processor
2
+ from constants.constants import *
3
+ from ui.gradio_ui import GradioUI
4
+ from search.search_by_id import Searcher
5
+
6
+
7
+
8
+ processor=Processor(LONG_TYPES_LIST,
9
+ SHORT_TYPES_LIST,
10
+ SOUR,
11
+ WINE_TYPES,
12
+ GBS,
13
+ COLORS_FOR_TRIM,
14
+ GRAPES,
15
+ OTHER_WORDS,
16
+ SOUR_MERGE_DICT,
17
+ TYPES_WINES_DICT,
18
+ COLOR_MERGE_DICT)
19
+
20
+ searcher=Searcher()
21
+
22
+ ui=GradioUI(processor, searcher)
23
+ ui.run_ui()
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
 
 
constants/constants.py CHANGED
@@ -75,9 +75,7 @@ SOUR = [
75
  'п/сл',
76
  'п/с',
77
  'сл',
78
- 'сл.',
79
  'сух',
80
- 'сух.'
81
  ]
82
 
83
 
@@ -87,8 +85,7 @@ WINE_TYPES = [
87
  'розовое',
88
  'роз',
89
  'кр',
90
- 'крас',
91
- 'бел',
92
  'розе',
93
  'rosso',
94
  'roso',
@@ -154,13 +151,11 @@ GBS = [
154
 
155
  COLORS_FOR_TRIM = [
156
  'красное',
157
- 'крас',
158
- 'кр',
159
  'белое',
 
 
160
  'бел',
161
- 'розовое',
162
  'розе',
163
- 'rose',
164
  'rosso',
165
  'roso',
166
  'roseto',
@@ -212,8 +207,6 @@ GRAPES = [
212
 
213
 
214
  OTHER_WORDS=[
215
- "Шампанское",
216
- "Шампань",
217
  "Игристое",
218
  "Жемчужное",
219
  "Газированный",
@@ -234,7 +227,6 @@ OTHER_WORDS=[
234
  "Десертный",
235
  "Вкус",
236
  "Сорт",
237
- "односолод."
238
  ]
239
 
240
 
@@ -244,14 +236,10 @@ SOUR_MERGE_DICT={
244
  'sweet':'сладкое',
245
  'сухое':'сухое',
246
  'п/сух':'полусухое',
247
- 'п/сух.':'полусухое',
248
  'п/сл':'полусладкое',
249
- 'п/сл.':'полусладкое',
250
  'п/с':'полусухое',
251
  'сл':'сладкое',
252
- 'сл.':'сладкое',
253
  'сух':'сухое',
254
- 'сух.':'сухое',
255
  None: 'unmatched',
256
  }
257
 
@@ -265,8 +253,7 @@ TYPES_WINES_DICT={
265
  'Сироп':'Сиропы',
266
  'Арманьяк':'Коньяк',
267
  'Бренди':'Коньяк',
268
- 'Ликер':'Ликер',
269
- 'Ликёр': 'Ликер',
270
  'Граппа':'Водка',
271
  'Настойка':'Водка',
272
  'Конфеты':'Сладости',
@@ -276,13 +263,11 @@ TYPES_WINES_DICT={
276
  'Винный напиток': "Вино",
277
  "Игристое вино":'Шампанское',
278
  "Самогон": "Водка",
279
- None: 'unmatched'
280
  }
281
 
282
 
283
  COLOR_MERGE_DICT={
284
  "кр":'красное',
285
- "крас":'красное',
286
  "red":"красное",
287
  "бел":"белое",
288
  "white":"белое",
@@ -298,15 +283,3 @@ COLOR_MERGE_DICT={
298
  None: 'unmatched'
299
  }
300
 
301
- COUNTRY_LIST=[
302
- "Франция",
303
- "Испания",
304
- "Италия",
305
- "Шотландия",
306
- ]
307
-
308
- NORMALIZED_NAMES_ALTERNATIVES_DICT={
309
- "M&H" : ["em end ejch"],
310
- "peats beast" : ["pits bist"],
311
- "xo": ["ho"]
312
- }
 
75
  'п/сл',
76
  'п/с',
77
  'сл',
 
78
  'сух',
 
79
  ]
80
 
81
 
 
85
  'розовое',
86
  'роз',
87
  'кр',
88
+ 'бел',
 
89
  'розе',
90
  'rosso',
91
  'roso',
 
151
 
152
  COLORS_FOR_TRIM = [
153
  'красное',
 
 
154
  'белое',
155
+ 'розовое'
156
+ 'кр',
157
  'бел',
 
158
  'розе',
 
159
  'rosso',
160
  'roso',
161
  'roseto',
 
207
 
208
 
209
  OTHER_WORDS=[
 
 
210
  "Игристое",
211
  "Жемчужное",
212
  "Газированный",
 
227
  "Десертный",
228
  "Вкус",
229
  "Сорт",
 
230
  ]
231
 
232
 
 
236
  'sweet':'сладкое',
237
  'сухое':'сухое',
238
  'п/сух':'полусухое',
 
239
  'п/сл':'полусладкое',
 
240
  'п/с':'полусухое',
241
  'сл':'сладкое',
 
242
  'сух':'сухое',
 
243
  None: 'unmatched',
244
  }
245
 
 
253
  'Сироп':'Сиропы',
254
  'Арманьяк':'Коньяк',
255
  'Бренди':'Коньяк',
256
+ 'Ликер':'Ликеры',
 
257
  'Граппа':'Водка',
258
  'Настойка':'Водка',
259
  'Конфеты':'Сладости',
 
263
  'Винный напиток': "Вино",
264
  "Игристое вино":'Шампанское',
265
  "Самогон": "Водка",
 
266
  }
267
 
268
 
269
  COLOR_MERGE_DICT={
270
  "кр":'красное',
 
271
  "red":"красное",
272
  "бел":"белое",
273
  "white":"белое",
 
283
  None: 'unmatched'
284
  }
285
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/preprocess.py CHANGED
@@ -1,243 +1,224 @@
1
- import json
2
- from tqdm import tqdm
3
- from preprocess.utils.items.attrs import *
4
- from preprocess.utils.common.extracters import *
5
- from preprocess.utils.common.brand_matching import *
6
- from preprocess.utils.common.parallel_brand_matching import *
7
- from preprocess.utils.common.utils import *
8
- from preprocess.utils.common.top_inserts import *
9
- import pandas as pd
10
-
11
-
12
-
13
- class Preprocessor():
14
-
15
- def __init__(self, long_types_list, short_types_list, sour_list,
16
- type_wine, gbs, colors_for_trim, grapes, other_words,
17
- sour_merge_dict, type_merge_dict, color_merge_dict,
18
- country_list, normalized_names_dict):
19
-
20
- self.long_types_list=long_types_list
21
- self.short_types_list=short_types_list
22
- self.sour=sour_list
23
- self.type_wine=type_wine
24
- self.gbs=gbs
25
- self.colors_ft=colors_for_trim
26
- self.grapes=grapes
27
- self.other_words=other_words
28
-
29
- self.types_n_others=long_types_list+other_words+sour_list+country_list
30
- self.types_n_others.remove("Шерри")
31
-
32
- self.sour_dict=sour_merge_dict
33
- self.type_dict=type_merge_dict
34
- self.color_merge_dict=color_merge_dict
35
- self.country_list = country_list
36
- self.normalized_names_dict=normalized_names_dict
37
-
38
-
39
- def preprocess_name(self, name):
40
- return name.replace("\n", " ")
41
-
42
-
43
- def process_items(self, df):
44
- result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
45
- #counter=0
46
- for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
47
-
48
- try:
49
- i=json.loads(i)
50
- result['id'].append(idf)
51
- if 'brand' in i.keys():
52
- result['brand'].append(i['brand'])
53
- else: result['brand'].append(None)
54
-
55
- name = self.preprocess_name(i['name'])
56
- result['name'].append(name)
57
- result['fullname'].append(name)
58
- drink_type=get_type(i, self.long_types_list)
59
- if drink_type is None:
60
- drink_type=check_spark(i)
61
- if drink_type is None:
62
- drink_type=check_color_and_sour(i)
63
- if drink_type is None:
64
- drink_type=check_spark(i, col_name='type_wine')
65
- if drink_type is None:
66
- drink_type=check_color_and_sour(i, types=self.sour)
67
- if drink_type is None:
68
- drink_type=check_color_and_sour(i, col_name='name')
69
- #if 'type' in i.keys():
70
- result['type'].append(drink_type)#i['type'])
71
- #else: dd['type'].append(None)
72
- if 'volume' in i.keys():
73
- result['volume'].append(i['volume'])
74
- else:
75
- vol=extract_volume_or_number(i['name'])
76
- result['volume'].append(vol)
77
- if 'year' in i.keys():
78
- result['year'].append(i['year'])
79
- else:
80
- year=extract_production_year(i['name'])
81
- result['year'].append(year)
82
- alco=extract_alcohol_content(i['name'])
83
- if 'type_wine' in i.keys():
84
- result['type_wine'].append(i['type_wine'])
85
- else: result['type_wine'].append(None)
86
- #f alco is not None:
87
- result['alco'].append(alco)
88
- #else: dd['type_wine'].append(None)
89
- except Exception as ex:
90
- print(idf, ex)
91
- return pd.DataFrame(result)
92
-
93
-
94
- def process_products(self, products):
95
- result={'id':[], 'brand':[], 'name':[], 'fullname':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
96
- for idx, row in tqdm(products.iterrows()):
97
- try:
98
- result['id'].append(row['id'])
99
- result['brand'].append(row['brand'])
100
- result['type_wine'].append(row['category'])
101
- result['type'].append(row['product_type'])
102
- result['name'].append(row['name_long'])
103
- result['fullname'].append(row['name_long'])
104
- vol=extract_volume_or_number(row['name'])
105
- result['volume'].append(vol)
106
- #year=extract_production_year(row['name'])
107
- year=extract_production_year(str(row['name_postfix']))
108
- result['year'].append(year)
109
- #rr['year'].append(row['name_postfix'])
110
- alco=extract_alcohol_content(row['name'])
111
- #f alco is not None:
112
- result['alco'].append(alco)
113
- except Exception as ex:
114
- print(ex)
115
- return pd.DataFrame(result)
116
-
117
-
118
- def prcess_text(self, text):
119
- #text=''+origin
120
- #text=str(split_russian_and_english(text))
121
- gb=find_full_word(text, self.gbs)#get_GB(text)
122
- if gb is not None:
123
- text=text.replace(str(gb), '')
124
-
125
- alcohol = extract_alcohol_content(text)
126
- if alcohol is not None:
127
- alco_w_comma=alcohol.replace('.', ',')
128
- text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
129
- volume_or_number = extract_volume_or_number(text)
130
- if volume_or_number is not None:
131
- volume_with_comma=str(volume_or_number).replace('.', ',')
132
- text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
133
- text = re.sub(r'\s+\b[лЛlL].\b', '', text)
134
- text = re.sub(r'\s+\b[лЛlL]\b', '', text)
135
- test=clean_wine_name(text) #remove_l(text)
136
- #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
137
- # else:
138
- # volume_or_number=re_extract_volume(text)
139
- # if volume_or_number is not None:
140
- # volume_with_comma=volume_or_number.replace('.', ',')
141
- # text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
142
- years = extract_years(text)
143
- if years is not None:
144
- text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
145
- production_year = extract_production_year(text)
146
- if production_year is not None:
147
- text=text.replace(str(production_year), '')
148
-
149
-
150
- color=find_full_word(text, self.colors_ft)
151
- if color is not None:
152
- text=text.replace(str(color), '')
153
- sour=find_full_word(text, self.sour) #get_sour(text)
154
- if sour is not None:
155
- text=text.replace(str(sour), '')
156
- # re_extracted_volume=re_extract_volume(text)
157
- # if re_extracted_volume is not None:
158
- # volume_with_comma=re_extracted_volume.replace('.', ',')
159
- # text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
160
-
161
- # else:
162
- # re_extracted_volume=re_extract_volume(str(volume_or_number))
163
- # volume_or_number=re_extracted_volume
164
-
165
- return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
166
-
167
-
168
- def process(self, products, items):
169
-
170
- print('------*-----Prepare items catalogue-----*-----')
171
- items=self.process_items(items.copy())
172
- print('-----*-----Prepare products catalogue-----*-----')
173
- products=self.process_products(products.copy())
174
-
175
- items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
176
- products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
177
-
178
- print('-----*-----Split n match-----*-----')
179
- splited=split_n_match(products, items)
180
- items["brand"] = items["brand"].replace(splited)
181
-
182
- print('-----*-----Fill brands in items-----*-----')
183
- fill_brands_in_dataframe(products['brand'].unique(), items)
184
-
185
- print('-----*-----Brand matching-----*-----')
186
- comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
187
- out_prods=list(set(prod_brand_list)-set(comp_list))
188
- out_items=list(set(items_brand_list)-set(comp_list))
189
- brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
190
- items["new_brand"] = items["new_brand"].replace(brand_map_improved)
191
-
192
- items['type']=items['type'].replace(self.type_dict)
193
-
194
- print('-----*-----Unwrap brand cats step 1-----*-----')
195
- unwrap_b_match=unwrap_brands(products)
196
- items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
197
- products["brand"] = products["brand"].replace(unwrap_b_match)
198
-
199
- print('-----*-----Unwrap brand cats step 2-----*-----')
200
- unwrap_b_match=unwrap_brands(products)
201
- items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
202
- products["brand"] = products["brand"].replace(unwrap_b_match)
203
-
204
- print('-----*-----Finding brands in names-----*-----')
205
- items['new_brand']=items['new_brand'].replace('none', None)
206
- i_brands=items[items['new_brand'].isna()]['name'].values
207
- p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
208
- new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
209
- items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
210
-
211
- print('-----*-----Top inserts-----*-----')
212
- process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
213
- self.grapes, self.other_words)
214
-
215
- print('-----*-----Adding service categories-----*-----')
216
- merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
217
- merge_types(items, products, type_merge_dict=self.type_dict)
218
- merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
219
- merge_types(products, products, type_merge_dict=self.type_dict)
220
-
221
-
222
- print('-----*-----Name trimming-----*-----')
223
- item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
224
- #items['name']=items['id'].replace(item_timed_names)
225
- items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
226
- items['gb']=gb
227
- items['sour']=sour
228
- items['sour']=items['sour'].replace(self.sour_dict)
229
-
230
- products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
231
- products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
232
- products['gb']=gb
233
- products['sour']=sour
234
- products['sour']=products['sour'].replace(self.sour_dict)
235
-
236
- print('-----*-----Replacing product types-----*-----')
237
- products['type']=products['type'].replace(self.type_dict)
238
-
239
- return items, products
240
-
241
-
242
-
243
-
 
1
+ import json
2
+ from tqdm import tqdm
3
+ from preprocess.utils.items.attrs import *
4
+ from preprocess.utils.common.extracters import *
5
+ from preprocess.utils.common.brand_matching import *
6
+ from preprocess.utils.common.parallel_brand_matching import *
7
+ from preprocess.utils.common.utils import *
8
+ from preprocess.utils.common.top_inserts import *
9
+ import pandas as pd
10
+
11
+
12
+
13
+ class Preprocessor():
14
+
15
+ def __init__(self, long_types_list, short_types_list, sour_list,
16
+ type_wine, gbs, colors_for_trim, grapes, other_words,
17
+ sour_merge_dict, type_merge_dict, color_merge_dict):
18
+
19
+ self.long_types_list=long_types_list
20
+ self.short_types_list=short_types_list
21
+ self.sour=sour_list
22
+ self.type_wine=type_wine
23
+ self.gbs=gbs
24
+ self.colors_ft=colors_for_trim
25
+ self.grapes=grapes
26
+ self.other_words=other_words
27
+ self.types_n_others=long_types_list+other_words
28
+ self.sour_dict=sour_merge_dict
29
+ self.type_dict=type_merge_dict
30
+ self.color_merge_dict=color_merge_dict
31
+
32
+
33
+ def process_items(self, df):
34
+ result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
35
+ #counter=0
36
+ for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
37
+
38
+ try:
39
+ i=json.loads(i)
40
+ result['id'].append(idf)
41
+ if 'brand' in i.keys():
42
+ result['brand'].append(i['brand'])
43
+ else: result['brand'].append(None)
44
+ result['name'].append(i['name'])
45
+ drink_type=get_type(i, self.long_types_list)
46
+ if drink_type is None:
47
+ drink_type=check_spark(i)
48
+ if drink_type is None:
49
+ drink_type=check_color_and_sour(i)
50
+ if drink_type is None:
51
+ drink_type=check_spark(i, col_name='type_wine')
52
+ if drink_type is None:
53
+ drink_type=check_color_and_sour(i, types=self.sour)
54
+ #if 'type' in i.keys():
55
+ result['type'].append(drink_type)#i['type'])
56
+ #else: dd['type'].append(None)
57
+ if 'volume' in i.keys():
58
+ result['volume'].append(i['volume'])
59
+ else:
60
+ vol=extract_volume_or_number(i['name'])
61
+ result['volume'].append(vol)
62
+ if 'year' in i.keys():
63
+ result['year'].append(i['year'])
64
+ else:
65
+ year=extract_production_year(i['name'])
66
+ result['year'].append(year)
67
+ alco=extract_alcohol_content(i['name'])
68
+ if 'type_wine' in i.keys():
69
+ result['type_wine'].append(i['type_wine'])
70
+ else: result['type_wine'].append(None)
71
+ #f alco is not None:
72
+ result['alco'].append(alco)
73
+ #else: dd['type_wine'].append(None)
74
+ except Exception as ex:
75
+ print(idf, ex)
76
+ return pd.DataFrame(result)
77
+
78
+
79
+ def process_products(self, products):
80
+ result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
81
+ for idx, row in tqdm(products.iterrows()):
82
+ try:
83
+ result['id'].append(row['id'])
84
+ result['brand'].append(row['brand'])
85
+ result['type_wine'].append(row['category'])
86
+ result['type'].append(row['product_type'])
87
+ result['name'].append(row['name_long'])
88
+ vol=extract_volume_or_number(row['name'])
89
+ result['volume'].append(vol)
90
+ #year=extract_production_year(row['name'])
91
+ year=extract_production_year(str(row['name_postfix']))
92
+ result['year'].append(year)
93
+ #rr['year'].append(row['name_postfix'])
94
+ alco=extract_alcohol_content(row['name'])
95
+ #f alco is not None:
96
+ result['alco'].append(alco)
97
+ except Exception as ex:
98
+ print(ex)
99
+ return pd.DataFrame(result)
100
+
101
+
102
+ def prcess_text(self, text):
103
+ #text=''+origin
104
+ #text=str(split_russian_and_english(text))
105
+ gb=find_full_word(text, self.gbs)#get_GB(text)
106
+ if gb is not None:
107
+ text=text.replace(str(gb), '')
108
+
109
+ alcohol = extract_alcohol_content(text)
110
+ if alcohol is not None:
111
+ alco_w_comma=alcohol.replace('.', ',')
112
+ text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
113
+ volume_or_number = extract_volume_or_number(text)
114
+ if volume_or_number is not None:
115
+ volume_with_comma=str(volume_or_number).replace('.', ',')
116
+ text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
117
+ test=clean_wine_name(text) #remove_l(text)
118
+ #text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
119
+ # else:
120
+ # volume_or_number=re_extract_volume(text)
121
+ # if volume_or_number is not None:
122
+ # volume_with_comma=volume_or_number.replace('.', ',')
123
+ # text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
124
+ years = extract_years(text)
125
+ if years is not None:
126
+ text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
127
+ production_year = extract_production_year(text)
128
+ if production_year is not None:
129
+ text=text.replace(str(production_year), '')
130
+
131
+
132
+ color=find_full_word(text, self.colors_ft)
133
+ if color is not None:
134
+ text=text.replace(str(color), '')
135
+ sour=find_full_word(text, self.sour) #get_sour(text)
136
+ if sour is not None:
137
+ text=text.replace(str(sour), '')
138
+ # re_extracted_volume=re_extract_volume(text)
139
+ # if re_extracted_volume is not None:
140
+ # volume_with_comma=re_extracted_volume.replace('.', ',')
141
+ # text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
142
+
143
+ # else:
144
+ # re_extracted_volume=re_extract_volume(str(volume_or_number))
145
+ # volume_or_number=re_extracted_volume
146
+
147
+ return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
148
+
149
+
150
+ def process(self, products, items):
151
+
152
+ print('------*-----Prepare items catalogue-----*-----')
153
+ items=self.process_items(items.copy())
154
+ print('-----*-----Prepare products catalogue-----*-----')
155
+ products=self.process_products(products.copy())
156
+
157
+ items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
158
+ products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
159
+
160
+ print('-----*-----Split n match-----*-----')
161
+ splited=split_n_match(products, items)
162
+ items["brand"] = items["brand"].replace(splited)
163
+
164
+ print('-----*-----Fill brands in items-----*-----')
165
+ fill_brands_in_dataframe(products['brand'].unique(), items)
166
+
167
+ print('-----*-----Brand matching-----*-----')
168
+ comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
169
+ out_prods=list(set(prod_brand_list)-set(comp_list))
170
+ out_items=list(set(items_brand_list)-set(comp_list))
171
+ brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
172
+ items["new_brand"] = items["new_brand"].replace(brand_map_improved)
173
+
174
+ items['type']=items['type'].replace(self.type_dict)
175
+
176
+ print('-----*-----Unwrap brend cats step 1-----*-----')
177
+ unwrap_b_match=unwrap_brands(products)
178
+ items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
179
+ products["brand"] = products["brand"].replace(unwrap_b_match)
180
+
181
+ print('-----*-----Unwrap brend cats step 2-----*-----')
182
+ unwrap_b_match=unwrap_brands(products)
183
+ items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
184
+ products["brand"] = products["brand"].replace(unwrap_b_match)
185
+
186
+ print('-----*-----Finding brands in names-----*-----')
187
+ items['new_brand']=items['new_brand'].replace('none', None)
188
+ i_brands=items[items['new_brand'].isna()]['name'].values
189
+ p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
190
+ new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
191
+ items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
192
+
193
+ print('-----*-----Top inserts-----*-----')
194
+ process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
195
+ self.grapes, self.other_words)
196
+
197
+ print('-----*-----Adding service categories-----*-----')
198
+ merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
199
+ merge_types(items, products)
200
+ merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
201
+ merge_types(products, products)
202
+
203
+
204
+ print('-----*-----Name trimming-----*-----')
205
+ item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
206
+ #items['name']=items['id'].replace(item_timed_names)
207
+ items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
208
+ items['gb']=gb
209
+ items['sour']=sour
210
+ items['sour']=items['sour'].replace(self.sour_dict)
211
+ products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
212
+ products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
213
+ products['gb']=gb
214
+ products['sour']=sour
215
+ products['sour']=products['sour'].replace(self.sour_dict)
216
+
217
+ print('-----*-----Replacing product types-----*-----')
218
+ products['type']=products['type'].replace(self.type_dict)
219
+
220
+ return items, products
221
+
222
+
223
+
224
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
preprocess/utils/common/utils.py CHANGED
@@ -1,165 +1,138 @@
1
- import re
2
-
3
- from tqdm import tqdm
4
-
5
- '''def get_delimiter(file_path):
6
- with open(file_path, 'r') as f:
7
- sample = f.read(1024) # читаем часть файла для анализа
8
- dialect = csv.Sniffer().sniff(sample)
9
- return dialect.delimiter'''
10
-
11
- def get_delimiter(file_path):
12
- with open(file_path, 'r', encoding="utf-8") as f:
13
- ln = f.readline()
14
- if ',' in ln:
15
- return ','
16
- if ';' in ln:
17
- return ';'
18
- if '\t' in ln:
19
- return '\t'
20
- if '|' in ln:
21
- return '|'
22
-
23
- raise ValueError(None, "Error parsing CSV file. Cannot detect delimiter")
24
-
25
- def remove_quotes(text):
26
- return re.sub(r'["\']', '', text)
27
-
28
-
29
- def remove_l(text):
30
- result = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
31
-
32
- # Убираем возможные лишние пробелы, возникающие после удаления
33
- result = re.sub(r'\s{2,}', ' ', result).strip()
34
- return result
35
-
36
-
37
- def clean_wine_name(name):
38
- """
39
- Удаляет в конце строки отдельно стоящие буквы (однобуквенные слова), не входящие в состав других слов.
40
- Например, "токай л" превратится в "токай".
41
- """
42
- # Регулярное выражение ищет:
43
- # \s+ – один или несколько пробельных символов;
44
- # \b – граница слова;
45
- # [A-Za-zА-ЯЁа-яё] ровно одна буква (латинская или кириллическая);
46
- # \b – граница слова;
47
- # \s*$ – любые пробелы до конца строки.
48
- return re.sub(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$', '', name)
49
-
50
-
51
- def find_full_word(text, word_list):
52
- """
53
- Ищет первое полное вхождение слова из word_list в строке text.
54
- Возвращает найденное слово или None, если совпадение не найдено.
55
- """
56
- for word in word_list:
57
- pattern = r'\b' + re.escape(word) + r'\b'
58
- if re.search(pattern, text, re.IGNORECASE):
59
- return word
60
- return None
61
-
62
-
63
- def merge_wine_type(items, colors=None, color_merge_dict=None):
64
- result=[]
65
- for row in tqdm(items.iterrows()):
66
- try:
67
- #print("merge_wine_type:" + str(row))
68
- if row[1]['type_wine'] is not None:
69
- color=find_full_word(row[1]['type_wine'], colors)
70
- if color is not None:
71
- result.append(color)
72
- else:
73
- color=find_full_word(row[1]['name'], colors)
74
- if color is not None:
75
- result.append(color)
76
- else:
77
- result.append(None)
78
- else:
79
- color=find_full_word(row[1]['name'], colors)
80
- if color is not None:
81
- result.append(color)
82
- else:
83
- result.append(None)
84
- except Exception as ex:
85
- print(ex)
86
- result.append(None)
87
-
88
- items['new_type_wine']=result
89
- items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
90
-
91
-
92
- def merge_types(items, products, type_merge_dict={}, sub_alco_types=["Бренди", "Шампань", "Шампанское"]):
93
- alco_types=[i.strip().lower() for i in products['type'].unique()]
94
- alco_types.append('ликёр')
95
- result=[]
96
-
97
- for row in tqdm(items.iterrows()):
98
- try:
99
- # Parameter 'sub_alco_types' specifies specific alcohol types that usually specified
100
- # in product / item name along with "parent" type and in this case this subtype should have priority
101
- # For example, "Вино Шампано Ле Брён де Нёвиль", or "Бренди де Херес"
102
- if sub_alco_types:
103
- type_in_name=find_full_word(row[1]['name'], sub_alco_types)
104
- if type_in_name is not None:
105
- result.append(type_in_name)
106
- continue
107
-
108
- type_in_name=find_full_word(row[1]['name'], alco_types)
109
- if type_in_name is not None:
110
- result.append(type_in_name)
111
- continue
112
- if row[1]['type'] is not None:
113
- type_in_type=find_full_word(row[1]['type'], alco_types)
114
- if type_in_type is not None:
115
- result.append(type_in_type)
116
- else:
117
- result.append(row[1]['type'])
118
- else:
119
- result.append(None)
120
- except Exception as ex:
121
- print(ex)
122
- result.append(None)
123
-
124
- items['new_type']=result
125
- #items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
126
- items['new_type'] = items['new_type'].replace(type_merge_dict)
127
-
128
-
129
- def trim_name(text, words_to_remove):
130
- """
131
- Удаляет из текста только те слова, которые полностью совпадают с элементами списка words_to_remove.
132
-
133
- :param text: Исходная строка.
134
- :param words_to_remove: Список слов, которые необходимо удалить.
135
- :return: Обновлённая строка с удалёнными словами.
136
- """
137
- # Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
138
- # Используем re.escape, чтобы экранировать спецсимволы в словах.
139
- pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
140
- #print("Pattern: " + pattern)
141
-
142
- # Заменяем найденные полные слова на пустую строку.
143
- new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
144
-
145
- # Убираем лишние пробелы, возникающие после удаления слов.
146
- new_text = re.sub(r'\s+', ' ', new_text).strip()
147
-
148
- return new_text
149
-
150
-
151
- def name_trimmer(df, prcess_text, types_and_others):
152
- result={}
153
- gbs=[]
154
- sours=[]
155
- for idx, row in tqdm(df.iterrows()):
156
- #print("Name1: " + str(row['name']))
157
- text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
158
- #print("Name2: " + text)
159
- text=trim_name(text, types_and_others).replace(',','').replace('.','')
160
- #print("Name3: " + text)
161
- result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
162
-
163
- gbs.append(gb)
164
- sours.append(sour)
165
  return result, gbs, sours
 
1
+ import re
2
+ from tqdm import tqdm
3
+ import csv
4
+
5
+
6
def get_delimiter(file_path):
    """
    Detect the field delimiter used by a CSV file.

    The first 1024 characters of the file are handed to ``csv.Sniffer``,
    which infers the dialect from that sample.

    :param file_path: Path to the CSV file to inspect.
    :return: The delimiter character reported by the sniffer.
    :raises csv.Error: If the sniffer cannot determine a delimiter.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding makes
    # detection fail (or mis-decode) on systems whose locale is not UTF-8.
    with open(file_path, 'r', encoding='utf-8') as f:
        sample = f.read(1024)  # a prefix of the file is enough for sniffing
    return csv.Sniffer().sniff(sample).delimiter
11
+
12
+
13
def remove_quotes(text):
    """Return *text* with every single and double quote character removed."""
    quote_chars = re.compile(r'["\']')
    return quote_chars.sub('', text)
15
+
16
+
17
def remove_l(text):
    """
    Remove the standalone letter "л" (any case) from *text*.

    Only occurrences delimited by word boundaries are removed; the letter
    inside longer words is left untouched.
    """
    without_marker = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
    # Removing the letter can leave runs of whitespace behind; squeeze them
    # to a single space and trim the ends.
    return re.sub(r'\s{2,}', ' ', without_marker).strip()
23
+
24
+
25
def clean_wine_name(name):
    """
    Drop a stand-alone single letter from the end of *name*.

    A trailing one-letter word (Latin or Cyrillic) that is not part of a
    longer word is removed together with the whitespace around it.
    For example, "токай л" becomes "токай".
    """
    # Pattern pieces:
    #   \s+                 one or more whitespace characters;
    #   \b                  word boundary;
    #   [A-Za-zА-ЯЁа-яё]    exactly one Latin or Cyrillic letter;
    #   \b                  word boundary;
    #   \s*$                optional whitespace up to the end of the string.
    trailing_letter = re.compile(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$')
    return trailing_letter.sub('', name)
37
+
38
+
39
def find_full_word(text, word_list):
    """
    Find the first entry of *word_list* that occurs in *text* as a whole word.

    Matching is case-insensitive. Entries are tried in list order and the
    first one that matches is returned exactly as it appears in the list.

    :param text: String to search. Non-string values (``None``, NaN, ...) are
        treated as containing no words.
    :param word_list: Iterable of words/phrases to look for; ``None`` is
        treated as an empty list.
    :return: The matching entry from *word_list*, or ``None`` if nothing matches.
    """
    if word_list is None or not isinstance(text, str):
        return None

    def _boundary(edge_char):
        # \b is only meaningful next to a word character. For an entry such as
        # "крас." a trailing \b would demand a word character right after the
        # dot, so the entry could never match before a space or at the end of
        # the string. Entries edged by punctuation therefore get no boundary
        # assertion on that side.
        return r'\b' if (not edge_char or re.match(r'\w', edge_char)) else ''

    for word in word_list:
        pattern = _boundary(word[:1]) + re.escape(word) + _boundary(word[-1:])
        if re.search(pattern, text, re.IGNORECASE):
            return word
    return None
49
+
50
+
51
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """
    Add a ``new_type_wine`` column to *items* (the frame is modified in place).

    For every row the first entry of *colors* found as a whole word in the
    ``type_wine`` column is used; if that column has no usable string or no
    match, the ``name`` column is searched instead. Rows without any match
    get ``None``. Finally the values are mapped through *color_merge_dict*.

    :param items: DataFrame with ``type_wine`` and ``name`` columns.
    :param colors: Words to look for (passed to ``find_full_word``).
    :param color_merge_dict: Optional replacement mapping applied to the new
        column; skipped when empty or ``None``.
    :return: None.
    """
    def _lookup(value):
        # Missing cells often arrive as float NaN rather than None; searching
        # them would raise and skip the fallback to the name column.
        if isinstance(value, str):
            return find_full_word(value, colors)
        return None

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            color = _lookup(row['type_wine'])
            if color is None:
                color = _lookup(row['name'])
        except Exception as ex:
            # Best effort: a problematic row is reported and left unmatched.
            print(ex)
            color = None
        result.append(color)

    items['new_type_wine'] = result
    if color_merge_dict:
        items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
77
+
78
+
79
def merge_types(items, products):
    """
    Add a ``new_type`` column to *items* (the frame is modified in place).

    The known types are the lower-cased, stripped unique values of
    ``products['type']`` plus 'ликёр'. For each row the name is searched for
    a known type first; otherwise the row's own ``type`` is searched, and if
    it contains no known type the raw ``type`` value is kept. Rows with no
    usable type get ``None``. At the end 'ликёр' is rewritten to 'ликер' and
    ``None`` to 'unmatched'.

    :param items: DataFrame with ``name`` and ``type`` columns.
    :param products: DataFrame whose ``type`` column supplies the known types.
    :return: None.
    """
    # Skip None/NaN entries: calling .strip() on them would abort the whole run.
    alco_types = [t.strip().lower() for t in products['type'].unique() if isinstance(t, str)]
    alco_types.append('ликёр')

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            name = row['name']
            found = find_full_word(name, alco_types) if isinstance(name, str) else None
            if found is None:
                raw_type = row['type']
                # A non-string type (None, NaN) cannot be searched and counts
                # as missing.
                if isinstance(raw_type, str):
                    found = find_full_word(raw_type, alco_types)
                    if found is None:
                        found = raw_type
        except Exception as ex:
            # Best effort: a problematic row is reported and left unmatched.
            print(ex)
            found = None
        result.append(found)

    items['new_type'] = result
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
103
+
104
+
105
def trim_name(text, words_to_remove):
    """
    Remove from *text* every whole-word occurrence of the given words.

    Matching is case-insensitive. After removal, runs of whitespace are
    collapsed to a single space and the result is stripped.

    :param text: Source string.
    :param words_to_remove: Iterable of words/phrases to delete.
    :return: The string with the listed words removed.
    """
    def _boundary(edge_char):
        # \b is only meaningful next to a word character; an entry edged by
        # punctuation (e.g. "бел.") gets no boundary assertion on that side,
        # otherwise it could never match before a space or end of string.
        return r'\b' if re.match(r'\w', edge_char) else ''

    # Longest entries first: regex alternation takes the first alternative
    # that matches, so a short entry listed before a longer phrase starting
    # with it would otherwise leave the rest of the phrase in the text.
    candidates = sorted((w for w in words_to_remove if w), key=len, reverse=True)
    if candidates:
        # re.escape keeps special characters inside the entries literal.
        pattern = '|'.join(
            _boundary(w[0]) + re.escape(w) + _boundary(w[-1]) for w in candidates
        )
        text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Squeeze the gaps left by the removed words.
    return re.sub(r'\s+', ' ', text).strip()
125
+
126
+
127
def name_trimmer(df, prcess_text, types_and_others):
    """
    Build cleaned names for every row of *df*.

    Each row's ``name`` is converted to ``str`` and passed to *prcess_text*,
    which must return an 8-tuple; the first element is the processed text and
    the sixth and eighth elements are collected per row. The text is then
    run through ``trim_name`` with *types_and_others*, stripped of commas and
    dots, lower-cased and trimmed.

    :param df: DataFrame with ``name`` and ``id`` columns.
    :param prcess_text: Callable returning the 8-tuple described above.
    :param types_and_others: Words to remove from the processed text.
    :return: ``(names_by_id, gbs, sours)`` - a dict mapping row ``id`` to the
        cleaned name, and two lists holding the sixth and eighth tuple
        elements in row order.
    """
    trimmed_by_id = {}
    gb_values, sour_values = [], []
    for _, record in tqdm(df.iterrows()):
        parsed = prcess_text(str(record['name']))
        cleaned, _alcohol, _volume, _years, _prod_year, gb_value, _color, sour_value = parsed
        cleaned = trim_name(cleaned, types_and_others)
        for punct in (',', '.'):
            cleaned = cleaned.replace(punct, '')
        trimmed_by_id[record['id']] = cleaned.lower().strip()

        gb_values.append(gb_value)
        sour_values.append(sour_value)
    return trimmed_by_id, gb_values, sour_values
preprocess/utils/items/attrs.py CHANGED
@@ -6,7 +6,7 @@ def check_spark(row, col_name='name', types=['Игристое', 'игр']):
6
  return None
7
 
8
 
9
- def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное', 'крас.', 'бел.']):
10
  if col_name in row.keys():
11
  for t in types:
12
  if t.lower() in row[col_name].lower():
 
6
  return None
7
 
8
 
9
+ def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
10
  if col_name in row.keys():
11
  for t in types:
12
  if t.lower() in row[col_name].lower():
processor/matching.py CHANGED
@@ -1,302 +1,159 @@
1
- import json
2
- from constants.constants import *
3
-
4
- from tqdm import tqdm
5
- from transliterate import translit, detect_language
6
- import pandas as pd
7
- from rapidfuzz import fuzz, process
8
- import numpy as np
9
- from math import isnan
10
- from preprocess.utils.common.utils import *
11
-
12
-
13
- def normalize_name(name):
14
- """
15
- Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
16
- приводит к нижнему регистру.
17
- """
18
- try:
19
- if detect_language(name) == 'ru':
20
- return translit(name, 'ru', reversed=True).lower()
21
- except Exception:
22
- pass
23
- return name.lower()
24
-
25
-
26
- def normalize_name_ex(name):
27
- name = normalize_name(name)
28
- for nnk in NORMALIZED_NAMES_ALTERNATIVES_DICT:
29
- word = find_full_word(name, NORMALIZED_NAMES_ALTERNATIVES_DICT[nnk])
30
- if word:
31
- name = name.replace(word, nnk)
32
- return name
33
-
34
-
35
- def compare_names(name1, name2, scorer=fuzz.ratio, score_cutoff=50):
36
- print("Scoring: " + name1 + " vs " + name2)
37
- words1 = name1.split(" ")
38
- words2 = name2.split(" ")
39
-
40
- score = 0
41
- for w1 in words1:
42
- for w2 in words2:
43
- r = scorer(w1, w2)
44
- print("\t " + w1 + " - " + w2 + " ; " + str(r))
45
- if r >= score_cutoff:
46
- score = score + r
47
-
48
- print("Score result: " + str(score / (100*len(words1))))
49
- return score / (100*len(words1))
50
-
51
- def compare_name_with_list(name, names_list, scorer=fuzz.ratio, score_cutoff=50):
52
- result = []
53
- index = 0
54
- for name2 in names_list:
55
- result.append((name2, compare_names(name, name2, scorer, score_cutoff), index))
56
- index = index + 1
57
- return result
58
-
59
-
60
- def prepare_groups_with_ids(items_df):
61
- """
62
- Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
63
- с учетом нормализованного названия.
64
-
65
- Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
66
-
67
- :param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
68
- :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
69
- """
70
- items_df = items_df.copy()
71
- items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
72
-
73
- grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
74
- lambda x: list(zip(x['id'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
75
- ).to_dict()
76
-
77
- #print(grouped)
78
-
79
- return grouped
80
-
81
- def prepare_groups_by_alternative_keys(items_df):
82
- """
83
- Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
84
- оригинального и нормализованного имени.
85
-
86
- :param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
87
- :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
88
- """
89
- items_df = items_df.copy()
90
- items_df['norm_name'] = items_df['name'].apply(normalize_name_ex)
91
-
92
- #grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume' ''', 'sour''''']).apply(
93
- grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume']).apply(
94
- lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['fullname'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
95
- ).to_dict()
96
- return grouped
97
-
98
- def parse_year(year):
99
- if not year:
100
- return False
101
- elif isinstance(year, str):
102
- return int(year)
103
- elif isinstance(year, (int, float)) and not isnan(year):
104
- return int(year)
105
-
106
- return False
107
-
108
- def order_by_best_year(matched_items, year):
109
- best_matched_items = []
110
- max_year_matched_items = []
111
- other_matched_items = []
112
- max_year = 0
113
-
114
- year = parse_year(year)
115
-
116
- for mi in matched_items:
117
- # Если в оригинале указан год, то ищем точное совпадение, иначе сортируем по году в обратном порядке
118
- try:
119
- if isinstance(mi['year'], (int, float, str)):
120
- mi_year = int(mi['year'])
121
- else:
122
- mi_year = False
123
-
124
- if year and mi_year and (mi_year == year):
125
- best_matched_items.append(mi['item_id'])
126
- elif mi_year:
127
- if mi_year > max_year:
128
- max_year_matched_items = [mi]
129
- max_year = mi_year
130
- elif mi_year == max_year:
131
- max_year_matched_items.append(mi)
132
- else:
133
- other_matched_items.append(mi['item_id'])
134
- else:
135
- other_matched_items.append(mi['item_id'])
136
- except Exception as ex:
137
- print("Error processing best year for item " + str(mi["item_id"]) + " value " + str(mi['year']) + ": " + str(ex))
138
-
139
- if len(best_matched_items) > 0:
140
- for m in matched_items:
141
- if not m['item_id'] in best_matched_items:
142
- m['score'] = m['score']*0.8
143
-
144
- return matched_items
145
-
146
-
147
- def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85, include_alternatives=True):
148
- """
149
- Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
150
- нормализованные группы.
151
-
152
- Производится два прохода:
153
- - Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
154
- - Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
155
- исключая итемы с исходным брендом.
156
-
157
- Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
158
-
159
- :param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
160
- :param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
161
- :param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
162
- :param name_threshold: Порог сходства для fuzzy matching.
163
- :return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
164
- """
165
- results = []
166
- no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
167
-
168
- if name_threshold < 50:
169
- name_threshold = 50
170
-
171
- # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
172
- for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
173
- product_brand = product['brand']
174
- product_type = product['type']
175
- product_name = product['name']
176
- product_volume = product['volume']
177
- product_type_wine = product['new_type_wine']
178
- product_sour = product['sour']
179
-
180
- key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
181
- #print("Name: " + product_name)
182
- #print("Key: " + str(key))
183
- #print("Groups: " + str(items_groups))
184
- items_data = items_groups.get(key, [])
185
- if items_data:
186
- # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
187
- #print("Data: " + str(items_data))
188
- items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
189
- else:
190
- #print("Data: No")
191
- items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
192
-
193
- norm_product_name = normalize_name_ex(product_name)
194
- matches = process.extract(
195
- norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
196
- )
197
-
198
- matched_items = [
199
- {
200
- 'item_id': items_ids[idx_candidate],
201
- 'brand': product_brand,
202
- 'item_name': items_full_names[idx_candidate],
203
- #'item_name': items_names[idx_candidate],
204
- 'score': score,
205
- 'volume': items_volumes[idx_candidate],
206
- 'color': item_type_wine[idx_candidate],
207
- 'sour': items_sour[idx_candidate],
208
- 'year': items_year[idx_candidate],
209
- }
210
- for match, score, idx_candidate in matches
211
- ]
212
-
213
- if matched_items:
214
- matched_items = order_by_best_year(matched_items, product['year'])
215
- matched_items = matched_items[:5]
216
- else:
217
- no_match_products.append((idx, product))
218
-
219
- results.append({
220
- 'product_id': product['id'],
221
- #"matched_top_id": top_matched_id,
222
- 'matched_items': matched_items,
223
- #"alternative_top_id": "",
224
- #'alternative': [] # Заполняется во втором проходе
225
- })
226
-
227
- if include_alternatives:
228
- # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
229
- groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
230
-
231
- # Второй проход: для продуктов без совпадений ищем по альтернативным группам
232
- for idx, product in tqdm(no_match_products):
233
- #print("Product: " + str(product))
234
- product_brand = product['brand']
235
- product_type_wine = product['new_type_wine']
236
- product_type = product['new_type']
237
- product_volume = product['volume']
238
- product_name = product['name']
239
- product_sour = product['sour']
240
-
241
- #alt_key = (product_type_wine, product_type, product_volume, product_sour)
242
- alt_key = (product_type_wine, product_type, product_volume)
243
-
244
- #print("AltName: " + str(product))
245
- #print("AltKey: " + str(alt_key))
246
- #print("AltGroups: " + str(groups_by_alternative_keys))
247
- #print("AltGroups Keys: " + str(groups_by_alternative_keys.keys()))
248
- type_items = groups_by_alternative_keys.get(alt_key, [])
249
- #print("AltGroups2: " + str(type_items))
250
- # Фильтруем, исключая итемы с исходным брендом
251
- filtered_items = [item for item in type_items if item[1] != product_brand]
252
- if filtered_items:
253
- #print("AltData: " + str(filtered_items))
254
- alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
255
- else:
256
- #print("AltData: No")
257
- alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
258
-
259
- norm_product_name = normalize_name_ex(product_name)
260
- #print("norm_product_name: " + str(norm_product_name))
261
- #print("alt_norm_names: " + str(alt_norm_names))
262
- alt_matches = process.extract(
263
- norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=50
264
- )
265
-
266
-
267
- #alt_matches = compare_name_with_list(
268
- # norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=70
269
- #)
270
-
271
- #print("alt_matches: " + str(alt_matches))
272
- alt_matched_items = [
273
- {
274
- 'item_id': alt_ids[idx_candidate],
275
- 'brand': alt_brands[idx_candidate],
276
- #'item_name': alt_names[idx_candidate],
277
- 'item_name': alt_full_names[idx_candidate],
278
- 'score': score / 2,
279
- 'volume': alt_volumes[idx_candidate],
280
- 'color': alt_type_wine[idx_candidate],
281
- 'sour': alt_sour[idx_candidate],
282
- 'year': alt_year[idx_candidate],
283
- }
284
- for match, score, idx_candidate in alt_matches
285
- ]
286
-
287
- alt_matched_items = order_by_best_year(alt_matched_items, product['year'])
288
- alt_matched_items = alt_matched_items[:5]
289
-
290
- results[idx]['matched_items'].extend(alt_matched_items)
291
-
292
- for r in results:
293
- r['matched_items'] = json.dumps(r['matched_items'], ensure_ascii=False)
294
-
295
- #if alt_matched_items:
296
- # results[idx]['alternative_top_id'] = alt_matched_items[0]["item_id"]
297
-
298
- #results[idx]['alternative'] = alt_matched_items
299
-
300
- results_df = pd.DataFrame(results)
301
- merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
302
  return merged_df
 
1
+ from tqdm import tqdm
2
+ from transliterate import translit, detect_language
3
+ import pandas as pd
4
+ from rapidfuzz import fuzz, process
5
+
6
+
7
def normalize_name(name):
    """
    Normalise a name for comparison.

    If ``detect_language`` reports Russian, the name is transliterated to
    Latin script and lower-cased; in every other case (including any error
    raised during detection or transliteration) the name is only lower-cased.
    """
    transliterated = None
    try:
        if detect_language(name) == 'ru':
            transliterated = translit(name, 'ru', reversed=True).lower()
    except Exception:
        # Best effort: fall back to plain lower-casing below.
        transliterated = None
    return name.lower() if transliterated is None else transliterated
18
+
19
def prepare_groups_with_ids(items_df):
    """
    Pre-group items by (new_brand, type, volume, new_type_wine, sour).

    A ``norm_name`` column is computed once up front (on a copy of the input)
    so names do not have to be normalised again during matching.

    :param items_df: DataFrame with columns 'new_brand', 'type', 'name', 'id',
        'volume', 'new_type_wine', 'sour', 'year'.
    :return: Dict {(new_brand, type, volume, new_type_wine, sour):
        [(id, name, norm_name, volume, new_type_wine, sour, year), ...]}.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _as_tuples(group):
        return list(zip(
            group['id'], group['name'], group['norm_name'], group['volume'],
            group['new_type_wine'], group['sour'], group['year'],
        ))

    key_columns = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    return frame.groupby(key_columns).apply(_as_tuples).to_dict()
36
+
37
def prepare_groups_by_alternative_keys(items_df):
    """
    Group items by (new_type_wine, new_type, volume, sour).

    Each group keeps the item id, brand, original and normalised name and the
    remaining attributes. The input frame is copied before ``norm_name`` is added.

    :param items_df: DataFrame with columns 'new_brand', 'new_type_wine',
        'new_type', 'volume', 'name', 'id', 'sour', 'year'.
    :return: Dict {(new_type_wine, new_type, volume, sour):
        [(id, new_brand, name, norm_name, volume, new_type_wine, sour, year), ...]}.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _as_tuples(group):
        return list(zip(
            group['id'], group['new_brand'], group['name'], group['norm_name'],
            group['volume'], group['new_type_wine'], group['sour'], group['year'],
        ))

    key_columns = ['new_type_wine', 'new_type', 'volume', 'sour']
    return frame.groupby(key_columns).apply(_as_tuples).to_dict()
52
+
53
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """
    Find matching items for every product, keeping the ids of the items found.

    Two passes are made:
      - first, candidates are taken from the group with the same
        (brand, type, volume, new_type_wine, sour);
      - second, for products with no match, candidates are taken from the
        alternative group (new_type_wine, new_type, volume, sour), excluding
        items whose brand equals the product's brand.

    Names are compared on their normalised form; the original item name is
    what is reported in the output.

    :param products_df: DataFrame with columns 'id', 'brand', 'type', 'name',
        'volume', 'new_type_wine', 'sour', 'new_type'.
    :param items_groups: Dict produced by ``prepare_groups_with_ids``.
    :param items_df: Items DataFrame with columns 'id', 'new_brand',
        'new_type_wine', 'new_type', 'volume', 'name', 'sour', 'year'.
    :param name_threshold: Minimum fuzzy-match score for a candidate to count.
    :return: *products_df* merged with two added columns: 'matched_items'
        (list of matches) and 'alternative' (list of alternative matches).
    """
    results = []
    # (position in `results`, product row, normalised product name) for
    # products that found nothing in their primary group.
    no_match_products = []

    # First pass: search within (brand, type, volume, new_type_wine, sour) groups.
    for _, product in tqdm(products_df.iterrows(), total=len(products_df)):
        key = (product['brand'], product['type'], product['volume'],
               product['new_type_wine'], product['sour'])
        # Tuples are (id, name, norm_name, volume, new_type_wine, sour, year).
        items_data = items_groups.get(key, [])

        norm_product_name = normalize_name(product['name'])
        matches = process.extract(
            norm_product_name, [item[2] for item in items_data],
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        matched_items = [
            {
                'item_id': items_data[idx_candidate][0],
                'item_name': items_data[idx_candidate][1],
                'score': score,
                'volume': items_data[idx_candidate][3],
                'color': items_data[idx_candidate][4],
                'sour': items_data[idx_candidate][5],
                'year': items_data[idx_candidate][6],
            }
            for _, score, idx_candidate in matches
        ]

        if not matched_items:
            # Remember the list position, not the DataFrame index label:
            # the label only equals the position for a default RangeIndex.
            no_match_products.append((len(results), product, norm_product_name))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': []  # filled in by the second pass
        })

    # Alternative grouping by (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: products without a match are looked up in alternative groups.
    for position, product, norm_product_name in tqdm(no_match_products):
        product_brand = product['brand']
        alt_key = (product['new_type_wine'], product['new_type'],
                   product['volume'], product['sour'])
        type_items = groups_by_alternative_keys.get(alt_key, [])
        # Tuples are (id, new_brand, name, norm_name, volume, new_type_wine,
        # sour, year); drop items that carry the product's own brand.
        filtered_items = [item for item in type_items if item[1] != product_brand]

        alt_matches = process.extract(
            norm_product_name, [item[3] for item in filtered_items],
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        results[position]['alternative'] = [
            {
                'item_id': filtered_items[idx_candidate][0],
                'item_name': filtered_items[idx_candidate][2],
                'score': score,
                'volume': filtered_items[idx_candidate][4],
                'color': filtered_items[idx_candidate][5],
                'sour': filtered_items[idx_candidate][6],
                'year': filtered_items[idx_candidate][7],
            }
            for _, score, idx_candidate in alt_matches
        ]

    results_df = pd.DataFrame(results)
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df
processor/processor.py CHANGED
@@ -1,30 +1,32 @@
1
- from preprocess.preprocess import Preprocessor
2
- from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
3
-
4
-
5
- class Processor():
6
- def __init__(self, long_types_list, short_types_list, sour_list,
7
- type_wine, gbs, colors_for_trim, grapes, other_words,
8
- sour_merge_dict, type_merge_dict, color_merge_dict,
9
- country_list, normalized_names_dict):
10
-
11
- self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
12
- type_wine, gbs, colors_for_trim, grapes, other_words,
13
- sour_merge_dict, type_merge_dict, color_merge_dict,
14
- country_list, normalized_names_dict)
15
-
16
- def process(self, products, items, is_items_first=False, th=65, include_alternatives=True):
17
- items, products=self.preprocessor.process(products, items)
18
-
19
- print('-----*-----Matching-----*-----')
20
-
21
- if is_items_first:
22
- products['new_brand']=products['brand']
23
- items['brand']=items['new_brand']
24
- products_groups = prepare_groups_with_ids(products)
25
- res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th, include_alternatives=include_alternatives)
26
- else:
27
- items_groups = prepare_groups_with_ids(items)
28
- res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th, include_alternatives=include_alternatives)
29
-
30
- return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
 
 
 
1
+ from preprocess.preprocess import Preprocessor
2
+ from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
3
+
4
+
5
class Processor():
    """Runs preprocessing and then fuzzy matching between products and items."""

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        """Create the underlying ``Preprocessor``; all arguments are forwarded to it unchanged."""
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, colors_for_trim, grapes, other_words,
            sour_merge_dict, type_merge_dict, color_merge_dict,
        )

    def process(self, products, items, is_items_first=False, th=65):
        """
        Preprocess both frames and match them against each other.

        :param products: Products DataFrame.
        :param items: Items DataFrame.
        :param is_items_first: When true, items are the queries and products
            are the candidates; otherwise the other way round.
        :param th: Name similarity threshold passed to the matcher.
        :return: ``(result, items, products)`` where *result* is the matcher
            output without the 'type', 'type_wine', 'alco' and 'gb' columns.
        """
        items, products = self.preprocessor.process(products, items)

        print('-----*-----Matching-----*-----')

        if is_items_first:
            # Align brand column names so each frame can play the other's role.
            products['new_brand'] = products['brand']
            items['brand'] = items['new_brand']
            queries, candidates = items, products
        else:
            queries, candidates = products, items

        candidate_groups = prepare_groups_with_ids(candidates)
        matched = new_find_matches_with_ids(queries, candidate_groups, candidates, name_threshold=th)

        dropped_columns = ['type', 'type_wine', 'alco', 'gb']
        return matched.drop(dropped_columns, axis=1), items, products
29
+
30
+
31
+
32
+
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- python-Levenshtein
2
- transliterate
3
- rapidfuzz
4
- pyahocorasick
5
- unidecode
6
- pqdm
7
  tqdm
 
1
+ python-Levenshtein
2
+ transliterate
3
+ rapidfuzz
4
+ pyahocorasick
5
+ unidecode
6
+ pqdm
7
  tqdm
search/matching_judge.py DELETED
@@ -1,156 +0,0 @@
1
- import json
2
- import pandas as pd
3
- import ast
4
- import csv
5
-
6
- def verify_csv(csv_file):
7
- lnnum = 1
8
- w = open(csv_file + ".1", "w", encoding="utf-8")
9
-
10
- with open(csv_file, "r", encoding="utf-8") as f:
11
- while True:
12
- ln = f.readline()
13
- if lnnum == 1:
14
- w.write(ln)
15
-
16
- if len(ln) == 0:
17
- break
18
-
19
- if ln.count('"') % 2 == 1:
20
- #raise Exception("Incorrect quotes at line " + str(lnnum) + " in file [" + csv_file + "]")
21
- w.write(ln)
22
-
23
- lnnum = lnnum + 1
24
-
25
- w.close()
26
- return True
27
-
28
- def compare_matching_with_manual(products_file, items_file, match_result_file, manual_result_file):
29
- '''with open(products_file, mode="r", encoding="utf-8", newline='') as csvfile:
30
- csvreader = csv.reader(csvfile, dialect="excel-tab")
31
- for row in csvreader:
32
- print(', '.join(row))'''
33
-
34
- if not verify_csv(products_file):
35
- raise Exception
36
-
37
- products_df = pd.read_csv(products_file, sep="\t")
38
- items_df = pd.read_csv(items_file, sep=";")
39
- match_df = pd.read_csv(match_result_file, sep="\t")
40
- manual_df = pd.read_csv(manual_result_file, sep="\t")
41
-
42
- results = {
43
- "item_count" : int(items_df.count()[0]),
44
- "product_count" : int(products_df.count()[0]),
45
- "match_count" : int(match_df.count()[0]),
46
- "manual_count" : int(manual_df.count()[0]),
47
- }
48
-
49
-
50
- items_to_manual = {}
51
- for index, row in items_df.iterrows():
52
- x = manual_df[manual_df['item_id'] == row["id"]]['state']
53
- if (len(x) > 0) and (x.values[0] == 1):
54
- p = products_df[products_df["id"] == manual_df.iloc[int(x.index[0])]["product_id"]]
55
- items_to_manual[row["id"]] = int(manual_df.iloc[int(x.index[0])]["product_id"])
56
-
57
-
58
- '''items_to_auto = {}
59
- for index, row in match_df.iterrows():
60
- if row["matched_top_id"] > 0:
61
- p = products_df[products_df["id"] == int(row["matched_top_id"])]
62
- items_to_auto[row["id"]] = int(row["matched_top_id"])
63
-
64
- results["items_to_manual_count"] = len(items_to_manual)
65
- results["items_to_auto_count"] = len(items_to_auto)'''
66
-
67
-
68
- result_list = []
69
-
70
- for index, row in items_df.iterrows():
71
- result_data = {}
72
-
73
- result_data["id"] = row["id"]
74
- result_data["match_side"] = "no_match"
75
- result_data["auto_score"] = ""
76
- result_data["manual_score"] = ""
77
- result_data["discuss"] = ""
78
-
79
-
80
- auto_match = match_df[match_df['id'] == row["id"]]["matched_items"].values[0]
81
- '''if len(auto_match) > 2:
82
- if auto_match.find("\\'") >= 0:
83
- auto_match = auto_match
84
-
85
- auto_match = auto_match.replace("\\'", "$$$$$$").replace(": None}", ": \"\"}").replace("'", '"').replace("$$$$$$", "\\'")
86
-
87
- auto_match = json.loads(auto_match)'''
88
-
89
- manual_match = None
90
- manual = manual_df[manual_df['item_id'] == row["id"]]['state']
91
- if (len(manual) > 0) and (manual.values[0] == 1):
92
- p = products_df[products_df["id"] == manual_df.iloc[int(manual.index[0])]["product_id"]]
93
-
94
- if len(p.values) > 0:
95
- manual_match = p
96
- else:
97
- print("Manually matched product id=" + str(manual_df.iloc[int(manual.index[0])]["product_id"]) + " for item=" + str(row["id"]) + " not found")
98
-
99
- if (auto_match is not None) and len(auto_match) > 2 and (manual_match is not None):
100
- result_data["match_side"] = "both"
101
-
102
- manual_id = int(manual_match["id"].values[0])
103
- auto_match_ns = auto_match.replace(" ", "")
104
- i1 = auto_match_ns.find("'item_id':")
105
- i2 = auto_match_ns.find("'item_id':" + str(manual_id) + ",")
106
-
107
- if i1 == i2:
108
- result_data["auto_score"] = 1
109
- result_data["manual_score"] = 1
110
- elif i2 >= 0:
111
- result_data["auto_score"] = 0.5
112
- result_data["manual_score"] = 0.5
113
- elif (auto_match is not None) and len(auto_match) > 2:
114
- result_data["match_side"] = "only_auto"
115
- elif manual_match is not None:
116
- result_data["match_side"] = "only_manual"
117
-
118
- result_data["discuss"] = ""
119
- result_data["item"] = row["attrs"]
120
-
121
-
122
- result_data["auto_match"] = auto_match
123
-
124
- manual_string = ""
125
- if (manual_match is not None):
126
- manual_string = '{' + \
127
- '"id": ' + str(manual_match["id"].values[0]) + ',' + \
128
- '"brand": "' + str(manual_match["brand"].values[0]) + '",' + \
129
- '"name": "' + str(manual_match["name_long"].values[0]) + '",' + \
130
- '"volume": ' + str(manual_match["volume"].values[0]) + '",' + \
131
- '"year": ' + str(manual_match["year"].values[0]) + '"}'
132
-
133
- result_data["manual_match"] = manual_string
134
- result_list.append(result_data)
135
-
136
-
137
- results_df = pd.DataFrame(result_list)
138
- results_df.to_csv("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\mjudge_new.csv")
139
-
140
-
141
-
142
- '''common_match = {}
143
- full_match = {}
144
- for a_match in items_to_auto:
145
- if a_match in items_to_manual:
146
- common_match[a_match] = [items_to_auto[a_match], items_to_manual[a_match]]
147
- if items_to_auto[a_match] == items_to_manual[a_match]:
148
- full_match[a_match] = items_to_auto[a_match]'''
149
-
150
-
151
- #results["items_to_manual"] = len(items_to_manual)
152
- #results["items_to_auto"] = len(items_to_auto
153
- print(results)
154
-
155
- return results
156
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
search/search_by_id.py CHANGED
@@ -1,53 +1,24 @@
1
- import json
2
- import pandas as pd
3
- import ast
4
-
5
-
6
- class Searcher():
7
- def __init__(self):
8
- self.df = None
9
-
10
-
11
- def set_df(self, df):
12
- self.df = df
13
- try:
14
- self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
15
- self.df['alternative'] = self.df['alternative'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
16
- except Exception as e:
17
- print(e)
18
-
19
-
20
- def search(self, query):
21
- data = json.loads(json.dumps(self.df[self.df['id']==query]['matched_items'].values[0]))
22
- return pd.DataFrame(data)
23
-
24
-
25
- def search(self, resultfn, query):
26
- is_alternative_items = False
27
- df_matched_items = pd.DataFrame()
28
-
29
- matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
30
- self.set_df(matching_result)
31
-
32
- items = self.df[self.df['id']==query]
33
- matched_items = items['matched_items']
34
- if (len(matched_items) != 0) and (len(matched_items.values[0])):
35
- data = json.loads(json.dumps(matched_items.values[0]))
36
- df_matched_items = pd.DataFrame(data)
37
- is_alternative_items = False
38
- else:
39
- alter_items = items['alternative']
40
-
41
- if (len(alter_items) != 0) and (len(alter_items.values[0])):
42
- data = json.loads(json.dumps(alter_items.values[0]))
43
- df_matched_items = pd.DataFrame(data)
44
- is_alternative_items = True
45
-
46
- return (df_matched_items, is_alternative_items)
47
-
48
-
49
- def search_in_uploaded_file(self, path, query):
50
- matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
51
- self.set_df(matching_result)
52
- result=self.search(query)
53
  return result
 
1
+ import json
2
+ import pandas as pd
3
+ import ast
4
+
5
+
6
+ class Searcher():
7
+ def __init__(self):
8
+ self.df = None
9
+ def set_df(self, df):
10
+ self.df = df
11
+ try:
12
+ self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
13
+ except Exception as e:
14
+ print(e)
15
+ #self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
16
+ def search(self, query):
17
+ data = json.loads(json.dumps(self.df[self.df['id']==query]['matched_items'].values[0]))
18
+ return pd.DataFrame(data)
19
+
20
+ def search_in_uploaded_file(self, path, query):
21
+ matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
22
+ self.set_df(matching_result)
23
+ result=self.search(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return result
tmp/prod.csv CHANGED
@@ -1 +1 @@
1
- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
 
1
+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
tmp/service/prod.csv CHANGED
@@ -1 +1 @@
1
- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
 
1
+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
tmp/utils.py CHANGED
@@ -1,48 +1,37 @@
1
- import pandas as pd
2
- from preprocess.utils.common.utils import get_delimiter
3
- import shutil
4
- import os
5
-
6
-
7
- def update_products_csv(new_csv_path, prods_file, overwrite_existing):
8
- if os.path.isfile(prods_file) and not overwrite_existing:
9
- main_sep=get_delimiter(prods_file)
10
- main_csv=pd.read_csv(prods_file, sep=main_sep, on_bad_lines="warn")
11
-
12
- new_sep=get_delimiter(new_csv_path)
13
- new_csv=pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")
14
- if 'attrs' in new_csv.columns.values:
15
- raise Exception("Uploaded Products CSV does not seem to be valid")
16
-
17
- result=pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
18
- result.to_csv(prods_file, sep=main_sep, index=False)
19
- else:
20
- new_sep=get_delimiter(new_csv_path)
21
- new_csv=pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")
22
- new_csv.to_csv(prods_file, sep=new_sep, index=False)
23
-
24
- return prods_file
25
-
26
-
27
- '''def is_csv_exist(path):
28
- file_list=glob(path+'/*.csv')
29
- if len(file_list)>0:
30
- return file_list[0]
31
- else:
32
- None
33
-
34
-
35
- def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
36
- main_path=is_csv_exist(main_dir)
37
- if main_path==None:
38
- new_path = shutil.move(new_path, main_dir)
39
- return new_path
40
- else:
41
- update_products_csv(main_path, new_path)
42
- return main_path
43
-
44
- def remover(data_path):
45
- #path=is_csv_exist('/home/user/app/tmp/prod.csv')
46
- #if path!=None:
47
- os.remove(os.getcwd()+'/tmp/prod.csv')
48
- shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''
 
1
+ import pandas as pd
2
+ from preprocess.utils.common.utils import get_delimiter
3
+ from glob import glob
4
+ import shutil
5
+ import os
6
+
7
+
8
+ def update_products_csv(new_csv_path, main_csv_path='/home/user/app/tmp/prod.csv'):
9
+ main_sep=get_delimiter(main_csv_path)
10
+ main_csv=pd.read_csv(main_csv_path, sep=main_sep)
11
+ new_sep=get_delimiter(new_csv_path)
12
+ new_csv=pd.read_csv(new_csv_path, sep=new_sep)
13
+ result=pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
14
+ result.to_csv(main_csv_path, sep=main_sep, index=False)
15
+
16
+ def is_csv_exist(path):
17
+ file_list=glob(path+'/*.csv')
18
+ if len(file_list)>0:
19
+ return file_list[0]
20
+ else:
21
+ None
22
+
23
+
24
+ def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
25
+ main_path=is_csv_exist(main_dir)
26
+ if main_path==None:
27
+ new_path = shutil.move(new_path, main_dir)
28
+ return new_path
29
+ else:
30
+ update_products_csv(main_path, new_path)
31
+ return main_path
32
+
33
+ def remover():
34
+ #path=is_csv_exist('/home/user/app/tmp/prod.csv')
35
+ #if path!=None:
36
+ os.remove(os.getcwd()+'/tmp/prod.csv')
37
+ shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')
 
 
 
 
 
 
 
 
 
 
 
ui/gradio_ui.py CHANGED
@@ -1,178 +1,121 @@
1
- import gradio as gr
2
- import pandas as pd
3
- from preprocess.utils.common.utils import get_delimiter
4
- from tmp.utils import update_products_csv #remover,
5
- import os
6
- import csv
7
- import datetime, time
8
-
9
-
10
- class GradioUI():
11
-
12
- def __init__(self, processor, searcher, data_path):
13
- self.processor=processor
14
- self.searcher=searcher
15
- self.data_path = data_path
16
-
17
- gr.set_static_paths(paths=[os.path.join(self.get_data_dir(), "products")])
18
-
19
- def get_data_dir(self):
20
- return self.data_path
21
-
22
- def get_products_dir(self):
23
- return os.path.join(self.get_data_dir(), "products")
24
-
25
- def get_items_dir(self):
26
- return os.path.join(self.get_data_dir(), "items")
27
-
28
- def get_results_dir(self):
29
- return os.path.join(self.get_data_dir(), "results")
30
-
31
- def get_products_file_date(self):
32
- fullfn = os.path.join(self.data_path, "products", "products.csv")
33
- if not os.path.isfile(fullfn):
34
- return "Файл Products не найден"
35
-
36
- stinfo = os.stat(fullfn)
37
- return time.ctime(stinfo.st_mtime)
38
-
39
-
40
- def upload_products_file(self, prods_file, overwrite_existing):
41
- try:
42
- if not os.path.exists(self.get_products_dir()):
43
- os.makedirs(self.get_products_dir())
44
-
45
- fullfn = os.path.join(self.get_products_dir(), "products.csv")
46
-
47
- if prods_file != None:
48
- update_products_csv(prods_file, fullfn, overwrite_existing)
49
-
50
- gr.Info("Файл Products успешно загружен")
51
- except Exception as ex:
52
- raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)
53
-
54
-
55
- def process_items(self, items_file, is_items_first, threshold, include_alternatives): #, q_id):
56
- try:
57
- prods_file = os.path.join(self.get_products_dir(), "products.csv")
58
- if not os.path.isfile(prods_file):
59
- raise Exception("Файл Products не найден")
60
-
61
- if items_file != None:
62
- items_delimiter=get_delimiter(items_file)
63
- print('items delimiter: '+items_delimiter)
64
- #row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
65
- row_items = pd.read_csv(items_file, sep=items_delimiter)
66
- if not 'attrs' in row_items.columns.values:
67
- raise Exception("Uploaded Items CSV does not seem to be valid")
68
-
69
- products_delimiter=get_delimiter(prods_file)
70
- print('products delimiter: '+products_delimiter)
71
- #row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')
72
- row_products = pd.read_csv(prods_file, sep=products_delimiter)
73
-
74
- # if q_id in row_products['id'].unique():
75
- # row_products=row_products[row_products['id']==q_id]
76
-
77
- #print("product id: " + str(q_id))
78
-
79
- df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold, include_alternatives)
80
-
81
- self.searcher.set_df(df.copy())
82
- #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
83
- # output_csv = tmp.name
84
- results_path = self.get_results_dir()
85
- if not os.path.exists(results_path):
86
- os.makedirs(results_path)
87
-
88
- output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
89
- output_csv = os.path.join(results_path, output_csv)
90
- df.to_csv(output_csv, sep='\t', index=False, quotechar="'", quoting=csv.QUOTE_NONE, escapechar="@")
91
- return output_csv
92
- except Exception as ex:
93
- raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
94
-
95
- def on_page_load(self, r: gr.Request):
96
- m_time = self.get_products_file_date()
97
- return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]
98
-
99
-
100
- def run_ui(self):
101
- with gr.Blocks() as demo:
102
- tabs = gr.Tabs()
103
- with tabs:
104
-
105
- # with gr.Row():
106
- # file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
107
- # process_button = gr.Button("Обновить")
108
-
109
- # Вкладка для обработки CSV файлов
110
- with gr.TabItem("Обработка каталога поставщика"):
111
- gr.Markdown("## Обработка каталога поставщика")
112
-
113
- m_time = self.get_products_file_date()
114
- prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
115
- with gr.Row():
116
- #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
117
- file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
118
- #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
119
- with gr.Row():
120
- toggle_input = gr.Checkbox(label="Инвертировать поиск", value=True)
121
- toggle_alternative = gr.Checkbox(label="Включать в результаты альтернативные варианты", value=True)
122
-
123
- threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
124
- process_button = gr.Button("Загрузить файл с каталогом и сравнить")
125
- output_file = gr.File(label="Скачать результат (CSV)")
126
- process_button.click(
127
- fn=self.process_items,
128
- inputs=[file_items, toggle_input, threshold_input, toggle_alternative], #, search_number],
129
- outputs=output_file
130
- )
131
-
132
- with gr.TabItem("Загрузка файла Products"):
133
- with gr.Row():
134
- prod_file_info1 = gr.Markdown("## Загрузка файла Products")
135
- product_download_button = gr.DownloadButton(label="Скачать", value=os.path.join(self.get_products_dir(), "products.csv"), visible=True)
136
- with gr.Row():
137
- file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
138
- with gr.Row():
139
- toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
140
- upload_button = gr.Button("Загрузить файл")
141
- upload_button.click(
142
- fn=self.upload_products_file,
143
- inputs=[file_input1, toggle_input],
144
- #outputs=output_file
145
- )
146
-
147
-
148
- # Вкладка для поиска
149
- with gr.TabItem("Поиск в обработанном csv"):
150
- gr.Markdown("## Поиск")
151
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
152
- search_button = gr.Button("Найти")
153
- search_table = gr.Dataframe(label="Результаты поиска")
154
- search_button.click(
155
- fn=self.searcher.search,
156
- inputs=[search_number],
157
- outputs=search_table
158
- )
159
-
160
- with gr.TabItem("Загрузка результат и поиск в нем"):
161
- gr.Markdown("## Поиск")
162
- with gr.Row():
163
- input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
164
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
165
- search_button = gr.Button("Найти")
166
- search_table = gr.Dataframe(label="Результаты поиска")
167
- search_button.click(
168
- fn=self.searcher.search_in_uploaded_file,
169
- inputs=[input_path, search_number],
170
- outputs=search_table
171
- )
172
-
173
- #with gr.TabItem("Удалить сохраненные продукты"):
174
- # del_button = gr.Button("Удалить")
175
- # process_button.click(fn=remover)
176
-
177
- demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
178
  demo.launch()
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import tempfile
4
+ from preprocess.utils.common.utils import get_delimiter
5
+ from tmp.utils import uploader, remover, update_products_csv
6
+ from glob import glob
7
+ import os
8
+
9
+
10
+ class GradioUI():
11
+
12
+ def __init__(self, processor, searcher=None):
13
+ self.processor=processor
14
+ self.searcher=searcher
15
+
16
+
17
+
18
+ def process_files(self, file1, file2, is_items_first, threshold): #, q_id):
19
+ try:
20
+ print(file1)
21
+
22
+ print()
23
+ print(os.getcwd())
24
+ print(os.path.dirname(os.path.abspath(__file__)))
25
+ print()
26
+
27
+ if file1!=None:
28
+ #file1=uploader(file1)
29
+ update_products_csv(file1)
30
+ #else:
31
+ #file1=glob('./home/user/app/tmp/*.csv')[0]
32
+ file1=os.getcwd()+'/tmp/prod.csv'
33
+
34
+ #print()
35
+ #print(file1)
36
+ #print()
37
+
38
+ if file2!=None:
39
+ items_delimiter=get_delimiter(file2)
40
+ print('items delimiter: '+items_delimiter)
41
+ row_items=pd.read_csv(file2, sep=items_delimiter, on_bad_lines='skip')
42
+
43
+ products_delimiter=get_delimiter(file1)
44
+ print('products delimiter: '+products_delimiter)
45
+ row_products=pd.read_csv(file1, sep=products_delimiter, on_bad_lines='skip')
46
+
47
+ # if q_id in row_products['id'].unique():
48
+ # row_products=row_products[row_products['id']==q_id]
49
+
50
+ #print("product id: " + str(q_id))
51
+
52
+ df, items, products= self.processor.process(row_products, row_items, is_items_first, threshold)
53
+ # Создаём временный CSV файл для сохранения результата
54
+
55
+ self.searcher.set_df(df.copy())
56
+
57
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
58
+ output_csv = tmp.name
59
+ df.to_csv(output_csv, sep='\t', index=False)
60
+ return output_csv
61
+ except Exception as ex:
62
+ raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
63
+ return None
64
+
65
+
66
+ def run_ui(self):
67
+ with gr.Blocks() as demo:
68
+ with gr.Tabs():
69
+
70
+ # with gr.Row():
71
+ # file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
72
+ # process_button = gr.Button("Обновить")
73
+
74
+ # Вкладка для обработки CSV файлов
75
+ with gr.TabItem("Обработка CSV файлов"):
76
+ gr.Markdown("## Обработка CSV файлов")
77
+ with gr.Row():
78
+ file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
79
+ file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
80
+ #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
81
+ with gr.Row():
82
+ toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
83
+ threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
84
+ process_button = gr.Button("Обработать файлы")
85
+ output_file = gr.File(label="Скачать результат (CSV)")
86
+ process_button.click(
87
+ fn=self.process_files,
88
+ inputs=[file_input1, file_input2, toggle_input, threshold_input], #, search_number],
89
+ outputs=output_file
90
+ )
91
+
92
+ # Вкладка для поиска
93
+ with gr.TabItem("Поиск в обработанном csv"):
94
+ gr.Markdown("## Поиск")
95
+ search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
96
+ search_button = gr.Button("Найти")
97
+ search_table = gr.Dataframe(label="Результаты поиска")
98
+ search_button.click(
99
+ fn=self.searcher.search,
100
+ inputs=[search_number],
101
+ outputs=search_table
102
+ )
103
+
104
+ with gr.TabItem("Загрузка результат и поиск в нем"):
105
+ gr.Markdown("## Поиск")
106
+ with gr.Row():
107
+ input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
108
+ search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
109
+ search_button = gr.Button("Найти")
110
+ search_table = gr.Dataframe(label="Результаты поиска")
111
+ search_button.click(
112
+ fn=self.searcher.search_in_uploaded_file,
113
+ inputs=[input_path, search_number],
114
+ outputs=search_table
115
+ )
116
+
117
+ with gr.TabItem("Удалить сохраненные продукты"):
118
+ del_button = gr.Button("Удалить")
119
+ process_button.click(fn=remover)
120
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  demo.launch()