.gitattributes CHANGED
@@ -1,35 +1,35 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,12 @@
1
- ---
2
- title: Product Matching
3
- emoji: 🏃
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: gradio
7
- sdk_version: 5.19.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ ---
2
+ title: Product Matching
3
+ emoji: 🏃
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.19.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
api.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import json
import os
import datetime

from processor.processor import Processor
from constants.constants import *
from search.search_by_id import Searcher
from fastapi import FastAPI, File, UploadFile, HTTPException
import uvicorn
from pydantic import BaseModel
import pandas as pd
from tmp.utils import update_products_csv

# Shared matching pipeline, configured once at import time from the word
# lists / merge dictionaries star-imported from constants.constants.
processor = Processor(LONG_TYPES_LIST,
                      SHORT_TYPES_LIST,
                      SOUR,
                      WINE_TYPES,
                      GBS,
                      COLORS_FOR_TRIM,
                      GRAPES,
                      OTHER_WORDS,
                      SOUR_MERGE_DICT,
                      TYPES_WINES_DICT,
                      COLOR_MERGE_DICT)

# Shared result-file searcher used by /api/get_matched_by_id.
searcher = Searcher()


class item_by_id(BaseModel):
    """Request body for /api/get_matched_by_id."""
    # Name of a CSV file inside the results directory.
    result_file: str
    # Item id; the endpoint parses it as an integer.
    id: str


class match_request(BaseModel):
    """Request body for /api/match."""
    # Name of an items CSV inside the items directory.
    items: str
    # Matching threshold (percent); falsy values are replaced by 50.
    threshold: int
    # Forwarded to Processor.process.
    items_first: int
# Root of the Space's persistent data storage.
_DATA_ROOT = "/home/user/app/_data/"


def get_data_dir():
    """Return the root data directory shared by all endpoints."""
    return _DATA_ROOT


def get_products_dir():
    """Return the directory holding the canonical products.csv."""
    return os.path.join(get_data_dir(), "products")


def get_items_dir():
    """Return the directory where uploaded item CSVs are stored."""
    return os.path.join(get_data_dir(), "items")


def get_results_dir():
    """Return the directory where match result CSVs are written."""
    return os.path.join(get_data_dir(), "results")
app = FastAPI()


@app.get("/api/get_result_csv")
async def get_result_csv():
    """List the result CSV files currently stored in the results directory.

    Returns a JSON-encoded *string* (kept for backward compatibility with
    existing callers, even though FastAPI would serialise a list natively).
    """
    results_dir = get_results_dir()
    # The directory is only created by the first /api/match run; treat a
    # missing directory as empty instead of letting os.listdir raise
    # FileNotFoundError and surface as an HTTP 500.
    if not os.path.isdir(results_dir):
        return json.dumps([])
    results = [name for name in os.listdir(results_dir) if name.endswith(".csv")]
    return json.dumps(results)
63
+
64
+
65
+ @app.post("/api/upload_result_csv")
66
+ async def upload_result_csv(file: UploadFile = File(...)):
67
+ try:
68
+ contents = file.file.read()
69
+
70
+ with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
71
+ f.write(contents)
72
+ except Exception:
73
+ raise HTTPException(status_code=500, detail='Something went wrong')
74
+ finally:
75
+ file.file.close()
76
+
77
+ return {"message": f"Successfully uploaded {file.filename}"}
78
+
79
+
80
+ @app.post("/api/upload_products_csv")
81
+ async def upload_products_csv(file: UploadFile, overwrite_existing: int):
82
+ try:
83
+ datadir = get_products_dir()
84
+ if not os.path.exists(datadir):
85
+ os.makedirs(datadir)
86
+
87
+ tempfile = os.path.join(datadir, "products.csv_upload")
88
+
89
+ contents = file.file.read()
90
+
91
+ with open(tempfile, 'wb') as f:
92
+ f.write(contents)
93
+
94
+ fullfn = os.path.join(datadir, "products.csv")
95
+ update_products_csv(tempfile, fullfn, overwrite_existing)
96
+
97
+ except Exception:
98
+ raise HTTPException(status_code=500, detail='Something went wrong')
99
+ finally:
100
+ file.file.close()
101
+
102
+ return {"message": f"Successfully uploaded {file.filename}"}
103
+
104
+
105
+ @app.post("/api/upload_items_csv")
106
+ async def upload_items_csv(file: UploadFile = File(...)):
107
+ try:
108
+ itemsdir = get_items_dir()
109
+
110
+ if not os.path.exists(itemsdir):
111
+ os.makedirs(itemsdir)
112
+
113
+ contents = file.file.read()
114
+
115
+ with open(os.path.join(itemsdir, file.filename), 'wb') as f:
116
+ f.write(contents)
117
+ except Exception:
118
+ raise HTTPException(status_code=500, detail='Something went wrong')
119
+ finally:
120
+ file.file.close()
121
+
122
+ return {"message": f"Successfully uploaded {file.filename}"}
@app.get("/api/get_items_csv")
async def get_items_csv():
    """List the item CSV files available for matching.

    Returns a JSON-encoded string (kept for backward compatibility).
    """
    itemsdir = get_items_dir()
    # Before the first upload the directory does not exist; report an empty
    # list instead of letting os.listdir raise (-> HTTP 500).
    if not os.path.isdir(itemsdir):
        return json.dumps([])
    results = [name for name in os.listdir(itemsdir) if name.endswith(".csv")]
    return json.dumps(results)
@app.post("/api/match")
async def match(r: match_request):
    """Run the matching pipeline for one items file against products.csv.

    Validates that both input files exist, runs Processor.process, writes the
    matched frame to a timestamped CSV in the results directory, and returns
    the result file name.
    """
    prods_file = os.path.join(get_products_dir(), "products.csv")
    if not os.path.isfile(prods_file):
        return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}

    if len(r.items) == 0:
        return {"Status": "Error", "ErrorDesc": "Items file not specified"}

    # NOTE(review): a falsy check means threshold=0 is also replaced by the
    # default of 50 — confirm 0 is never a meaningful threshold.
    if not r.threshold:
        r.threshold = 50

    items_fn = os.path.join(get_items_dir(), r.items)
    if not os.path.isfile(items_fn):
        return {"Status": "Error", "ErrorDesc": "Items file not found"}

    # Both catalogues are tab-separated; malformed product rows are skipped.
    row_items = pd.read_csv(items_fn, sep='\t')
    row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')

    df, items, products = processor.process(row_products, row_items, r.items_first, r.threshold)

    results_dir = get_results_dir()
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    stamp = datetime.datetime.now().strftime('%y%m%d-%H%M%S')
    output_csv = f"m1-{r.threshold}-{stamp}.csv"
    df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)

    return {"Status": "Success", "result_file": output_csv}
@app.get("/api/get_matched_by_id")
async def get_matched_by_id(item: item_by_id):
    """Look up the match rows for a single item id inside a result CSV.

    NOTE(review): this is a GET endpoint that takes a request body, which many
    HTTP clients and proxies do not support — consider switching to POST.
    """
    fullfn = os.path.join(get_results_dir(), item.result_file)
    if not os.path.isfile(fullfn):
        return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}

    # Reject a non-numeric id explicitly instead of letting int() raise a
    # ValueError that surfaces as an unhandled HTTP 500.
    try:
        item_id = int(item.id)
    except ValueError:
        return {"Status": "Error", "ErrorDesc": "Item id must be an integer"}

    (df, is_alternative) = searcher.search(fullfn, item_id)
    if df.empty:
        return {"Status": "Success", "IsAlternative": False, "Data": ""}

    return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
if __name__ == "__main__":
    # Serve on all interfaces; debug logging aids Space diagnostics.
    uvicorn.run(app, host="0.0.0.0", port=8000, log_level="debug")
from processor.processor import Processor
from constants.constants import *
from ui.gradio_ui import GradioUI
from search.search_by_id import Searcher

# Build the matching pipeline from the word lists / merge dictionaries
# star-imported from constants.constants.
processor = Processor(LONG_TYPES_LIST,
                      SHORT_TYPES_LIST,
                      SOUR,
                      WINE_TYPES,
                      GBS,
                      COLORS_FOR_TRIM,
                      GRAPES,
                      OTHER_WORDS,
                      SOUR_MERGE_DICT,
                      TYPES_WINES_DICT,
                      COLOR_MERGE_DICT)

searcher = Searcher()

# The third argument is the Space's persistent data directory; it must match
# the path used by api.py.
ui = GradioUI(processor, searcher, "/home/user/app/_data/")
ui.run_ui()
import json
from tqdm import tqdm
from preprocess.utils.items.attrs import *
from preprocess.utils.common.extracters import *
from preprocess.utils.common.brand_matching import *
from preprocess.utils.common.parallel_brand_matching import *
from preprocess.utils.common.utils import *
from preprocess.utils.common.top_inserts import *
import pandas as pd


class Preprocessor():
    """Normalises raw item and product catalogues into comparable tables.

    All word lists and merge dictionaries are injected at construction time,
    so the same pipeline can be reconfigured without code changes.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        self.long_types_list = long_types_list
        self.short_types_list = short_types_list
        self.sour = sour_list
        self.type_wine = type_wine
        self.gbs = gbs
        self.colors_ft = colors_for_trim
        self.grapes = grapes
        self.other_words = other_words
        self.types_n_others = long_types_list + other_words
        self.sour_dict = sour_merge_dict
        self.type_dict = type_merge_dict
        self.color_merge_dict = color_merge_dict

    def process_items(self, df):
        """Flatten the JSON 'attrs' column of the raw items frame.

        Rows whose attrs fail to parse are reported and skipped; note a
        failure mid-row can leave the result columns with unequal lengths.
        """
        result = {'id': [], 'brand': [], 'name': [], 'type': [],
                  'type_wine': [], 'volume': [], 'year': [], 'alco': []}
        for idf, attrs in tqdm(zip(df['id'].values, df['attrs'].values)):
            try:
                attrs = json.loads(attrs)
                result['id'].append(idf)
                result['brand'].append(attrs.get('brand'))
                result['name'].append(attrs['name'])
                # Drink type: try the explicit type first, then progressively
                # weaker signals (sparkling flag, colour/sourness, wine type).
                drink_type = get_type(attrs, self.long_types_list)
                if drink_type is None:
                    drink_type = check_spark(attrs)
                if drink_type is None:
                    drink_type = check_color_and_sour(attrs)
                if drink_type is None:
                    drink_type = check_spark(attrs, col_name='type_wine')
                if drink_type is None:
                    drink_type = check_color_and_sour(attrs, types=self.sour)
                result['type'].append(drink_type)
                # Prefer explicit attributes; fall back to parsing the name.
                if 'volume' in attrs:
                    result['volume'].append(attrs['volume'])
                else:
                    result['volume'].append(extract_volume_or_number(attrs['name']))
                if 'year' in attrs:
                    result['year'].append(attrs['year'])
                else:
                    result['year'].append(extract_production_year(attrs['name']))
                alco = extract_alcohol_content(attrs['name'])
                result['type_wine'].append(attrs.get('type_wine'))
                result['alco'].append(alco)
            except Exception as ex:
                print(idf, ex)
        return pd.DataFrame(result)

    def process_products(self, products):
        """Project the raw products frame onto the same columns as items."""
        result = {'id': [], 'brand': [], 'name': [], 'type': [],
                  'type_wine': [], 'volume': [], 'year': [], 'alco': []}
        for idx, row in tqdm(products.iterrows()):
            try:
                result['id'].append(row['id'])
                result['brand'].append(row['brand'])
                result['type_wine'].append(row['category'])
                result['type'].append(row['product_type'])
                result['name'].append(row['name_long'])
                result['volume'].append(extract_volume_or_number(row['name']))
                # The vintage lives in the postfix column, not the name.
                result['year'].append(extract_production_year(str(row['name_postfix'])))
                result['alco'].append(extract_alcohol_content(row['name']))
            except Exception as ex:
                print(ex)
        return pd.DataFrame(result)

    def prcess_text(self, text):
        """Strip structured attributes out of a display name.

        Returns (cleaned_name, alcohol, volume_or_number, years,
        production_year, gift_box, color, sourness); each attribute is None
        when not found, and found attributes are removed from the name.
        """
        gb = find_full_word(text, self.gbs)
        if gb is not None:
            text = text.replace(str(gb), '')

        alcohol = extract_alcohol_content(text)
        if alcohol is not None:
            # Remove both the dot and the comma spelling of the percentage.
            alco_w_comma = alcohol.replace('.', ',')
            text = text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
        volume_or_number = extract_volume_or_number(text)
        if volume_or_number is not None:
            volume_with_comma = str(volume_or_number).replace('.', ',')
            text = text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
        # NOTE(review): the cleaned name is assigned to 'test' and never used —
        # looks like a typo for 'text', but "fixing" it would change the
        # pipeline's output; confirm intent before renaming.
        test = clean_wine_name(text)
        years = extract_years(text)
        if years is not None:
            text = text.replace(str(years), '').replace('выдержка', '').replace('Выдержка', '').replace('aging', '')
        production_year = extract_production_year(text)
        if production_year is not None:
            text = text.replace(str(production_year), '')

        color = find_full_word(text, self.colors_ft)
        if color is not None:
            text = text.replace(str(color), '')
        sour = find_full_word(text, self.sour)
        if sour is not None:
            text = text.replace(str(sour), '')

        return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour

    def process(self, products, items):
        """Run the full preprocessing pipeline over both catalogues.

        Returns the cleaned (items, products) frames with normalised brands,
        merged service categories and trimmed names.
        """
        print('------*-----Prepare items catalogue-----*-----')
        items = self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products = self.process_products(products.copy())

        # Normalise brands for case-insensitive comparison.
        items['brand'] = items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand'] = products['brand'].apply(lambda x: str(x).strip().lower())

        print('-----*-----Split n match-----*-----')
        splited = split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products['brand'].unique(), items)

        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list = get_same_brands(products, items)
        out_prods = list(set(prod_brand_list) - set(comp_list))  # currently unused
        out_items = list(set(items_brand_list) - set(comp_list))
        brand_map_improved = match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type'] = items['type'].replace(self.type_dict)

        # Two unwrap passes: the second catches mappings produced by the first.
        print('-----*-----Unwrap brend cats step 1-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Unwrap brend cats step 2-----*-----')
        unwrap_b_match = unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Finding brands in names-----*-----')
        items['new_brand'] = items['new_brand'].replace('none', None)
        i_brands = items[items['new_brand'].isna()]['name'].values
        # Only product brands longer than 3 chars are worth searching for.
        p_brands = [i for i in products['brand'].unique() if i is not None and len(i) > 3]
        new_found_brands = check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list,
                                self.grapes, self.other_words)

        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products)

        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour = name_trimmer(items, self.prcess_text, self.types_n_others)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb'] = gb
        items['sour'] = sour
        items['sour'] = items['sour'].replace(self.sour_dict)
        products_trimed_names, gb, sour = name_trimmer(products, self.prcess_text, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb'] = gb
        products['sour'] = sour
        products['sour'] = products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type'] = products['type'].replace(self.type_dict)

        return items, products
@@ -1,138 +1,150 @@
1
- import re
2
- from tqdm import tqdm
3
- import csv
4
-
5
-
6
- def get_delimiter(file_path):
7
- with open(file_path, 'r') as f:
8
- sample = f.read(1024) # читаем часть файла для анализа
9
- dialect = csv.Sniffer().sniff(sample)
10
- return dialect.delimiter
11
-
12
-
13
- def remove_quotes(text):
14
- return re.sub(r'["\']', '', text)
15
-
16
-
17
- def remove_l(text):
18
- result = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
19
-
20
- # Убираем возможные лишние пробелы, возникающие после удаления
21
- result = re.sub(r'\s{2,}', ' ', result).strip()
22
- return result
23
-
24
-
25
- def clean_wine_name(name):
26
- """
27
- Удаляет в конце строки отдельно стоящие буквы (однобуквенные слова), не входящие в состав других слов.
28
- Например, "токай л" превратится в "токай".
29
- """
30
- # Регулярное выражение ищет:
31
- # \s+ – один или несколько пробельных символов;
32
- # \b – граница слова;
33
- # [A-Za-zА-ЯЁа-яё] ровно одна буква (латинская или кириллическая);
34
- # \b – граница слова;
35
- # \s*$ – любые пробелы до конца строки.
36
- return re.sub(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$', '', name)
37
-
38
-
39
- def find_full_word(text, word_list):
40
- """
41
- Ищет первое полное вхождение слова из word_list в строке text.
42
- Возвращает найденное слово или None, если совпадение не найдено.
43
- """
44
- for word in word_list:
45
- pattern = r'\b' + re.escape(word) + r'\b'
46
- if re.search(pattern, text, re.IGNORECASE):
47
- return word
48
- return None
49
-
50
-
51
- def merge_wine_type(items, colors=None, color_merge_dict=None):
52
- result=[]
53
- for row in tqdm(items.iterrows()):
54
- try:
55
- if row[1]['type_wine'] is not None:
56
- color=find_full_word(row[1]['type_wine'], colors)
57
- if color is not None:
58
- result.append(color)
59
- else:
60
- color=find_full_word(row[1]['name'], colors)
61
- if color is not None:
62
- result.append(color)
63
- else:
64
- result.append(None)
65
- else:
66
- color=find_full_word(row[1]['name'], colors)
67
- if color is not None:
68
- result.append(color)
69
- else:
70
- result.append(None)
71
- except Exception as ex:
72
- print(ex)
73
- result.append(None)
74
-
75
- items['new_type_wine']=result
76
- items['new_type_wine']=items['new_type_wine'].replace(color_merge_dict)
77
-
78
-
79
- def merge_types(items, products):
80
- alco_types=[i.strip().lower() for i in products['type'].unique()]
81
- alco_types.append('ликёр')
82
- result=[]
83
- for row in tqdm(items.iterrows()):
84
- try:
85
- type_in_name=find_full_word(row[1]['name'], alco_types)
86
- if type_in_name is not None:
87
- result.append(type_in_name)
88
- continue
89
- if row[1]['type'] is not None:
90
- type_in_type=find_full_word(row[1]['type'], alco_types)
91
- if type_in_type is not None:
92
- result.append(type_in_type)
93
- else:
94
- result.append(row[1]['type'])
95
- else:
96
- result.append(None)
97
- except Exception as ex:
98
- print(ex)
99
- result.append(None)
100
-
101
- items['new_type']=result
102
- items['new_type']=items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
103
-
104
-
105
- def trim_name(text, words_to_remove):
106
- """
107
- Удаляет из текста только те слова, которые полностью совпадают с элементами списка words_to_remove.
108
-
109
- :param text: Исходная строка.
110
- :param words_to_remove: Список слов, которые необходимо удалить.
111
- :return: Обновлённая строка с удалёнными словами.
112
- """
113
- # Создаём регулярное выражение, которое ищет любое из указанных слов как отдельное слово.
114
- # Используем re.escape, чтобы экранировать спецсимволы в словах.
115
- pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
116
- #print(pattern)
117
-
118
- # Заменяем найденные полные слова на пустую строку.
119
- new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
120
-
121
- # Убираем лишние пробелы, возникающие после удаления слов.
122
- new_text = re.sub(r'\s+', ' ', new_text).strip()
123
-
124
- return new_text
125
-
126
-
127
- def name_trimmer(df, prcess_text, types_and_others):
128
- result={}
129
- gbs=[]
130
- sours=[]
131
- for idx, row in tqdm(df.iterrows()):
132
- text, alcohol, volume_or_number, years, production_year, gb, color, sour=prcess_text(str(row['name']))
133
- text=trim_name(text, types_and_others).replace(',','').replace('.','')
134
- result[row['id']]=text.lower().strip() #remove_l(text).lower().strip()
135
-
136
- gbs.append(gb)
137
- sours.append(sour)
 
 
 
 
 
 
 
 
 
 
 
 
138
  return result, gbs, sours
 
1
+ import re
2
+
3
+ from tqdm import tqdm
4
+
5
+ '''def get_delimiter(file_path):
6
+ with open(file_path, 'r') as f:
7
+ sample = f.read(1024) # читаем часть файла для анализа
8
+ dialect = csv.Sniffer().sniff(sample)
9
+ return dialect.delimiter'''
10
+
11
def get_delimiter(file_path):
    """
    Detect the column delimiter of a CSV file from its first line.

    Counts occurrences of each candidate delimiter (',', ';', tab, '|') in
    the header line and returns the most frequent one.  This is more robust
    than the previous fixed priority order, where e.g. a tab-separated
    header whose column names contain a comma was misdetected as ','.

    :param file_path: Path to the CSV file.
    :return: The detected delimiter character.
    :raises ValueError: If none of the candidate delimiters occur.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        header_line = f.readline()

    counts = {sep: header_line.count(sep) for sep in (',', ';', '\t', '|')}
    best = max(counts, key=counts.get)
    if counts[best] > 0:
        return best

    raise ValueError(None, "Error parsing CSV file. Cannot detect delimiter")
24
+
25
def remove_quotes(text):
    """Strip every single and double quote character from *text*."""
    quote_chars = r'["\']'
    return re.sub(quote_chars, '', text)
27
+
28
+
29
def remove_l(text):
    """
    Drop the standalone litre marker 'л' (any case) from *text* and tidy
    the spacing left behind.
    """
    without_marker = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
    # Collapse the double spaces the removal can leave and trim the ends.
    return re.sub(r'\s{2,}', ' ', without_marker).strip()
35
+
36
+
37
def clean_wine_name(name):
    """
    Remove a single trailing one-letter word (Latin or Cyrillic) from *name*.

    For example, "токай л" becomes "токай".  Letters that are part of a
    longer word are left untouched because of the word-boundary anchors.
    """
    # \s+        one or more spaces before the stray letter
    # \b...\b    exactly one Latin/Cyrillic letter as a whole word
    # \s*$       optional trailing spaces up to end of string
    trailing_single_letter = r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$'
    return re.sub(trailing_single_letter, '', name)
49
+
50
+
51
def find_full_word(text, word_list):
    """
    Return the first entry of *word_list* that occurs in *text* as a whole
    word (case-insensitive), or None when nothing matches.
    """
    for candidate in word_list:
        whole_word = r'\b%s\b' % re.escape(candidate)
        if re.search(whole_word, text, re.IGNORECASE):
            return candidate
    return None
61
+
62
+
63
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """
    Derive a normalized wine colour for every row of *items*.

    For each row the colour is looked up first in the 'type_wine' column
    and then, as a fallback, in the 'name' column; the result lands in a
    new 'new_type_wine' column and is canonicalised via *color_merge_dict*.

    Fix: missing CSV cells are NaN floats, not None, so the previous
    `is not None` check let them through into `find_full_word`, which
    raised and skipped the name fallback.  Non-string cells are now
    treated as missing.

    :param items: DataFrame with 'type_wine' and 'name' columns; modified in place.
    :param colors: List of colour words to search for.
    :param color_merge_dict: Mapping of colour synonyms to canonical values.
    """
    def _first_color(row):
        # Search 'type_wine' first, then 'name'; skip non-string (NaN/None) cells.
        for field in ('type_wine', 'name'):
            value = row[field]
            if not isinstance(value, str):
                continue
            color = find_full_word(value, colors)
            if color is not None:
                return color
        return None

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            result.append(_first_color(row))
        except Exception as ex:
            # Keep the original best-effort behaviour: log and move on.
            print(ex)
            result.append(None)

    items['new_type_wine'] = result
    items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
89
+
90
+
91
def merge_types(items, products):
    """
    Assign each row of *items* a normalized alcohol type in a new
    'new_type' column.

    The type is resolved in priority order: a full-word match in the item
    name, then a full-word match in the item's own 'type' column, then the
    raw 'type' value itself; unresolved rows become 'unmatched'.

    Fix: `products['type'].unique()` can contain NaN floats, which crashed
    `.strip()`; likewise NaN cells in items only survived via the broad
    except.  Non-string values are now skipped explicitly.

    :param items: DataFrame with 'name' and 'type' columns; modified in place.
    :param products: DataFrame whose 'type' column supplies the vocabulary.
    """
    # Build the vocabulary of known types, ignoring NaN cells.
    alco_types = [t.strip().lower() for t in products['type'].unique()
                  if isinstance(t, str)]
    alco_types.append('ликёр')

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            name = row['name']
            type_in_name = find_full_word(name, alco_types) if isinstance(name, str) else None
            if type_in_name is not None:
                result.append(type_in_name)
                continue

            own_type = row['type']
            if isinstance(own_type, str):
                # Fall back to the raw value when it matches no known type.
                type_in_type = find_full_word(own_type, alco_types)
                result.append(type_in_type if type_in_type is not None else own_type)
            else:
                result.append(None)
        except Exception as ex:
            print(ex)
            result.append(None)

    items['new_type'] = result
    # Unify the 'ё' spelling and mark unresolved rows explicitly.
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
115
+
116
+
117
def trim_name(text, words_to_remove):
    """
    Remove every whole-word occurrence of the given words from *text*.

    Matching is case-insensitive and respects word boundaries, so words
    embedded inside longer words are kept.  Whitespace left over by the
    removals is collapsed to single spaces and the result is trimmed.

    :param text: Source string.
    :param words_to_remove: Words to delete from the string.
    :return: The cleaned string.
    """
    # Escape each word so regex metacharacters are matched literally.
    escaped = (re.escape(word) for word in words_to_remove)
    word_pattern = r'\b(?:%s)\b' % '|'.join(escaped)

    # Drop the listed words, then normalise the remaining whitespace.
    stripped = re.sub(word_pattern, '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', stripped).strip()
137
+
138
+
139
def name_trimmer(df, prcess_text, types_and_others):
    """
    Produce trimmed, lower-cased names for every row of *df*.

    Each name is decomposed by *prcess_text*, stripped of the words in
    *types_and_others*, and cleared of commas and periods.

    :param df: DataFrame with 'id' and 'name' columns.
    :param prcess_text: Callable decomposing a raw name into
        (text, alcohol, volume_or_number, years, production_year, gb,
        color, sour).
    :param types_and_others: Words to strip from the decomposed name.
    :return: (id -> cleaned name dict, list of gb flags, list of sour values).
    """
    trimmed_by_id = {}
    gb_flags = []
    sour_values = []

    for _, row in tqdm(df.iterrows()):
        (text, _alcohol, _volume_or_number, _years,
         _production_year, gb, _color, sour) = prcess_text(str(row['name']))

        cleaned = trim_name(text, types_and_others)
        cleaned = cleaned.replace(',', '').replace('.', '')
        trimmed_by_id[row['id']] = cleaned.lower().strip()

        gb_flags.append(gb)
        sour_values.append(sour)

    return trimmed_by_id, gb_flags, sour_values
processor/matching.py CHANGED
@@ -1,159 +1,159 @@
1
- from tqdm import tqdm
2
- from transliterate import translit, detect_language
3
- import pandas as pd
4
- from rapidfuzz import fuzz, process
5
-
6
-
7
- def normalize_name(name):
8
- """
9
- Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
10
- приводит к нижнему регистру.
11
- """
12
- try:
13
- if detect_language(name) == 'ru':
14
- return translit(name, 'ru', reversed=True).lower()
15
- except Exception:
16
- pass
17
- return name.lower()
18
-
19
- def prepare_groups_with_ids(items_df):
20
- """
21
- Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
22
- с учетом нормализованного названия.
23
-
24
- Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
25
-
26
- :param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
27
- :return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
28
- """
29
- items_df = items_df.copy()
30
- items_df['norm_name'] = items_df['name'].apply(normalize_name)
31
-
32
- grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
33
- lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
34
- ).to_dict()
35
- return grouped
36
-
37
- def prepare_groups_by_alternative_keys(items_df):
38
- """
39
- Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
40
- оригинального и нормализованного имени.
41
-
42
- :param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
43
- :return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
44
- """
45
- items_df = items_df.copy()
46
- items_df['norm_name'] = items_df['name'].apply(normalize_name)
47
-
48
- grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
49
- lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
50
- ).to_dict()
51
- return grouped
52
-
53
- def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
54
- """
55
- Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
56
- нормализованные группы.
57
-
58
- Производится два прохода:
59
- - Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
60
- - Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
61
- исключая итемы с исходным брендом.
62
-
63
- Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
64
-
65
- :param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
66
- :param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
67
- :param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
68
- :param name_threshold: Порог сходства для fuzzy matching.
69
- :return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
70
- """
71
- results = []
72
- no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
73
-
74
- # Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
75
- for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
76
- product_brand = product['brand']
77
- product_type = product['type']
78
- product_name = product['name']
79
- product_volume = product['volume']
80
- product_type_wine = product['new_type_wine']
81
- product_sour = product['sour']
82
-
83
- key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
84
- items_data = items_groups.get(key, [])
85
- if items_data:
86
- # Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
87
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
88
- else:
89
- items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
90
-
91
- norm_product_name = normalize_name(product_name)
92
- matches = process.extract(
93
- norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
94
- )
95
- matched_items = [
96
- {
97
- 'item_id': items_ids[idx_candidate],
98
- 'item_name': items_names[idx_candidate],
99
- 'score': score,
100
- 'volume': items_volumes[idx_candidate],
101
- 'color': item_type_wine[idx_candidate],
102
- 'sour': items_sour[idx_candidate],
103
- 'year': items_year[idx_candidate],
104
- }
105
- for match, score, idx_candidate in matches
106
- ]
107
-
108
- if not matched_items:
109
- no_match_products.append((idx, product))
110
-
111
- results.append({
112
- 'product_id': product['id'],
113
- 'matched_items': matched_items,
114
- 'alternative': [] # Заполняется во втором проходе
115
- })
116
-
117
- # Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
118
- groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
119
-
120
- # Второй проход: для продуктов без совпадений ищем по альтернативным группам
121
- for idx, product in tqdm(no_match_products):
122
- product_brand = product['brand']
123
- product_type_wine = product['new_type_wine']
124
- product_type = product['new_type']
125
- product_volume = product['volume']
126
- product_name = product['name']
127
- product_sour = product['sour']
128
-
129
- alt_key = (product_type_wine, product_type, product_volume, product_sour)
130
- type_items = groups_by_alternative_keys.get(alt_key, [])
131
- # Фильтруем, исключая итемы с исходным брендом
132
- filtered_items = [item for item in type_items if item[1] != product_brand]
133
- if filtered_items:
134
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
135
- else:
136
- alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
137
-
138
- norm_product_name = normalize_name(product_name)
139
- alt_matches = process.extract(
140
- norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
141
- )
142
- alt_matched_items = [
143
- {
144
- 'item_id': alt_ids[idx_candidate],
145
- 'item_name': alt_names[idx_candidate],
146
- 'score': score,
147
- 'volume': alt_volumes[idx_candidate],
148
- 'color': alt_type_wine[idx_candidate],
149
- 'sour': alt_sour[idx_candidate],
150
- 'year': alt_year[idx_candidate],
151
- }
152
- for match, score, idx_candidate in alt_matches
153
- ]
154
-
155
- results[idx]['alternative'] = alt_matched_items
156
-
157
- results_df = pd.DataFrame(results)
158
- merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
159
  return merged_df
 
1
+ from tqdm import tqdm
2
+ from transliterate import translit, detect_language
3
+ import pandas as pd
4
+ from rapidfuzz import fuzz, process
5
+
6
+
7
def normalize_name(name):
    """
    Normalize a name for fuzzy comparison: text detected as Russian is
    transliterated to Latin; the result is always lower-cased.
    """
    normalized = name.lower()
    try:
        if detect_language(name) == 'ru':
            normalized = translit(name, 'ru', reversed=True).lower()
    except Exception:
        # Any detection/transliteration problem falls back to plain lower-casing.
        pass
    return normalized
18
+
19
def prepare_groups_with_ids(items_df):
    """
    Group items by (new_brand, type, volume, new_type_wine, sour).

    A 'norm_name' column is computed once up front so fuzzy matching does
    not re-normalize names on every comparison.

    :param items_df: DataFrame with 'new_brand', 'type', 'name', 'id',
        'volume', 'new_type_wine', 'sour' and 'year' columns.
    :return: Dict mapping each key tuple to a list of
        (id, name, norm_name, volume, new_type_wine, sour, year) tuples.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    key_columns = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    value_columns = ['id', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour', 'year']

    grouped = frame.groupby(key_columns).apply(
        lambda g: list(zip(*(g[col] for col in value_columns)))
    )
    return grouped.to_dict()
36
+
37
def prepare_groups_by_alternative_keys(items_df):
    """
    Group items by (new_type_wine, new_type, volume, sour), keeping id,
    new_brand and both the original and normalized names.

    :param items_df: DataFrame with 'new_brand', 'new_type_wine', 'new_type',
        'volume', 'name', 'id', 'sour' and 'year' columns.
    :return: Dict mapping each key tuple to a list of
        (id, new_brand, name, norm_name, volume, new_type_wine, sour, year)
        tuples.
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    key_columns = ['new_type_wine', 'new_type', 'volume', 'sour']
    value_columns = ['id', 'new_brand', 'name', 'norm_name',
                     'volume', 'new_type_wine', 'sour', 'year']

    grouped = frame.groupby(key_columns).apply(
        lambda g: list(zip(*(g[col] for col in value_columns)))
    )
    return grouped.to_dict()
52
+
53
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """
    Find matching items for every product, keeping the matched item ids.

    Two passes are performed:
      1. Search inside the pre-built groups keyed by
         (brand, type, volume, new_type_wine, sour).
      2. For products that found nothing, search alternative groups keyed
         by (new_type_wine, new_type, volume, sour), excluding items that
         carry the product's own brand.

    Comparison uses the pre-computed 'norm_name'; the original 'name' is
    kept for display.

    Fix: the second pass previously addressed `results` by the DataFrame
    index, which is only correct for a default RangeIndex; the list
    position is stored instead.

    :param products_df: DataFrame with 'id', 'brand', 'type', 'name',
        'volume', 'new_type_wine', 'sour' and 'new_type' columns.
    :param items_groups: Dict produced by prepare_groups_with_ids.
    :param items_df: Items DataFrame with 'id', 'new_brand', 'new_type_wine',
        'new_type', 'volume', 'name', 'sour' and 'year' columns.
    :param name_threshold: Minimum fuzzy-match score to accept.
    :return: products_df with added 'matched_items' and 'alternative' columns.
    """
    results = []
    no_match_products = []  # (position in `results`, product row) for pass 2

    # First pass: match within (brand, type, volume, new_type_wine, sour) groups.
    for _, product in tqdm(products_df.iterrows(), total=len(products_df)):
        key = (product['brand'], product['type'], product['volume'],
               product['new_type_wine'], product['sour'])
        items_data = items_groups.get(key, [])
        if items_data:
            # Unpack: id, original name, normalized name, volume, color, sour, year.
            (items_ids, items_names, items_norm_names, items_volumes,
             item_type_wine, items_sour, items_year) = zip(*items_data)
        else:
            (items_ids, items_names, items_norm_names, items_volumes,
             item_type_wine, items_sour, items_year) = ([], [], [], [], [], [], [])

        norm_product_name = normalize_name(product['name'])
        matches = process.extract(
            norm_product_name, list(items_norm_names), scorer=fuzz.ratio,
            score_cutoff=name_threshold
        )
        matched_items = [
            {
                'item_id': items_ids[idx_candidate],
                'item_name': items_names[idx_candidate],
                'score': score,
                'volume': items_volumes[idx_candidate],
                'color': item_type_wine[idx_candidate],
                'sour': items_sour[idx_candidate],
                'year': items_year[idx_candidate],
            }
            for match, score, idx_candidate in matches
        ]

        if not matched_items:
            # Store the *list position*, not the DataFrame index: the index is
            # not guaranteed to be 0..n-1 (e.g. after filtering rows), and it
            # is used below to address the `results` list directly.
            no_match_products.append((len(results), product))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': []  # filled in by the second pass
        })

    # Alternative grouping keyed by (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: products without matches search the alternative groups.
    for pos, product in tqdm(no_match_products):
        product_brand = product['brand']
        alt_key = (product['new_type_wine'], product['new_type'],
                   product['volume'], product['sour'])
        type_items = groups_by_alternative_keys.get(alt_key, [])
        # Exclude items that belong to the product's own brand.
        filtered_items = [item for item in type_items if item[1] != product_brand]
        if filtered_items:
            (alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes,
             alt_type_wine, alt_sour, alt_year) = zip(*filtered_items)
        else:
            (alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes,
             alt_type_wine, alt_sour, alt_year) = ([], [], [], [], [], [], [], [])

        norm_product_name = normalize_name(product['name'])
        alt_matches = process.extract(
            norm_product_name, list(alt_norm_names), scorer=fuzz.ratio,
            score_cutoff=name_threshold
        )
        alt_matched_items = [
            {
                'item_id': alt_ids[idx_candidate],
                'item_name': alt_names[idx_candidate],
                'score': score,
                'volume': alt_volumes[idx_candidate],
                'color': alt_type_wine[idx_candidate],
                'sour': alt_sour[idx_candidate],
                'year': alt_year[idx_candidate],
            }
            for match, score, idx_candidate in alt_matches
        ]

        results[pos]['alternative'] = alt_matched_items

    results_df = pd.DataFrame(results)
    merged_df = products_df.merge(results_df, left_on='id',
                                  right_on='product_id').drop(columns=['product_id'])
    return merged_df
processor/processor.py CHANGED
@@ -1,32 +1,28 @@
1
- from preprocess.preprocess import Preprocessor
2
- from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
3
-
4
-
5
- class Processor():
6
- def __init__(self, long_types_list, short_types_list, sour_list,
7
- type_wine, gbs, colors_for_trim, grapes, other_words,
8
- sour_merge_dict, type_merge_dict, color_merge_dict):
9
-
10
- self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
11
- type_wine, gbs, colors_for_trim, grapes, other_words,
12
- sour_merge_dict, type_merge_dict, color_merge_dict)
13
-
14
- def process(self, products, items, is_items_first=False, th=65):
15
- items, products=self.preprocessor.process(products, items)
16
-
17
- print('-----*-----Matching-----*-----')
18
-
19
- if is_items_first:
20
- products['new_brand']=products['brand']
21
- items['brand']=items['new_brand']
22
- products_groups = prepare_groups_with_ids(products)
23
- res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
24
- else:
25
- items_groups = prepare_groups_with_ids(items)
26
- res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
27
-
28
- return res.drop(['type','type_wine','alco','gb'], axis=1), items, products #'year',
29
-
30
-
31
-
32
-
 
1
+ from preprocess.preprocess import Preprocessor
2
+ from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
3
+
4
+
5
class Processor():
    """
    End-to-end pipeline: preprocesses raw product/item tables, then
    fuzzy-matches them against each other.
    """

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        # All configuration is forwarded unchanged to the Preprocessor.
        self.preprocessor = Preprocessor(long_types_list, short_types_list, sour_list,
                                         type_wine, gbs, colors_for_trim, grapes, other_words,
                                         sour_merge_dict, type_merge_dict, color_merge_dict)

    def process(self, products, items, is_items_first=False, th=65):
        """
        Run preprocessing and matching.

        :param products: Raw products DataFrame.
        :param items: Raw items DataFrame.
        :param is_items_first: When True, items act as the query side and
            are matched into product groups instead of the default direction.
        :param th: Fuzzy-matching score threshold.
        :return: (matching result, preprocessed items, preprocessed products).
        """
        items, products = self.preprocessor.process(products, items)

        print('-----*-----Matching-----*-----')

        if is_items_first:
            # Align the brand columns so both tables use the same key name.
            products['new_brand'] = products['brand']
            items['brand'] = items['new_brand']
            groups = prepare_groups_with_ids(products)
            matched = new_find_matches_with_ids(items, groups, products, name_threshold=th)
        else:
            groups = prepare_groups_with_ids(items)
            matched = new_find_matches_with_ids(products, groups, items, name_threshold=th)

        # Service columns are only needed during matching.
        return matched.drop(['type', 'type_wine', 'alco', 'gb'], axis=1), items, products
 
 
 
 
requirements.txt CHANGED
@@ -1,7 +1,7 @@
1
- python-Levenshtein
2
- transliterate
3
- rapidfuzz
4
- pyahocorasick
5
- unidecode
6
- pqdm
7
  tqdm
 
1
+ python-Levenshtein
2
+ transliterate
3
+ rapidfuzz
4
+ pyahocorasick
5
+ unidecode
6
+ pqdm
7
  tqdm
search/search_by_id.py CHANGED
@@ -1,24 +1,53 @@
1
- import json
2
- import pandas as pd
3
- import ast
4
-
5
-
6
- class Searcher():
7
- def __init__(self):
8
- self.df = None
9
- def set_df(self, df):
10
- self.df = df
11
- try:
12
- self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
13
- except Exception as e:
14
- print(e)
15
- #self.df['matched_items'] = self.df['matched_items'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)
16
- def search(self, query):
17
- data = json.loads(json.dumps(self.df[self.df['id']==query]['matched_items'].values[0]))
18
- return pd.DataFrame(data)
19
-
20
- def search_in_uploaded_file(self, path, query):
21
- matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
22
- self.set_df(matching_result)
23
- result=self.search(query)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  return result
 
1
+ import json
2
+ import pandas as pd
3
+ import ast
4
+
5
+
6
class Searcher():
    """
    Looks up matching results for a product/item id in a saved matching CSV.
    """

    def __init__(self):
        # The currently loaded matching-result table (set via set_df).
        self.df = None

    def set_df(self, df):
        """
        Store *df* and decode the 'matched_items' / 'alternative' columns,
        which are serialized as Python-literal strings in the CSV.

        Each column is decoded independently, so a parse failure in one
        does not skip the other.
        """
        self.df = df
        for column in ('matched_items', 'alternative'):
            try:
                self.df[column] = self.df[column].apply(
                    lambda x: ast.literal_eval(x) if pd.notna(x) else x
                )
            except Exception as e:
                # Best effort: leave the column as-is if it cannot be parsed.
                print(e)

    def search(self, resultfn, query):
        """
        Load the matching results from *resultfn* and return matches for
        the id *query*.

        :param resultfn: Path to a tab-separated matching-result CSV.
        :param query: Product/item id to look up.
        :return: (DataFrame of matches, True when the matches came from the
            'alternative' column because 'matched_items' was empty).
        """
        is_alternative_items = False
        df_matched_items = pd.DataFrame()

        matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)

        items = self.df[self.df['id'] == query]
        matched_items = items['matched_items']
        if (len(matched_items) != 0) and (len(matched_items.values[0])):
            data = json.loads(json.dumps(matched_items.values[0]))
            df_matched_items = pd.DataFrame(data)
        else:
            alter_items = items['alternative']
            if (len(alter_items) != 0) and (len(alter_items.values[0])):
                data = json.loads(json.dumps(alter_items.values[0]))
                df_matched_items = pd.DataFrame(data)
                is_alternative_items = True

        return (df_matched_items, is_alternative_items)

    def search_in_uploaded_file(self, path, query):
        """
        Search *query* in the matching-result file at *path*.

        Fix: the previous code called self.search(query) with one argument,
        which always raised TypeError — the older one-argument search() had
        been shadowed by the two-argument definition above (the dead
        duplicate is removed here).
        """
        return self.search(path, query)
tmp/prod.csv CHANGED
@@ -1 +1 @@
1
- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
 
1
+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
tmp/service/prod.csv CHANGED
@@ -1 +1 @@
1
- id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
 
1
+ id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
tmp/utils.py CHANGED
@@ -1,37 +1,48 @@
1
- import pandas as pd
2
- from preprocess.utils.common.utils import get_delimiter
3
- from glob import glob
4
- import shutil
5
- import os
6
-
7
-
8
- def update_products_csv(new_csv_path, main_csv_path='/home/user/app/tmp/prod.csv'):
9
- main_sep=get_delimiter(main_csv_path)
10
- main_csv=pd.read_csv(main_csv_path, sep=main_sep)
11
- new_sep=get_delimiter(new_csv_path)
12
- new_csv=pd.read_csv(new_csv_path, sep=new_sep)
13
- result=pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
14
- result.to_csv(main_csv_path, sep=main_sep, index=False)
15
-
16
- def is_csv_exist(path):
17
- file_list=glob(path+'/*.csv')
18
- if len(file_list)>0:
19
- return file_list[0]
20
- else:
21
- None
22
-
23
-
24
- def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
25
- main_path=is_csv_exist(main_dir)
26
- if main_path==None:
27
- new_path = shutil.move(new_path, main_dir)
28
- return new_path
29
- else:
30
- update_products_csv(main_path, new_path)
31
- return main_path
32
-
33
- def remover():
34
- #path=is_csv_exist('/home/user/app/tmp/prod.csv')
35
- #if path!=None:
36
- os.remove(os.getcwd()+'/tmp/prod.csv')
37
- shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from preprocess.utils.common.utils import get_delimiter
3
+ import shutil
4
+ import os
5
+
6
+
7
def update_products_csv(new_csv_path, prods_file, overwrite_existing):
    """
    Merge (or copy) an uploaded products CSV into the main products file.

    When *prods_file* already exists and *overwrite_existing* is False, the
    uploaded rows are appended and duplicates (by 'id') are dropped keeping
    the newest row.  Otherwise the uploaded file simply replaces the main
    one.

    Fix: the uploaded CSV was read twice (once per branch) and validated
    only in the merge branch; the read is hoisted and validation now
    applies to both branches.

    :param new_csv_path: Path of the uploaded CSV.
    :param prods_file: Path of the main products CSV.
    :param overwrite_existing: Replace the main file instead of merging.
    :return: *prods_file*, the path of the resulting CSV.
    :raises Exception: If the uploaded CSV does not look like a products
        file.
    """
    # Read the uploaded file once, up front, for both branches.
    new_sep = get_delimiter(new_csv_path)
    new_csv = pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")
    # NOTE(review): an 'attrs' column is treated as a marker of a
    # non-products CSV — confirm this matches the items-file schema.
    if 'attrs' in new_csv.columns.values:
        raise Exception("Uploaded Products CSV does not seem to be valid")

    if os.path.isfile(prods_file) and not overwrite_existing:
        main_sep = get_delimiter(prods_file)
        main_csv = pd.read_csv(prods_file, sep=main_sep, on_bad_lines="warn")
        result = pd.concat([main_csv, new_csv]).drop_duplicates(
            subset='id', keep='last').reset_index(drop=True)
        result.to_csv(prods_file, sep=main_sep, index=False)
    else:
        new_csv.to_csv(prods_file, sep=new_sep, index=False)

    return prods_file
+
26
+
27
+ '''def is_csv_exist(path):
28
+ file_list=glob(path+'/*.csv')
29
+ if len(file_list)>0:
30
+ return file_list[0]
31
+ else:
32
+ None
33
+
34
+
35
+ def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
36
+ main_path=is_csv_exist(main_dir)
37
+ if main_path==None:
38
+ new_path = shutil.move(new_path, main_dir)
39
+ return new_path
40
+ else:
41
+ update_products_csv(main_path, new_path)
42
+ return main_path
43
+
44
+ def remover(data_path):
45
+ #path=is_csv_exist('/home/user/app/tmp/prod.csv')
46
+ #if path!=None:
47
+ os.remove(os.getcwd()+'/tmp/prod.csv')
48
+ shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''
ui/gradio_ui.py CHANGED
@@ -1,121 +1,170 @@
1
- import gradio as gr
2
- import pandas as pd
3
- import tempfile
4
- from preprocess.utils.common.utils import get_delimiter
5
- from tmp.utils import uploader, remover, update_products_csv
6
- from glob import glob
7
- import os
8
-
9
-
10
- class GradioUI():
11
-
12
- def __init__(self, processor, searcher=None):
13
- self.processor=processor
14
- self.searcher=searcher
15
-
16
-
17
-
18
- def process_files(self, file1, file2, is_items_first, threshold): #, q_id):
19
- try:
20
- print(file1)
21
-
22
- print()
23
- print(os.getcwd())
24
- print(os.path.dirname(os.path.abspath(__file__)))
25
- print()
26
-
27
- if file1!=None:
28
- #file1=uploader(file1)
29
- update_products_csv(file1)
30
- #else:
31
- #file1=glob('./home/user/app/tmp/*.csv')[0]
32
- file1=os.getcwd()+'/tmp/prod.csv'
33
-
34
- #print()
35
- #print(file1)
36
- #print()
37
-
38
- if file2!=None:
39
- items_delimiter=get_delimiter(file2)
40
- print('items delimiter: '+items_delimiter)
41
- row_items=pd.read_csv(file2, sep=items_delimiter, on_bad_lines='skip')
42
-
43
- products_delimiter=get_delimiter(file1)
44
- print('products delimiter: '+products_delimiter)
45
- row_products=pd.read_csv(file1, sep=products_delimiter, on_bad_lines='skip')
46
-
47
- # if q_id in row_products['id'].unique():
48
- # row_products=row_products[row_products['id']==q_id]
49
-
50
- #print("product id: " + str(q_id))
51
-
52
- df, items, products= self.processor.process(row_products, row_items, is_items_first, threshold)
53
- # Создаём временный CSV файл для сохранения результата
54
-
55
- self.searcher.set_df(df.copy())
56
-
57
- with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
58
- output_csv = tmp.name
59
- df.to_csv(output_csv, sep='\t', index=False)
60
- return output_csv
61
- except Exception as ex:
62
- raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)
63
- return None
64
-
65
-
66
- def run_ui(self):
67
- with gr.Blocks() as demo:
68
- with gr.Tabs():
69
-
70
- # with gr.Row():
71
- # file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
72
- # process_button = gr.Button("Обновить")
73
-
74
- # Вкладка для обработки CSV файлов
75
- with gr.TabItem("Обработка CSV файлов"):
76
- gr.Markdown("## Обработка CSV файлов")
77
- with gr.Row():
78
- file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
79
- file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
80
- #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
81
- with gr.Row():
82
- toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
83
- threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
84
- process_button = gr.Button("Обработать файлы")
85
- output_file = gr.File(label="Скачать результат (CSV)")
86
- process_button.click(
87
- fn=self.process_files,
88
- inputs=[file_input1, file_input2, toggle_input, threshold_input], #, search_number],
89
- outputs=output_file
90
- )
91
-
92
- # Вкладка для поиска
93
- with gr.TabItem("Поиск в обработанном csv"):
94
- gr.Markdown("## Поиск")
95
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
96
- search_button = gr.Button("Найти")
97
- search_table = gr.Dataframe(label="Результаты поиска")
98
- search_button.click(
99
- fn=self.searcher.search,
100
- inputs=[search_number],
101
- outputs=search_table
102
- )
103
-
104
- with gr.TabItem("Загрузка результат и поиск в нем"):
105
- gr.Markdown("## Поиск")
106
- with gr.Row():
107
- input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
108
- search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
109
- search_button = gr.Button("Найти")
110
- search_table = gr.Dataframe(label="Результаты поиска")
111
- search_button.click(
112
- fn=self.searcher.search_in_uploaded_file,
113
- inputs=[input_path, search_number],
114
- outputs=search_table
115
- )
116
-
117
- with gr.TabItem("Удалить сохраненные продукты"):
118
- del_button = gr.Button("Удалить")
119
- process_button.click(fn=remover)
120
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  demo.launch()
 
1
+ from argparse import ArgumentError
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ from preprocess.utils.common.utils import get_delimiter
6
+ from tmp.utils import remover, update_products_csv
7
+ import os
8
+ import datetime, time
9
+
10
+
11
class GradioUI():
    """Gradio front-end for the product/items matching pipeline.

    Wires three collaborators together:
      * ``processor`` — produces the matching result DataFrame
        (via ``processor.process(...)``).
      * ``searcher``  — holds the last result (``set_df``) and serves
        lookups (``search`` / ``search_in_uploaded_file``).
      * ``data_path`` — root directory under which ``products/``,
        ``items/`` and ``results/`` subdirectories live.
    """

    def __init__(self, processor, searcher, data_path):
        # Matching engine used by process_items().
        self.processor=processor
        # Search backend; receives a copy of each new result DataFrame.
        self.searcher=searcher
        # Root data directory for products/items/results subfolders.
        self.data_path = data_path

    def get_data_dir(self):
        """Return the root data directory."""
        return self.data_path

    def get_products_dir(self):
        """Return the directory holding the persistent products.csv."""
        return os.path.join(self.get_data_dir(), "products")

    def get_items_dir(self):
        """Return the directory intended for uploaded items files."""
        return os.path.join(self.get_data_dir(), "items")

    def get_results_dir(self):
        """Return the directory where result CSVs are written."""
        return os.path.join(self.get_data_dir(), "results")

    def get_products_file_date(self):
        """Return the mtime of products.csv as a human-readable string.

        Returns a Russian "file not found" message (used verbatim in the
        UI) when the file does not exist yet.
        """
        fullfn = os.path.join(self.data_path, "products", "products.csv")
        if not os.path.isfile(fullfn):
            return "Файл Products не найден"

        stinfo = os.stat(fullfn)
        # ctime() gives a locale-independent readable timestamp.
        return time.ctime(stinfo.st_mtime)


    def upload_products_file(self, prods_file, overwrite_existing):
        """Gradio callback: store/merge an uploaded Products CSV.

        Any failure is re-raised as gr.Error so it surfaces as a toast
        in the UI instead of a silent server-side traceback.
        """
        try:
            if not os.path.exists(self.get_products_dir()):
                os.makedirs(self.get_products_dir())

            fullfn = os.path.join(self.get_products_dir(), "products.csv")

            if prods_file != None:
                update_products_csv(prods_file, fullfn, overwrite_existing)

            # NOTE(review): this success toast is shown even when
            # prods_file is None (nothing uploaded) — confirm intended.
            gr.Info("Файл Products успешно загружен")
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)


    def process_items(self, items_file, is_items_first, threshold): #, q_id):
        """Gradio callback: match an uploaded Items CSV against products.

        Returns the path of the result CSV (tab-separated) so the UI's
        gr.File output offers it for download. Errors surface as gr.Error.
        """
        try:
            prods_file = os.path.join(self.get_products_dir(), "products.csv")
            if not os.path.isfile(prods_file):
                raise Exception("Файл Products не найден")

            # NOTE(review): if items_file is None, row_items is never
            # bound and processor.process() below raises NameError,
            # which is then wrapped in gr.Error — confirm this is the
            # intended "no file" behavior.
            if items_file != None:
                items_delimiter=get_delimiter(items_file)
                print('items delimiter: '+items_delimiter)
                row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
                # Items files are expected to carry an 'attrs' column.
                if not 'attrs' in row_items.columns.values:
                    raise Exception("Uploaded Items CSV does not seem to be valid")

            products_delimiter=get_delimiter(prods_file)
            print('products delimiter: '+products_delimiter)
            row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')

            # if q_id in row_products['id'].unique():
            #     row_products=row_products[row_products['id']==q_id]

            #print("product id: " + str(q_id))

            df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold)

            # Give the searcher its own copy so later mutations of df
            # (none here, but defensive) cannot affect search results.
            self.searcher.set_df(df.copy())
            #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
            #    output_csv = tmp.name
            results_path = self.get_results_dir()
            if not os.path.exists(results_path):
                os.makedirs(results_path)

            # Result name encodes the threshold and a timestamp, e.g.
            # "m1-50-240131-120000.csv".
            output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
            output_csv = os.path.join(results_path, output_csv)
            df.to_csv(output_csv, sep='\t', index=False)
            return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)

    def on_page_load(self, r: gr.Request):
        """Refresh both Products-file-date markdown widgets on page load.

        Returns two identical strings because demo.load() targets two
        separate Markdown components (one per tab).
        """
        m_time = self.get_products_file_date()
        return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]


    def run_ui(self):
        """Build the Blocks layout, wire callbacks, and launch the app."""
        with gr.Blocks() as demo:
            tabs = gr.Tabs()
            with tabs:

                # with gr.Row():
                #     file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                #     process_button = gr.Button("Обновить")

                with gr.TabItem("Загрузка файла Products"):
                    # NOTE(review): this header markdown is OVERWRITTEN by
                    # on_page_load() via demo.load() below (it becomes the
                    # "last updated" line) — confirm the header loss is
                    # intended.
                    prod_file_info1 = gr.Markdown("## Загрузка файла Products")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
                    upload_button = gr.Button("Загрузить файл")
                    upload_button.click(
                        fn=self.upload_products_file,
                        inputs=[file_input1, toggle_input],
                        #outputs=output_file
                    )

                # Tab for processing the supplier catalog CSV
                with gr.TabItem("Обработка каталога поставщика"):
                    gr.Markdown("## Обработка каталога поставщика")

                    m_time = self.get_products_file_date()
                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
                    with gr.Row():
                        #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
                        #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    with gr.Row():
                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
                        threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_items,
                        inputs=[file_items, toggle_input, threshold_input], #, search_number],
                        outputs=output_file
                    )

                # Tab for searching within the last processed result
                with gr.TabItem("Поиск в обработанном csv"):
                    gr.Markdown("## Поиск")
                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search,
                        inputs=[search_number],
                        outputs=search_table
                    )

                with gr.TabItem("Загрузка результат и поиск в нем"):
                    gr.Markdown("## Поиск")
                    with gr.Row():
                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                        search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search_in_uploaded_file,
                        inputs=[input_path, search_number],
                        outputs=search_table
                    )

                #with gr.TabItem("Удалить сохраненные продукты"):
                #    del_button = gr.Button("Удалить")
                #    process_button.click(fn=remover)

            # Refresh both "last updated" markdown widgets on every page load.
            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
            demo.launch()