Spaces:
Sleeping
Sleeping
Upload 20 files
#3
by
Gainward777 - opened
- .gitattributes +35 -35
- README.md +12 -12
- api.py +189 -0
- app.py +29 -31
- preprocess/preprocess.py +224 -224
- preprocess/utils/common/utils.py +149 -137
- processor/matching.py +158 -158
- processor/processor.py +28 -32
- requirements.txt +6 -6
- search/search_by_id.py +52 -23
- tmp/prod.csv +1 -1
- tmp/service/prod.csv +1 -1
- tmp/utils.py +48 -37
- ui/gradio_ui.py +169 -120
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Product Matching
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.19.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Product Matching
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.19.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
api.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
import datetime
|
| 4 |
+
|
| 5 |
+
from processor.processor import Processor
|
| 6 |
+
from constants.constants import *
|
| 7 |
+
from search.search_by_id import Searcher
|
| 8 |
+
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 9 |
+
import uvicorn
|
| 10 |
+
from pydantic import BaseModel
|
| 11 |
+
import pandas as pd
|
| 12 |
+
from tmp.utils import update_products_csv
|
| 13 |
+
|
| 14 |
+
processor=Processor(LONG_TYPES_LIST,
|
| 15 |
+
SHORT_TYPES_LIST,
|
| 16 |
+
SOUR,
|
| 17 |
+
WINE_TYPES,
|
| 18 |
+
GBS,
|
| 19 |
+
COLORS_FOR_TRIM,
|
| 20 |
+
GRAPES,
|
| 21 |
+
OTHER_WORDS,
|
| 22 |
+
SOUR_MERGE_DICT,
|
| 23 |
+
TYPES_WINES_DICT,
|
| 24 |
+
COLOR_MERGE_DICT)
|
| 25 |
+
|
| 26 |
+
searcher=Searcher()
|
| 27 |
+
|
| 28 |
+
class item_by_id(BaseModel):
|
| 29 |
+
result_file: str
|
| 30 |
+
id: str
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class match_request(BaseModel):
|
| 34 |
+
items: str
|
| 35 |
+
threshold: int
|
| 36 |
+
items_first: int
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_data_dir():
|
| 40 |
+
return "/home/user/app/_data/"
|
| 41 |
+
|
| 42 |
+
def get_products_dir():
|
| 43 |
+
return os.path.join(get_data_dir(), "products")
|
| 44 |
+
|
| 45 |
+
def get_items_dir():
|
| 46 |
+
return os.path.join(get_data_dir(), "items")
|
| 47 |
+
|
| 48 |
+
def get_results_dir():
|
| 49 |
+
return os.path.join(get_data_dir(), "results")
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
app = FastAPI()
|
| 53 |
+
|
| 54 |
+
@app.get("/api/get_result_csv")
|
| 55 |
+
async def get_result_csv():
|
| 56 |
+
results = []
|
| 57 |
+
for file in os.listdir(get_results_dir()):
|
| 58 |
+
if file.endswith(".csv"):
|
| 59 |
+
results.append(file)
|
| 60 |
+
|
| 61 |
+
results_json = json.dumps(results)
|
| 62 |
+
return results_json
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
@app.post("/api/upload_result_csv")
|
| 66 |
+
async def upload_result_csv(file: UploadFile = File(...)):
|
| 67 |
+
try:
|
| 68 |
+
contents = file.file.read()
|
| 69 |
+
|
| 70 |
+
with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
|
| 71 |
+
f.write(contents)
|
| 72 |
+
except Exception:
|
| 73 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 74 |
+
finally:
|
| 75 |
+
file.file.close()
|
| 76 |
+
|
| 77 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@app.post("/api/upload_products_csv")
|
| 81 |
+
async def upload_products_csv(file: UploadFile, overwrite_existing: int):
|
| 82 |
+
try:
|
| 83 |
+
datadir = get_products_dir()
|
| 84 |
+
if not os.path.exists(datadir):
|
| 85 |
+
os.makedirs(datadir)
|
| 86 |
+
|
| 87 |
+
tempfile = os.path.join(datadir, "products.csv_upload")
|
| 88 |
+
|
| 89 |
+
contents = file.file.read()
|
| 90 |
+
|
| 91 |
+
with open(tempfile, 'wb') as f:
|
| 92 |
+
f.write(contents)
|
| 93 |
+
|
| 94 |
+
fullfn = os.path.join(datadir, "products.csv")
|
| 95 |
+
update_products_csv(tempfile, fullfn, overwrite_existing)
|
| 96 |
+
|
| 97 |
+
except Exception:
|
| 98 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 99 |
+
finally:
|
| 100 |
+
file.file.close()
|
| 101 |
+
|
| 102 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
@app.post("/api/upload_items_csv")
|
| 106 |
+
async def upload_items_csv(file: UploadFile = File(...)):
|
| 107 |
+
try:
|
| 108 |
+
itemsdir = get_items_dir()
|
| 109 |
+
|
| 110 |
+
if not os.path.exists(itemsdir):
|
| 111 |
+
os.makedirs(itemsdir)
|
| 112 |
+
|
| 113 |
+
contents = file.file.read()
|
| 114 |
+
|
| 115 |
+
with open(os.path.join(itemsdir, file.filename), 'wb') as f:
|
| 116 |
+
f.write(contents)
|
| 117 |
+
except Exception:
|
| 118 |
+
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 119 |
+
finally:
|
| 120 |
+
file.file.close()
|
| 121 |
+
|
| 122 |
+
return {"message": f"Successfully uploaded {file.filename}"}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
@app.get("/api/get_items_csv")
|
| 126 |
+
async def get_items_csv():
|
| 127 |
+
itemsdir = get_items_dir()
|
| 128 |
+
|
| 129 |
+
results = []
|
| 130 |
+
for file in os.listdir(itemsdir):
|
| 131 |
+
if file.endswith(".csv"):
|
| 132 |
+
results.append(file)
|
| 133 |
+
|
| 134 |
+
results_json = json.dumps(results)
|
| 135 |
+
return results_json
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
@app.post("/api/match")
|
| 139 |
+
async def match(r: match_request):
|
| 140 |
+
prods_file = os.path.join(get_products_dir(), "products.csv")
|
| 141 |
+
if not os.path.isfile(prods_file):
|
| 142 |
+
return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
|
| 143 |
+
|
| 144 |
+
if len(r.items) == 0:
|
| 145 |
+
return {"Status": "Error", "ErrorDesc": "Items file not specified"}
|
| 146 |
+
|
| 147 |
+
if not r.threshold:
|
| 148 |
+
r.threshold = 50
|
| 149 |
+
|
| 150 |
+
items_fn = os.path.join(get_items_dir(), r.items)
|
| 151 |
+
if not os.path.isfile(items_fn):
|
| 152 |
+
return {"Status": "Error", "ErrorDesc": "Items file not found"}
|
| 153 |
+
|
| 154 |
+
row_items = pd.read_csv(items_fn, sep='\t')
|
| 155 |
+
row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
df, items, products = processor.process(row_products, row_items, r.items_first, r.threshold)
|
| 159 |
+
|
| 160 |
+
results_dir = get_results_dir()
|
| 161 |
+
if not os.path.exists(results_dir):
|
| 162 |
+
os.makedirs(results_dir)
|
| 163 |
+
|
| 164 |
+
output_csv = "m1-" + str(r.threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
|
| 165 |
+
df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
|
| 166 |
+
|
| 167 |
+
return {"Status": "Success", "result_file" : output_csv}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@app.get("/api/get_matched_by_id")
|
| 171 |
+
async def get_matched_by_id(item: item_by_id):
|
| 172 |
+
fullfn = os.path.join(get_results_dir(), item.result_file)
|
| 173 |
+
if not os.path.isfile(fullfn):
|
| 174 |
+
return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}
|
| 175 |
+
|
| 176 |
+
(df, is_alternative) = searcher.search(fullfn, int(item.id))
|
| 177 |
+
if df.empty:
|
| 178 |
+
return {"Status": "Success", "IsAlternative": False, "Data": ""}
|
| 179 |
+
|
| 180 |
+
return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
if __name__ == "__main__":
|
| 184 |
+
uvicorn.run(
|
| 185 |
+
app,
|
| 186 |
+
host="0.0.0.0",
|
| 187 |
+
port=8000,
|
| 188 |
+
log_level="debug"
|
| 189 |
+
)
|
app.py
CHANGED
|
@@ -1,31 +1,29 @@
|
|
| 1 |
-
from processor.processor import Processor
|
| 2 |
-
from constants.constants import *
|
| 3 |
-
from ui.gradio_ui import GradioUI
|
| 4 |
-
from search.search_by_id import Searcher
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 1 |
+
from processor.processor import Processor
|
| 2 |
+
from constants.constants import *
|
| 3 |
+
from ui.gradio_ui import GradioUI
|
| 4 |
+
from search.search_by_id import Searcher
|
| 5 |
+
|
| 6 |
+
processor=Processor(LONG_TYPES_LIST,
|
| 7 |
+
SHORT_TYPES_LIST,
|
| 8 |
+
SOUR,
|
| 9 |
+
WINE_TYPES,
|
| 10 |
+
GBS,
|
| 11 |
+
COLORS_FOR_TRIM,
|
| 12 |
+
GRAPES,
|
| 13 |
+
OTHER_WORDS,
|
| 14 |
+
SOUR_MERGE_DICT,
|
| 15 |
+
TYPES_WINES_DICT,
|
| 16 |
+
COLOR_MERGE_DICT)
|
| 17 |
+
|
| 18 |
+
searcher=Searcher()
|
| 19 |
+
|
| 20 |
+
ui=GradioUI(processor, searcher, "/home/user/app/_data/")
|
| 21 |
+
ui.run_ui()
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
|
|
|
|
|
preprocess/preprocess.py
CHANGED
|
@@ -1,224 +1,224 @@
|
|
| 1 |
-
import json
|
| 2 |
-
from tqdm import tqdm
|
| 3 |
-
from preprocess.utils.items.attrs import *
|
| 4 |
-
from preprocess.utils.common.extracters import *
|
| 5 |
-
from preprocess.utils.common.brand_matching import *
|
| 6 |
-
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
-
from preprocess.utils.common.utils import *
|
| 8 |
-
from preprocess.utils.common.top_inserts import *
|
| 9 |
-
import pandas as pd
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class Preprocessor():
|
| 14 |
-
|
| 15 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
-
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 18 |
-
|
| 19 |
-
self.long_types_list=long_types_list
|
| 20 |
-
self.short_types_list=short_types_list
|
| 21 |
-
self.sour=sour_list
|
| 22 |
-
self.type_wine=type_wine
|
| 23 |
-
self.gbs=gbs
|
| 24 |
-
self.colors_ft=colors_for_trim
|
| 25 |
-
self.grapes=grapes
|
| 26 |
-
self.other_words=other_words
|
| 27 |
-
self.types_n_others=long_types_list+other_words
|
| 28 |
-
self.sour_dict=sour_merge_dict
|
| 29 |
-
self.type_dict=type_merge_dict
|
| 30 |
-
self.color_merge_dict=color_merge_dict
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
def process_items(self, df):
|
| 34 |
-
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 35 |
-
#counter=0
|
| 36 |
-
for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
|
| 37 |
-
|
| 38 |
-
try:
|
| 39 |
-
i=json.loads(i)
|
| 40 |
-
result['id'].append(idf)
|
| 41 |
-
if 'brand' in i.keys():
|
| 42 |
-
result['brand'].append(i['brand'])
|
| 43 |
-
else: result['brand'].append(None)
|
| 44 |
-
result['name'].append(i['name'])
|
| 45 |
-
drink_type=get_type(i, self.long_types_list)
|
| 46 |
-
if drink_type is None:
|
| 47 |
-
drink_type=check_spark(i)
|
| 48 |
-
if drink_type is None:
|
| 49 |
-
drink_type=check_color_and_sour(i)
|
| 50 |
-
if drink_type is None:
|
| 51 |
-
drink_type=check_spark(i, col_name='type_wine')
|
| 52 |
-
if drink_type is None:
|
| 53 |
-
drink_type=check_color_and_sour(i, types=self.sour)
|
| 54 |
-
#if 'type' in i.keys():
|
| 55 |
-
result['type'].append(drink_type)#i['type'])
|
| 56 |
-
#else: dd['type'].append(None)
|
| 57 |
-
if 'volume' in i.keys():
|
| 58 |
-
result['volume'].append(i['volume'])
|
| 59 |
-
else:
|
| 60 |
-
vol=extract_volume_or_number(i['name'])
|
| 61 |
-
result['volume'].append(vol)
|
| 62 |
-
if 'year' in i.keys():
|
| 63 |
-
result['year'].append(i['year'])
|
| 64 |
-
else:
|
| 65 |
-
year=extract_production_year(i['name'])
|
| 66 |
-
result['year'].append(year)
|
| 67 |
-
alco=extract_alcohol_content(i['name'])
|
| 68 |
-
if 'type_wine' in i.keys():
|
| 69 |
-
result['type_wine'].append(i['type_wine'])
|
| 70 |
-
else: result['type_wine'].append(None)
|
| 71 |
-
#f alco is not None:
|
| 72 |
-
result['alco'].append(alco)
|
| 73 |
-
#else: dd['type_wine'].append(None)
|
| 74 |
-
except Exception as ex:
|
| 75 |
-
print(idf, ex)
|
| 76 |
-
return pd.DataFrame(result)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
def process_products(self, products):
|
| 80 |
-
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 81 |
-
for idx, row in tqdm(products.iterrows()):
|
| 82 |
-
try:
|
| 83 |
-
result['id'].append(row['id'])
|
| 84 |
-
result['brand'].append(row['brand'])
|
| 85 |
-
result['type_wine'].append(row['category'])
|
| 86 |
-
result['type'].append(row['product_type'])
|
| 87 |
-
result['name'].append(row['name_long'])
|
| 88 |
-
vol=extract_volume_or_number(row['name'])
|
| 89 |
-
result['volume'].append(vol)
|
| 90 |
-
#year=extract_production_year(row['name'])
|
| 91 |
-
year=extract_production_year(str(row['name_postfix']))
|
| 92 |
-
result['year'].append(year)
|
| 93 |
-
#rr['year'].append(row['name_postfix'])
|
| 94 |
-
alco=extract_alcohol_content(row['name'])
|
| 95 |
-
#f alco is not None:
|
| 96 |
-
result['alco'].append(alco)
|
| 97 |
-
except Exception as ex:
|
| 98 |
-
print(ex)
|
| 99 |
-
return pd.DataFrame(result)
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
def prcess_text(self, text):
|
| 103 |
-
#text=''+origin
|
| 104 |
-
#text=str(split_russian_and_english(text))
|
| 105 |
-
gb=find_full_word(text, self.gbs)#get_GB(text)
|
| 106 |
-
if gb is not None:
|
| 107 |
-
text=text.replace(str(gb), '')
|
| 108 |
-
|
| 109 |
-
alcohol = extract_alcohol_content(text)
|
| 110 |
-
if alcohol is not None:
|
| 111 |
-
alco_w_comma=alcohol.replace('.', ',')
|
| 112 |
-
text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
|
| 113 |
-
volume_or_number = extract_volume_or_number(text)
|
| 114 |
-
if volume_or_number is not None:
|
| 115 |
-
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 116 |
-
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 117 |
-
test=clean_wine_name(text) #remove_l(text)
|
| 118 |
-
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 119 |
-
# else:
|
| 120 |
-
# volume_or_number=re_extract_volume(text)
|
| 121 |
-
# if volume_or_number is not None:
|
| 122 |
-
# volume_with_comma=volume_or_number.replace('.', ',')
|
| 123 |
-
# text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 124 |
-
years = extract_years(text)
|
| 125 |
-
if years is not None:
|
| 126 |
-
text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
|
| 127 |
-
production_year = extract_production_year(text)
|
| 128 |
-
if production_year is not None:
|
| 129 |
-
text=text.replace(str(production_year), '')
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
color=find_full_word(text, self.colors_ft)
|
| 133 |
-
if color is not None:
|
| 134 |
-
text=text.replace(str(color), '')
|
| 135 |
-
sour=find_full_word(text, self.sour) #get_sour(text)
|
| 136 |
-
if sour is not None:
|
| 137 |
-
text=text.replace(str(sour), '')
|
| 138 |
-
# re_extracted_volume=re_extract_volume(text)
|
| 139 |
-
# if re_extracted_volume is not None:
|
| 140 |
-
# volume_with_comma=re_extracted_volume.replace('.', ',')
|
| 141 |
-
# text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
|
| 142 |
-
|
| 143 |
-
# else:
|
| 144 |
-
# re_extracted_volume=re_extract_volume(str(volume_or_number))
|
| 145 |
-
# volume_or_number=re_extracted_volume
|
| 146 |
-
|
| 147 |
-
return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
def process(self, products, items):
|
| 151 |
-
|
| 152 |
-
print('------*-----Prepare items catalogue-----*-----')
|
| 153 |
-
items=self.process_items(items.copy())
|
| 154 |
-
print('-----*-----Prepare products catalogue-----*-----')
|
| 155 |
-
products=self.process_products(products.copy())
|
| 156 |
-
|
| 157 |
-
items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
|
| 158 |
-
products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
|
| 159 |
-
|
| 160 |
-
print('-----*-----Split n match-----*-----')
|
| 161 |
-
splited=split_n_match(products, items)
|
| 162 |
-
items["brand"] = items["brand"].replace(splited)
|
| 163 |
-
|
| 164 |
-
print('-----*-----Fill brands in items-----*-----')
|
| 165 |
-
fill_brands_in_dataframe(products['brand'].unique(), items)
|
| 166 |
-
|
| 167 |
-
print('-----*-----Brand matching-----*-----')
|
| 168 |
-
comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
|
| 169 |
-
out_prods=list(set(prod_brand_list)-set(comp_list))
|
| 170 |
-
out_items=list(set(items_brand_list)-set(comp_list))
|
| 171 |
-
brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
|
| 172 |
-
items["new_brand"] = items["new_brand"].replace(brand_map_improved)
|
| 173 |
-
|
| 174 |
-
items['type']=items['type'].replace(self.type_dict)
|
| 175 |
-
|
| 176 |
-
print('-----*-----Unwrap brend cats step 1-----*-----')
|
| 177 |
-
unwrap_b_match=unwrap_brands(products)
|
| 178 |
-
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 179 |
-
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 180 |
-
|
| 181 |
-
print('-----*-----Unwrap brend cats step 2-----*-----')
|
| 182 |
-
unwrap_b_match=unwrap_brands(products)
|
| 183 |
-
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 184 |
-
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 185 |
-
|
| 186 |
-
print('-----*-----Finding brands in names-----*-----')
|
| 187 |
-
items['new_brand']=items['new_brand'].replace('none', None)
|
| 188 |
-
i_brands=items[items['new_brand'].isna()]['name'].values
|
| 189 |
-
p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
|
| 190 |
-
new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
|
| 191 |
-
items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
|
| 192 |
-
|
| 193 |
-
print('-----*-----Top inserts-----*-----')
|
| 194 |
-
process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
|
| 195 |
-
self.grapes, self.other_words)
|
| 196 |
-
|
| 197 |
-
print('-----*-----Adding service categories-----*-----')
|
| 198 |
-
merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 199 |
-
merge_types(items, products)
|
| 200 |
-
merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 201 |
-
merge_types(products, products)
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
print('-----*-----Name trimming-----*-----')
|
| 205 |
-
item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
|
| 206 |
-
#items['name']=items['id'].replace(item_timed_names)
|
| 207 |
-
items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
|
| 208 |
-
items['gb']=gb
|
| 209 |
-
items['sour']=sour
|
| 210 |
-
items['sour']=items['sour'].replace(self.sour_dict)
|
| 211 |
-
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 212 |
-
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 213 |
-
products['gb']=gb
|
| 214 |
-
products['sour']=sour
|
| 215 |
-
products['sour']=products['sour'].replace(self.sour_dict)
|
| 216 |
-
|
| 217 |
-
print('-----*-----Replacing product types-----*-----')
|
| 218 |
-
products['type']=products['type'].replace(self.type_dict)
|
| 219 |
-
|
| 220 |
-
return items, products
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
from preprocess.utils.items.attrs import *
|
| 4 |
+
from preprocess.utils.common.extracters import *
|
| 5 |
+
from preprocess.utils.common.brand_matching import *
|
| 6 |
+
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
+
from preprocess.utils.common.utils import *
|
| 8 |
+
from preprocess.utils.common.top_inserts import *
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Preprocessor():
|
| 14 |
+
|
| 15 |
+
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
+
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
+
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 18 |
+
|
| 19 |
+
self.long_types_list=long_types_list
|
| 20 |
+
self.short_types_list=short_types_list
|
| 21 |
+
self.sour=sour_list
|
| 22 |
+
self.type_wine=type_wine
|
| 23 |
+
self.gbs=gbs
|
| 24 |
+
self.colors_ft=colors_for_trim
|
| 25 |
+
self.grapes=grapes
|
| 26 |
+
self.other_words=other_words
|
| 27 |
+
self.types_n_others=long_types_list+other_words
|
| 28 |
+
self.sour_dict=sour_merge_dict
|
| 29 |
+
self.type_dict=type_merge_dict
|
| 30 |
+
self.color_merge_dict=color_merge_dict
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def process_items(self, df):
|
| 34 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 35 |
+
#counter=0
|
| 36 |
+
for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
i=json.loads(i)
|
| 40 |
+
result['id'].append(idf)
|
| 41 |
+
if 'brand' in i.keys():
|
| 42 |
+
result['brand'].append(i['brand'])
|
| 43 |
+
else: result['brand'].append(None)
|
| 44 |
+
result['name'].append(i['name'])
|
| 45 |
+
drink_type=get_type(i, self.long_types_list)
|
| 46 |
+
if drink_type is None:
|
| 47 |
+
drink_type=check_spark(i)
|
| 48 |
+
if drink_type is None:
|
| 49 |
+
drink_type=check_color_and_sour(i)
|
| 50 |
+
if drink_type is None:
|
| 51 |
+
drink_type=check_spark(i, col_name='type_wine')
|
| 52 |
+
if drink_type is None:
|
| 53 |
+
drink_type=check_color_and_sour(i, types=self.sour)
|
| 54 |
+
#if 'type' in i.keys():
|
| 55 |
+
result['type'].append(drink_type)#i['type'])
|
| 56 |
+
#else: dd['type'].append(None)
|
| 57 |
+
if 'volume' in i.keys():
|
| 58 |
+
result['volume'].append(i['volume'])
|
| 59 |
+
else:
|
| 60 |
+
vol=extract_volume_or_number(i['name'])
|
| 61 |
+
result['volume'].append(vol)
|
| 62 |
+
if 'year' in i.keys():
|
| 63 |
+
result['year'].append(i['year'])
|
| 64 |
+
else:
|
| 65 |
+
year=extract_production_year(i['name'])
|
| 66 |
+
result['year'].append(year)
|
| 67 |
+
alco=extract_alcohol_content(i['name'])
|
| 68 |
+
if 'type_wine' in i.keys():
|
| 69 |
+
result['type_wine'].append(i['type_wine'])
|
| 70 |
+
else: result['type_wine'].append(None)
|
| 71 |
+
#f alco is not None:
|
| 72 |
+
result['alco'].append(alco)
|
| 73 |
+
#else: dd['type_wine'].append(None)
|
| 74 |
+
except Exception as ex:
|
| 75 |
+
print(idf, ex)
|
| 76 |
+
return pd.DataFrame(result)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def process_products(self, products):
|
| 80 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 81 |
+
for idx, row in tqdm(products.iterrows()):
|
| 82 |
+
try:
|
| 83 |
+
result['id'].append(row['id'])
|
| 84 |
+
result['brand'].append(row['brand'])
|
| 85 |
+
result['type_wine'].append(row['category'])
|
| 86 |
+
result['type'].append(row['product_type'])
|
| 87 |
+
result['name'].append(row['name_long'])
|
| 88 |
+
vol=extract_volume_or_number(row['name'])
|
| 89 |
+
result['volume'].append(vol)
|
| 90 |
+
#year=extract_production_year(row['name'])
|
| 91 |
+
year=extract_production_year(str(row['name_postfix']))
|
| 92 |
+
result['year'].append(year)
|
| 93 |
+
#rr['year'].append(row['name_postfix'])
|
| 94 |
+
alco=extract_alcohol_content(row['name'])
|
| 95 |
+
#f alco is not None:
|
| 96 |
+
result['alco'].append(alco)
|
| 97 |
+
except Exception as ex:
|
| 98 |
+
print(ex)
|
| 99 |
+
return pd.DataFrame(result)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def prcess_text(self, text):
|
| 103 |
+
#text=''+origin
|
| 104 |
+
#text=str(split_russian_and_english(text))
|
| 105 |
+
gb=find_full_word(text, self.gbs)#get_GB(text)
|
| 106 |
+
if gb is not None:
|
| 107 |
+
text=text.replace(str(gb), '')
|
| 108 |
+
|
| 109 |
+
alcohol = extract_alcohol_content(text)
|
| 110 |
+
if alcohol is not None:
|
| 111 |
+
alco_w_comma=alcohol.replace('.', ',')
|
| 112 |
+
text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
|
| 113 |
+
volume_or_number = extract_volume_or_number(text)
|
| 114 |
+
if volume_or_number is not None:
|
| 115 |
+
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 116 |
+
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 117 |
+
test=clean_wine_name(text) #remove_l(text)
|
| 118 |
+
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 119 |
+
# else:
|
| 120 |
+
# volume_or_number=re_extract_volume(text)
|
| 121 |
+
# if volume_or_number is not None:
|
| 122 |
+
# volume_with_comma=volume_or_number.replace('.', ',')
|
| 123 |
+
# text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 124 |
+
years = extract_years(text)
|
| 125 |
+
if years is not None:
|
| 126 |
+
text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
|
| 127 |
+
production_year = extract_production_year(text)
|
| 128 |
+
if production_year is not None:
|
| 129 |
+
text=text.replace(str(production_year), '')
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
color=find_full_word(text, self.colors_ft)
|
| 133 |
+
if color is not None:
|
| 134 |
+
text=text.replace(str(color), '')
|
| 135 |
+
sour=find_full_word(text, self.sour) #get_sour(text)
|
| 136 |
+
if sour is not None:
|
| 137 |
+
text=text.replace(str(sour), '')
|
| 138 |
+
# re_extracted_volume=re_extract_volume(text)
|
| 139 |
+
# if re_extracted_volume is not None:
|
| 140 |
+
# volume_with_comma=re_extracted_volume.replace('.', ',')
|
| 141 |
+
# text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
|
| 142 |
+
|
| 143 |
+
# else:
|
| 144 |
+
# re_extracted_volume=re_extract_volume(str(volume_or_number))
|
| 145 |
+
# volume_or_number=re_extracted_volume
|
| 146 |
+
|
| 147 |
+
return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
    def process(self, products, items):
        """Run the full preprocessing pipeline over both catalogues.

        Mutates and returns copies of *items* and *products*: cleans them,
        reconciles brands between the two frames, derives service columns
        (wine colour, type, gb, sour) and trims the display names.

        :param products: products DataFrame (needs 'brand', 'type', 'name', 'id').
        :param items: items DataFrame (needs 'brand', 'type', 'name', 'id').
        :return: (items, products) — preprocessed frames.
        """

        # Per-catalogue cleaning (copies, so callers' frames are untouched).
        print('------*-----Prepare items catalogue-----*-----')
        items=self.process_items(items.copy())
        print('-----*-----Prepare products catalogue-----*-----')
        products=self.process_products(products.copy())

        # Normalise brand spelling on both sides before any matching.
        items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
        products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())

        # Map item brands onto product brands via token-split matching.
        print('-----*-----Split n match-----*-----')
        splited=split_n_match(products, items)
        items["brand"] = items["brand"].replace(splited)

        print('-----*-----Fill brands in items-----*-----')
        fill_brands_in_dataframe(products['brand'].unique(), items)

        # Brands present on only one side get a second, fuzzier matching pass.
        print('-----*-----Brand matching-----*-----')
        comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
        out_prods=list(set(prod_brand_list)-set(comp_list))
        out_items=list(set(items_brand_list)-set(comp_list))
        brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
        items["new_brand"] = items["new_brand"].replace(brand_map_improved)

        items['type']=items['type'].replace(self.type_dict)

        # Two unwrap passes: the second catches mappings only exposed by the
        # first pass's replacements.
        print('-----*-----Unwrap brend cats step 1-----*-----')
        unwrap_b_match=unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        print('-----*-----Unwrap brend cats step 2-----*-----')
        unwrap_b_match=unwrap_brands(products)
        items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
        products["brand"] = products["brand"].replace(unwrap_b_match)

        # For items still without a brand, scan their names for known product
        # brands (only brands longer than 3 chars to avoid false hits).
        print('-----*-----Finding brands in names-----*-----')
        items['new_brand']=items['new_brand'].replace('none', None)
        i_brands=items[items['new_brand'].isna()]['name'].values
        p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
        new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
        items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)

        print('-----*-----Top inserts-----*-----')
        process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
                                self.grapes, self.other_words)

        # Derive service columns on both frames. NOTE(review):
        # merge_types(products, products) passes the same frame twice —
        # presumably intentional (products supply the type vocabulary);
        # confirm before changing.
        print('-----*-----Adding service categories-----*-----')
        merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(items, products)
        merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
        merge_types(products, products)

        # Strip extracted attributes (volume, year, colour, ...) from names
        # and record the per-row 'gb'/'sour' values alongside.
        print('-----*-----Name trimming-----*-----')
        item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
        #items['name']=items['id'].replace(item_timed_names)
        items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
        items['gb']=gb
        items['sour']=sour
        items['sour']=items['sour'].replace(self.sour_dict)
        products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
        products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
        products['gb']=gb
        products['sour']=sour
        products['sour']=products['sour'].replace(self.sour_dict)

        print('-----*-----Replacing product types-----*-----')
        products['type']=products['type'].replace(self.type_dict)

        return items, products
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
preprocess/utils/common/utils.py
CHANGED
|
@@ -1,138 +1,150 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def
|
| 26 |
-
"
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
"""
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
result.append(
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
return result, gbs, sours
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
|
| 3 |
+
from tqdm import tqdm
|
| 4 |
+
|
| 5 |
+
'''def get_delimiter(file_path):
|
| 6 |
+
with open(file_path, 'r') as f:
|
| 7 |
+
sample = f.read(1024) # читаем часть файла для анализа
|
| 8 |
+
dialect = csv.Sniffer().sniff(sample)
|
| 9 |
+
return dialect.delimiter'''
|
| 10 |
+
|
| 11 |
+
def get_delimiter(file_path):
    """Detect the column delimiter of a CSV-like file from its header line.

    Candidates are tried in a fixed priority order: ',', ';', tab, '|'.

    :param file_path: path of the file to inspect.
    :return: the first candidate found in the first line.
    :raises ValueError: when none of the candidates occurs in the header.
    """
    with open(file_path, 'r', encoding="utf-8") as handle:
        header = handle.readline()
        for candidate in (',', ';', '\t', '|'):
            if candidate in header:
                return candidate
        raise ValueError(None, "Error parsing CSV file. Cannot detect delimiter")
|
| 24 |
+
|
| 25 |
+
def remove_quotes(text):
    """Return *text* with every double and single quote character removed."""
    return text.replace('"', '').replace("'", '')
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def remove_l(text):
    """Drop the standalone litre marker 'л' from *text* (case-insensitive).

    Only whole-word occurrences are removed; any double spaces produced by
    the removal are collapsed and the result is stripped.
    """
    without_marker = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)
    collapsed = re.sub(r'\s{2,}', ' ', without_marker)
    return collapsed.strip()
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def clean_wine_name(name):
    """Strip a lone one-letter word (Latin or Cyrillic) at the end of *name*.

    For example "токай л" becomes "токай". Single letters that are part of
    longer words are untouched because of the word boundaries.
    """
    # \s+        leading whitespace before the stray letter
    # \b...\b    exactly one Latin or Cyrillic letter as its own word
    # \s*$       optional trailing whitespace up to end of string
    trailing_letter = r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$'
    return re.sub(trailing_letter, '', name)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def find_full_word(text, word_list):
    """Return the first entry of *word_list* occurring in *text* as a whole word.

    Matching is case-insensitive and word-bounded; returns None when nothing
    from the list is found.
    """
    return next(
        (
            candidate
            for candidate in word_list
            if re.search(r'\b' + re.escape(candidate) + r'\b', text, re.IGNORECASE)
        ),
        None,
    )
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """Derive a 'new_type_wine' column for *items*.

    For every row a colour word is looked up first in 'type_wine' (when that
    value is not None) and, failing that, in 'name'. Rows where neither
    lookup hits get None. Any per-row error is printed and the row also gets
    None. The finished column is normalised through *color_merge_dict*.

    :param items: DataFrame with 'type_wine' and 'name' columns (mutated).
    :param colors: list of colour words for find_full_word.
    :param color_merge_dict: mapping applied to the new column via replace().
    """
    detected = []
    for _, row in tqdm(items.iterrows()):
        try:
            color = None
            if row['type_wine'] is not None:
                color = find_full_word(row['type_wine'], colors)
            if color is None:
                color = find_full_word(row['name'], colors)
            detected.append(color)
        except Exception as ex:
            print(ex)
            detected.append(None)

    items['new_type_wine'] = detected
    items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def merge_types(items, products):
    """Build a 'new_type' column for *items* from the product type vocabulary.

    The vocabulary is every (stripped, lowercased) unique product type plus
    the extra spelling 'ликёр'. Per row, priority is: a whole-word vocabulary
    hit in 'name', then one in 'type', then the raw 'type' value, else None.
    Errors are printed and yield None. Finally 'ликёр' is merged into
    'ликер' and missing values are labelled 'unmatched'.

    :param items: DataFrame with 'name' and 'type' columns (mutated).
    :param products: DataFrame whose 'type' column supplies the vocabulary.
    """
    vocabulary = [t.strip().lower() for t in products['type'].unique()]
    vocabulary.append('ликёр')

    merged = []
    for _, row in tqdm(items.iterrows()):
        try:
            from_name = find_full_word(row['name'], vocabulary)
            if from_name is not None:
                merged.append(from_name)
                continue
            raw_type = row['type']
            if raw_type is None:
                merged.append(None)
                continue
            from_type = find_full_word(raw_type, vocabulary)
            merged.append(from_type if from_type is not None else raw_type)
        except Exception as ex:
            print(ex)
            merged.append(None)

    items['new_type'] = merged
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def trim_name(text, words_to_remove):
    """Delete every whole-word occurrence of *words_to_remove* from *text*.

    Matching is case-insensitive and word-bounded, so substrings embedded in
    longer words survive. Whitespace left behind by the removals is collapsed
    to single spaces and the result is stripped.

    :param text: source string.
    :param words_to_remove: iterable of words to drop.
    :return: the cleaned string.
    """
    # One alternation of all escaped words, anchored on word boundaries.
    alternatives = '|'.join(re.escape(word) for word in words_to_remove)
    pattern = r'\b(?:' + alternatives + r')\b'

    stripped = re.sub(pattern, '', text, flags=re.IGNORECASE)
    return re.sub(r'\s+', ' ', stripped).strip()
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def name_trimmer(df, prcess_text, types_and_others):
    """Trim every 'name' in *df* and collect the extracted gb/sour values.

    Each name is first run through *prcess_text* (which strips alcohol,
    volume, year etc. and reports gb/sour), then cleaned of the words in
    *types_and_others* and of commas/periods.

    :param df: DataFrame with 'id' and 'name' columns.
    :param prcess_text: callable returning (text, alcohol, volume, years,
        production_year, gb, color, sour) for a raw name.
    :param types_and_others: word list removed from every name.
    :return: ({row id: trimmed lowercase name}, [gb per row], [sour per row]).
    """
    trimmed = {}
    gbs = []
    sours = []
    for _, row in tqdm(df.iterrows()):
        text, _alco, _vol, _years, _prod_year, gb, _color, sour = prcess_text(str(row['name']))
        cleaned = trim_name(text, types_and_others).replace(',', '').replace('.', '')
        trimmed[row['id']] = cleaned.lower().strip()

        gbs.append(gb)
        sours.append(sour)
    return trimmed, gbs, sours
|
processor/matching.py
CHANGED
|
@@ -1,159 +1,159 @@
|
|
| 1 |
-
from tqdm import tqdm
|
| 2 |
-
from transliterate import translit, detect_language
|
| 3 |
-
import pandas as pd
|
| 4 |
-
from rapidfuzz import fuzz, process
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
def normalize_name(name):
|
| 8 |
-
"""
|
| 9 |
-
Нормализует строку: если обнаруживается русский язык, транслитерирует её в латиницу,
|
| 10 |
-
приводит к нижнему регистру.
|
| 11 |
-
"""
|
| 12 |
-
try:
|
| 13 |
-
if detect_language(name) == 'ru':
|
| 14 |
-
return translit(name, 'ru', reversed=True).lower()
|
| 15 |
-
except Exception:
|
| 16 |
-
pass
|
| 17 |
-
return name.lower()
|
| 18 |
-
|
| 19 |
-
def prepare_groups_with_ids(items_df):
|
| 20 |
-
"""
|
| 21 |
-
Предварительная группировка данных из items по (new_brand, type, volume, new_type_wine, sour)
|
| 22 |
-
с учетом нормализованного названия.
|
| 23 |
-
|
| 24 |
-
Добавляем столбец 'norm_name', чтобы нормализовать значение name один раз заранее.
|
| 25 |
-
|
| 26 |
-
:param items_df: DataFrame с колонками 'new_brand', 'type', 'name', 'id', 'volume', 'new_type_wine', 'sour'.
|
| 27 |
-
:return: Словарь {(new_brand, type, volume, new_type_wine, sour): [(id, name, norm_name, volume, new_type_wine, sour)]}.
|
| 28 |
-
"""
|
| 29 |
-
items_df = items_df.copy()
|
| 30 |
-
items_df['norm_name'] = items_df['name'].apply(normalize_name)
|
| 31 |
-
|
| 32 |
-
grouped = items_df.groupby(['new_brand', 'type', 'volume', 'new_type_wine', 'sour']).apply(
|
| 33 |
-
lambda x: list(zip(x['id'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 34 |
-
).to_dict()
|
| 35 |
-
return grouped
|
| 36 |
-
|
| 37 |
-
def prepare_groups_by_alternative_keys(items_df):
|
| 38 |
-
"""
|
| 39 |
-
Группировка данных из items по (new_type_wine, new_type, volume, sour) с сохранением id, new_brand,
|
| 40 |
-
оригинального и нормализованного имени.
|
| 41 |
-
|
| 42 |
-
:param items_df: DataFrame с колонками 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'id', 'sour'.
|
| 43 |
-
:return: Словарь {(new_type_wine, new_type, volume, sour): [(id, new_brand, name, norm_name, volume, new_type_wine, sour)]}.
|
| 44 |
-
"""
|
| 45 |
-
items_df = items_df.copy()
|
| 46 |
-
items_df['norm_name'] = items_df['name'].apply(normalize_name)
|
| 47 |
-
|
| 48 |
-
grouped = items_df.groupby(['new_type_wine', 'new_type', 'volume', 'sour']).apply(
|
| 49 |
-
lambda x: list(zip(x['id'], x['new_brand'], x['name'], x['norm_name'], x['volume'], x['new_type_wine'], x['sour'], x['year']))
|
| 50 |
-
).to_dict()
|
| 51 |
-
return grouped
|
| 52 |
-
|
| 53 |
-
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
|
| 54 |
-
"""
|
| 55 |
-
Поиск совпадений с сохранением id найденных итемов, используя заранее подготовленные
|
| 56 |
-
нормализованные группы.
|
| 57 |
-
|
| 58 |
-
Производится два прохода:
|
| 59 |
-
- Первый: поиск по группам (brand, type, volume, new_type_wine, sour);
|
| 60 |
-
- Второй: для продуктов без совпадения ищем по альтернативным группам (new_type_wine, new_type, volume, sour),
|
| 61 |
-
исключая итемы с исходным брендом.
|
| 62 |
-
|
| 63 |
-
Сравнение производится по столбцу norm_name, а для вывода используется оригинальное name.
|
| 64 |
-
|
| 65 |
-
:param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
|
| 66 |
-
:param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
|
| 67 |
-
:param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
|
| 68 |
-
:param name_threshold: Порог сходства для fuzzy matching.
|
| 69 |
-
:return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
|
| 70 |
-
"""
|
| 71 |
-
results = []
|
| 72 |
-
no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
|
| 73 |
-
|
| 74 |
-
# Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
|
| 75 |
-
for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
|
| 76 |
-
product_brand = product['brand']
|
| 77 |
-
product_type = product['type']
|
| 78 |
-
product_name = product['name']
|
| 79 |
-
product_volume = product['volume']
|
| 80 |
-
product_type_wine = product['new_type_wine']
|
| 81 |
-
product_sour = product['sour']
|
| 82 |
-
|
| 83 |
-
key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
|
| 84 |
-
items_data = items_groups.get(key, [])
|
| 85 |
-
if items_data:
|
| 86 |
-
# Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
|
| 87 |
-
items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
|
| 88 |
-
else:
|
| 89 |
-
items_ids, items_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [], [],[])
|
| 90 |
-
|
| 91 |
-
norm_product_name = normalize_name(product_name)
|
| 92 |
-
matches = process.extract(
|
| 93 |
-
norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
|
| 94 |
-
)
|
| 95 |
-
matched_items = [
|
| 96 |
-
{
|
| 97 |
-
'item_id': items_ids[idx_candidate],
|
| 98 |
-
'item_name': items_names[idx_candidate],
|
| 99 |
-
'score': score,
|
| 100 |
-
'volume': items_volumes[idx_candidate],
|
| 101 |
-
'color': item_type_wine[idx_candidate],
|
| 102 |
-
'sour': items_sour[idx_candidate],
|
| 103 |
-
'year': items_year[idx_candidate],
|
| 104 |
-
}
|
| 105 |
-
for match, score, idx_candidate in matches
|
| 106 |
-
]
|
| 107 |
-
|
| 108 |
-
if not matched_items:
|
| 109 |
-
no_match_products.append((idx, product))
|
| 110 |
-
|
| 111 |
-
results.append({
|
| 112 |
-
'product_id': product['id'],
|
| 113 |
-
'matched_items': matched_items,
|
| 114 |
-
'alternative': [] # Заполняется во втором проходе
|
| 115 |
-
})
|
| 116 |
-
|
| 117 |
-
# Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
|
| 118 |
-
groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
|
| 119 |
-
|
| 120 |
-
# Второй проход: для продуктов без совпадений ищем по альтернативным группам
|
| 121 |
-
for idx, product in tqdm(no_match_products):
|
| 122 |
-
product_brand = product['brand']
|
| 123 |
-
product_type_wine = product['new_type_wine']
|
| 124 |
-
product_type = product['new_type']
|
| 125 |
-
product_volume = product['volume']
|
| 126 |
-
product_name = product['name']
|
| 127 |
-
product_sour = product['sour']
|
| 128 |
-
|
| 129 |
-
alt_key = (product_type_wine, product_type, product_volume, product_sour)
|
| 130 |
-
type_items = groups_by_alternative_keys.get(alt_key, [])
|
| 131 |
-
# Фильтруем, исключая итемы с исходным брендом
|
| 132 |
-
filtered_items = [item for item in type_items if item[1] != product_brand]
|
| 133 |
-
if filtered_items:
|
| 134 |
-
alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
|
| 135 |
-
else:
|
| 136 |
-
alt_ids, alt_brands, alt_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[])
|
| 137 |
-
|
| 138 |
-
norm_product_name = normalize_name(product_name)
|
| 139 |
-
alt_matches = process.extract(
|
| 140 |
-
norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold
|
| 141 |
-
)
|
| 142 |
-
alt_matched_items = [
|
| 143 |
-
{
|
| 144 |
-
'item_id': alt_ids[idx_candidate],
|
| 145 |
-
'item_name': alt_names[idx_candidate],
|
| 146 |
-
'score': score,
|
| 147 |
-
'volume': alt_volumes[idx_candidate],
|
| 148 |
-
'color': alt_type_wine[idx_candidate],
|
| 149 |
-
'sour': alt_sour[idx_candidate],
|
| 150 |
-
'year': alt_year[idx_candidate],
|
| 151 |
-
}
|
| 152 |
-
for match, score, idx_candidate in alt_matches
|
| 153 |
-
]
|
| 154 |
-
|
| 155 |
-
results[idx]['alternative'] = alt_matched_items
|
| 156 |
-
|
| 157 |
-
results_df = pd.DataFrame(results)
|
| 158 |
-
merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
|
| 159 |
return merged_df
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
from transliterate import translit, detect_language
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def normalize_name(name):
    """Lower-case *name*; Russian text is first transliterated to Latin.

    Detection/transliteration failures are swallowed and the plain
    lower-cased input is returned instead.
    """
    try:
        if detect_language(name) == 'ru':
            name = translit(name, 'ru', reversed=True)
    except Exception:
        pass
    return name.lower()
|
| 18 |
+
|
| 19 |
+
def prepare_groups_with_ids(items_df):
    """Group items by (new_brand, type, volume, new_type_wine, sour).

    A 'norm_name' column is computed once up front so the fuzzy matcher does
    not re-normalise names on every comparison. The input frame is copied,
    not mutated.

    :param items_df: DataFrame with 'new_brand', 'type', 'name', 'id',
        'volume', 'new_type_wine', 'sour' and 'year' columns.
    :return: {(new_brand, type, volume, new_type_wine, sour):
              [(id, name, norm_name, volume, new_type_wine, sour, year), ...]}
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _pack(group):
        # Per-group list of per-row tuples, in a fixed column order.
        return list(zip(group['id'], group['name'], group['norm_name'],
                        group['volume'], group['new_type_wine'],
                        group['sour'], group['year']))

    return frame.groupby(
        ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    ).apply(_pack).to_dict()
|
| 36 |
+
|
| 37 |
+
def prepare_groups_by_alternative_keys(items_df):
    """Group items by (new_type_wine, new_type, volume, sour).

    This is the fallback grouping used when the brand-keyed lookup finds
    nothing; it additionally carries 'new_brand' so callers can exclude a
    product's own brand. The input frame is copied, not mutated.

    :param items_df: DataFrame with 'new_brand', 'new_type_wine', 'new_type',
        'volume', 'name', 'id', 'sour' and 'year' columns.
    :return: {(new_type_wine, new_type, volume, sour):
              [(id, new_brand, name, norm_name, volume, new_type_wine, sour, year), ...]}
    """
    frame = items_df.copy()
    frame['norm_name'] = frame['name'].apply(normalize_name)

    def _pack(group):
        # Per-group list of per-row tuples, in a fixed column order.
        return list(zip(group['id'], group['new_brand'], group['name'],
                        group['norm_name'], group['volume'],
                        group['new_type_wine'], group['sour'], group['year']))

    return frame.groupby(
        ['new_type_wine', 'new_type', 'volume', 'sour']
    ).apply(_pack).to_dict()
|
| 52 |
+
|
| 53 |
+
def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """Match every product against pre-grouped items, keeping item ids.

    Two passes are made:
      1. look up the product's exact group (brand, type, volume,
         new_type_wine, sour) and fuzzy-match normalised names inside it;
      2. for products with no first-pass hit, retry against the alternative
         grouping (new_type_wine, new_type, volume, sour), excluding items
         with the product's own brand.

    Comparison uses pre-computed 'norm_name' values; the original 'name' is
    what appears in the output dicts.

    :param products_df: DataFrame with 'id', 'brand', 'type', 'name',
        'volume', 'new_type_wine', 'sour', 'new_type' columns.
    :param items_groups: mapping built by prepare_groups_with_ids.
    :param items_df: items DataFrame used to build the fallback grouping.
    :param name_threshold: rapidfuzz score cutoff for a name match.
    :return: products_df merged with 'matched_items' and 'alternative'
        columns (lists of match dicts).
    """

    def _build_matches(matches, ids, names, volumes, colors, sours, years):
        # Convert rapidfuzz (match, score, index) triples into result dicts.
        return [
            {
                'item_id': ids[i],
                'item_name': names[i],
                'score': score,
                'volume': volumes[i],
                'color': colors[i],
                'sour': sours[i],
                'year': years[i],
            }
            for _match, score, i in matches
        ]

    results = []
    no_match_products = []  # (position in `results`, product row) pairs

    # First pass: exact (brand, type, volume, new_type_wine, sour) groups.
    for _, product in tqdm(products_df.iterrows(), total=len(products_df)):
        key = (product['brand'], product['type'], product['volume'],
               product['new_type_wine'], product['sour'])
        items_data = items_groups.get(key, [])
        if items_data:
            # Unpack: id, original name, normalised name, volume, colour, sour, year
            ids, names, norm_names, volumes, colors, sours, years = zip(*items_data)
        else:
            ids = names = norm_names = volumes = colors = sours = years = ()

        matches = process.extract(
            normalize_name(product['name']), list(norm_names),
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        matched_items = _build_matches(matches, ids, names, volumes, colors, sours, years)

        if not matched_items:
            # BUG FIX: the original stored the DataFrame index label from
            # iterrows() and later used it as a *list* position into
            # `results`, which is wrong (or raises) for any frame without a
            # default RangeIndex. Store the actual list position instead.
            no_match_products.append((len(results), product))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': [],  # filled during the second pass
        })

    # Fallback grouping on (new_type_wine, new_type, volume, sour).
    groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

    # Second pass: products without matches, excluding their own brand.
    for pos, product in tqdm(no_match_products):
        alt_key = (product['new_type_wine'], product['new_type'],
                   product['volume'], product['sour'])
        candidates = [item for item in groups_by_alternative_keys.get(alt_key, [])
                      if item[1] != product['brand']]
        if candidates:
            ids, _brands, names, norm_names, volumes, colors, sours, years = zip(*candidates)
        else:
            ids = names = norm_names = volumes = colors = sours = years = ()

        alt_matches = process.extract(
            normalize_name(product['name']), list(norm_names),
            scorer=fuzz.ratio, score_cutoff=name_threshold
        )
        results[pos]['alternative'] = _build_matches(
            alt_matches, ids, names, volumes, colors, sours, years)

    results_df = pd.DataFrame(results)
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df
|
processor/processor.py
CHANGED
|
@@ -1,32 +1,28 @@
|
|
| 1 |
-
from preprocess.preprocess import Preprocessor
|
| 2 |
-
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Processor():
|
| 6 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 7 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 8 |
-
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 9 |
-
|
| 10 |
-
self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
|
| 11 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 12 |
-
sour_merge_dict, type_merge_dict, color_merge_dict)
|
| 13 |
-
|
| 14 |
-
def process(self, products, items, is_items_first=False, th=65):
|
| 15 |
-
items, products=self.preprocessor.process(products, items)
|
| 16 |
-
|
| 17 |
-
print('-----*-----Matching-----*-----')
|
| 18 |
-
|
| 19 |
-
if is_items_first:
|
| 20 |
-
products['new_brand']=products['brand']
|
| 21 |
-
items['brand']=items['new_brand']
|
| 22 |
-
products_groups = prepare_groups_with_ids(products)
|
| 23 |
-
res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
|
| 24 |
-
else:
|
| 25 |
-
items_groups = prepare_groups_with_ids(items)
|
| 26 |
-
res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)
|
| 27 |
-
|
| 28 |
-
return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
|
|
|
| 1 |
+
from preprocess.preprocess import Preprocessor
|
| 2 |
+
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Processor():
    """End-to-end pipeline: preprocess both catalogues, then fuzzy-match them."""

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):

        # All vocabulary lists and merge dictionaries are forwarded verbatim
        # to the Preprocessor, which owns the actual cleaning logic.
        self.preprocessor=Preprocessor(long_types_list, short_types_list, sour_list,
                                       type_wine, gbs, colors_for_trim, grapes, other_words,
                                       sour_merge_dict, type_merge_dict, color_merge_dict)

    def process(self, products, items, is_items_first=False, th=65):
        """Preprocess and match the two catalogues.

        :param products: products DataFrame.
        :param items: items DataFrame.
        :param is_items_first: when True the roles are swapped — items are
            matched against product groups instead of the default direction.
        :param th: fuzzy-matching score threshold passed to the matcher.
        :return: (match result frame with service columns dropped,
                  preprocessed items, preprocessed products).
        """
        items, products=self.preprocessor.process(products, items)

        print('-----*-----Matching-----*-----')

        if is_items_first:
            # Align brand column names so both frames expose the keys the
            # grouping code expects ('brand' on the query side,
            # 'new_brand' on the grouped side).
            products['new_brand']=products['brand']
            items['brand']=items['new_brand']
            products_groups = prepare_groups_with_ids(products)
            res=new_find_matches_with_ids(items, products_groups, products, name_threshold=th)
        else:
            items_groups = prepare_groups_with_ids(items)
            res=new_find_matches_with_ids(products, items_groups, items, name_threshold=th)

        # Drop intermediate service columns before returning the result.
        return res.drop(['type','type_wine','alco','gb'], axis=1), items, products
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
python-Levenshtein
|
| 2 |
-
transliterate
|
| 3 |
-
rapidfuzz
|
| 4 |
-
pyahocorasick
|
| 5 |
-
unidecode
|
| 6 |
-
pqdm
|
| 7 |
tqdm
|
|
|
|
| 1 |
+
python-Levenshtein
|
| 2 |
+
transliterate
|
| 3 |
+
rapidfuzz
|
| 4 |
+
pyahocorasick
|
| 5 |
+
unidecode
|
| 6 |
+
pqdm
|
| 7 |
tqdm
|
search/search_by_id.py
CHANGED
|
@@ -1,24 +1,53 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import ast
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class Searcher():
|
| 7 |
-
def __init__(self):
|
| 8 |
-
self.df = None
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
return result
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import ast
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Searcher():
    """Looks up match results for a product id in a saved matching table."""

    def __init__(self):
        # Result DataFrame; attached lazily via set_df().
        self.df = None

    def set_df(self, df):
        """Attach a result frame, parsing the stringified match columns.

        'matched_items' and 'alternative' are stored in the CSV as
        Python-literal strings; convert them back to lists of dicts.
        Parsing errors are printed and leave the columns untouched.
        """
        self.df = df
        try:
            self.df['matched_items'] = self.df['matched_items'].apply(
                lambda x: ast.literal_eval(x) if pd.notna(x) else x)
            self.df['alternative'] = self.df['alternative'].apply(
                lambda x: ast.literal_eval(x) if pd.notna(x) else x)
        except Exception as e:
            print(e)

    def _matched_items_frame(self, query):
        """Return the 'matched_items' list for id *query* as a DataFrame.

        BUG FIX: this was originally a second ``def search(self, query)``
        that was silently shadowed by the two-argument ``search`` below, so
        ``search_in_uploaded_file`` crashed with a TypeError when calling it.
        Renamed to a private helper and wired back in.
        """
        data = json.loads(json.dumps(self.df[self.df['id'] == query]['matched_items'].values[0]))
        return pd.DataFrame(data)

    def search(self, resultfn, query):
        """Load the TSV at *resultfn* and return matches for id *query*.

        Falls back to the 'alternative' matches when 'matched_items' is
        empty for that row.

        :return: (DataFrame of matches — possibly empty,
                  True when the matches came from 'alternative').
        """
        is_alternative_items = False
        df_matched_items = pd.DataFrame()

        matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)

        items = self.df[self.df['id'] == query]
        matched_items = items['matched_items']
        if (len(matched_items) != 0) and (len(matched_items.values[0])):
            data = json.loads(json.dumps(matched_items.values[0]))
            df_matched_items = pd.DataFrame(data)
            is_alternative_items = False
        else:
            alter_items = items['alternative']

            if (len(alter_items) != 0) and (len(alter_items.values[0])):
                data = json.loads(json.dumps(alter_items.values[0]))
                df_matched_items = pd.DataFrame(data)
                is_alternative_items = True

        return (df_matched_items, is_alternative_items)

    def search_in_uploaded_file(self, path, query):
        """Load the TSV at *path* and return only the matched-items frame.

        Kept for callers that expect the original single-frame return shape
        (no alternative fallback, no flag).
        """
        matching_result = pd.read_csv(path, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)
        result = self._matched_items_frame(query)
        return result
|
tmp/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/service/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/utils.py
CHANGED
|
@@ -1,37 +1,48 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
-
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
+
import shutil
|
| 4 |
+
import os
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def update_products_csv(new_csv_path, prods_file, overwrite_existing):
    """Merge or replace the master products CSV with an uploaded one.

    Args:
        new_csv_path: path of the freshly uploaded CSV.
        prods_file: path of the persistent master products CSV.
        overwrite_existing: when True (or when no master file exists yet)
            the upload replaces the master outright; otherwise the two are
            merged, with uploaded rows winning on duplicate 'id'.

    Returns:
        prods_file — the path of the updated master CSV.

    Raises:
        Exception: if the uploaded CSV looks like an *items* file
            (i.e. it contains an 'attrs' column).
    """
    # Read the upload once, up front, with its own sniffed delimiter.
    new_sep = get_delimiter(new_csv_path)
    new_csv = pd.read_csv(new_csv_path, sep=new_sep, on_bad_lines="warn")

    # CONSISTENCY FIX: this validation previously ran only on the merge
    # path, so an invalid file could silently replace the master on the
    # overwrite path. 'attrs' is an items-file column; its presence means
    # the wrong file was uploaded.
    if 'attrs' in new_csv.columns.values:
        raise Exception("Uploaded Products CSV does not seem to be valid")

    if os.path.isfile(prods_file) and not overwrite_existing:
        main_sep = get_delimiter(prods_file)
        main_csv = pd.read_csv(prods_file, sep=main_sep, on_bad_lines="warn")

        # keep='last' makes rows from the upload win on duplicate ids.
        result = pd.concat([main_csv, new_csv]).drop_duplicates(subset='id', keep='last').reset_index(drop=True)
        result.to_csv(prods_file, sep=main_sep, index=False)
    else:
        # No master yet, or caller asked to overwrite: write the upload
        # as-is, preserving its own delimiter.
        new_csv.to_csv(prods_file, sep=new_sep, index=False)

    return prods_file
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
# NOTE(review): dead code preserved as a module-level string literal (a
# harmless no-op statement). ui/gradio_ui.py still does
# `from tmp.utils import remover, update_products_csv`, but `remover` only
# exists inside this commented-out block — that import will fail at module
# load; confirm and either restore `remover` or drop it from the import.
'''def is_csv_exist(path):
    file_list=glob(path+'/*.csv')
    if len(file_list)>0:
        return file_list[0]
    else:
        None


def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
    main_path=is_csv_exist(main_dir)
    if main_path==None:
        new_path = shutil.move(new_path, main_dir)
        return new_path
    else:
        update_products_csv(main_path, new_path)
        return main_path

def remover(data_path):
    #path=is_csv_exist('/home/user/app/tmp/prod.csv')
    #if path!=None:
    os.remove(os.getcwd()+'/tmp/prod.csv')
    shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''
|
ui/gradio_ui.py
CHANGED
|
@@ -1,121 +1,170 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
import
|
| 4 |
-
|
| 5 |
-
from
|
| 6 |
-
from
|
| 7 |
-
import os
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
self.
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
#
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
demo.launch()
|
|
|
|
| 1 |
+
from argparse import ArgumentError
|
| 2 |
+
|
| 3 |
+
import gradio as gr
|
| 4 |
+
import pandas as pd
|
| 5 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 6 |
+
from tmp.utils import remover, update_products_csv
|
| 7 |
+
import os
|
| 8 |
+
import datetime, time
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class GradioUI():
    """Gradio front-end for the product/items matching pipeline.

    Wires four tabs (upload products, process supplier catalog, search in
    the last result, search in an uploaded result) to a `processor`
    (matching pipeline) and a `searcher` (result lookup). All persisted
    files live under `data_path`.
    """

    def __init__(self, processor, searcher, data_path):
        self.processor=processor
        self.searcher=searcher
        self.data_path = data_path

    def get_data_dir(self):
        # Base directory for all persisted files.
        return self.data_path

    def get_products_dir(self):
        return os.path.join(self.get_data_dir(), "products")

    def get_items_dir(self):
        return os.path.join(self.get_data_dir(), "items")

    def get_results_dir(self):
        return os.path.join(self.get_data_dir(), "results")

    def get_products_file_date(self):
        """Return the master products.csv mtime as a human-readable string,
        or a (Russian) "file not found" message."""
        fullfn = os.path.join(self.data_path, "products", "products.csv")
        if not os.path.isfile(fullfn):
            return "Файл Products не найден"

        stinfo = os.stat(fullfn)
        return time.ctime(stinfo.st_mtime)

    def upload_products_file(self, prods_file, overwrite_existing):
        """Gradio callback: merge/replace the master products.csv with the
        uploaded file; failures surface as a gr.Error popup."""
        try:
            if not os.path.exists(self.get_products_dir()):
                os.makedirs(self.get_products_dir())

            fullfn = os.path.join(self.get_products_dir(), "products.csv")

            # prods_file is None when the user clicks without selecting a file.
            if prods_file != None:
                update_products_csv(prods_file, fullfn, overwrite_existing)

            gr.Info("Файл Products успешно загружен")
        except Exception as ex:
            raise gr.Error("An error occurred 💥!" + "\n\n" + str(ex), duration=5)

    def process_items(self, items_file, is_items_first, threshold): #, q_id):
        """Gradio callback: match the uploaded items CSV against the stored
        products.csv and return the path of the result CSV for download.

        Also primes self.searcher with the fresh result so the "search in
        processed csv" tab works immediately.
        """
        try:
            prods_file = os.path.join(self.get_products_dir(), "products.csv")
            if not os.path.isfile(prods_file):
                raise Exception("Файл Products не найден")

            # Nothing happens (returns None) if no items file was selected.
            if items_file != None:
                items_delimiter=get_delimiter(items_file)
                print('items delimiter: '+items_delimiter)
                row_items=pd.read_csv(items_file, sep=items_delimiter, on_bad_lines='skip')
                # Items files must carry an 'attrs' column; reject otherwise.
                if not 'attrs' in row_items.columns.values:
                    raise Exception("Uploaded Items CSV does not seem to be valid")

                products_delimiter=get_delimiter(prods_file)
                print('products delimiter: '+products_delimiter)
                row_products=pd.read_csv(prods_file, sep=products_delimiter, on_bad_lines='skip')

                # if q_id in row_products['id'].unique():
                #    row_products=row_products[row_products['id']==q_id]

                #print("product id: " + str(q_id))

                df, items, products = self.processor.process(row_products, row_items, is_items_first, threshold)

                # Copy so later in-place parsing in the searcher cannot
                # affect the frame written to disk below.
                self.searcher.set_df(df.copy())
                #with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                #    output_csv = tmp.name
                results_path = self.get_results_dir()
                if not os.path.exists(results_path):
                    os.makedirs(results_path)

                # Result name encodes threshold and a timestamp, e.g. m1-50-240131-120000.csv
                output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
                output_csv = os.path.join(results_path, output_csv)
                df.to_csv(output_csv, sep='\t', index=False)
                return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5)

    def on_page_load(self, r: gr.Request):
        # Refresh both Markdown widgets with the products-file date; note
        # this replaces the first tab's heading text with the date line.
        m_time = self.get_products_file_date()
        return [f"Дата последнего обновления файла Products: {m_time}", f"Дата последнего обновления файла Products: {m_time}"]


    def run_ui(self):
        """Build the Blocks layout, wire callbacks and launch the app."""
        with gr.Blocks() as demo:
            tabs = gr.Tabs()
            with tabs:

                # with gr.Row():
                #    file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                #    process_button = gr.Button("Обновить")

                # Tab 1: upload/refresh the master products file.
                with gr.TabItem("Загрузка файла Products"):
                    prod_file_info1 = gr.Markdown("## Загрузка файла Products")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
                    upload_button = gr.Button("Загрузить файл")
                    upload_button.click(
                        fn=self.upload_products_file,
                        inputs=[file_input1, toggle_input],
                        #outputs=output_file
                    )

                # Tab 2: process an uploaded supplier catalog (CSV files).
                with gr.TabItem("Обработка каталога поставщика"):
                    gr.Markdown("## Обработка каталога поставщика")

                    m_time = self.get_products_file_date()
                    prod_file_info2 = gr.Markdown(f"Дата последнего обновления файла Products: {m_time}")
                    with gr.Row():
                        #file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                        file_items = gr.File(label="Items", type="filepath", file_types=[".csv"])
                        #search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    with gr.Row():
                        # NOTE(review): `toggle_input` reuses the name from tab 1;
                        # harmless since each is bound before reassignment.
                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
                        threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Загрузить файл с каталогом и сравнить")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_items,
                        inputs=[file_items, toggle_input, threshold_input], #, search_number],
                        outputs=output_file
                    )

                # Tab 3: search inside the most recently processed result.
                with gr.TabItem("Поиск в обработанном csv"):
                    gr.Markdown("## Поиск")
                    search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search,
                        inputs=[search_number],
                        outputs=search_table
                    )

                # Tab 4: upload a previously saved result and search inside it.
                with gr.TabItem("Загрузка результат и поиск в нем"):
                    gr.Markdown("## Поиск")
                    with gr.Row():
                        input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                        search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                    search_button = gr.Button("Найти")
                    search_table = gr.Dataframe(label="Результаты поиска")
                    search_button.click(
                        fn=self.searcher.search_in_uploaded_file,
                        inputs=[input_path, search_number],
                        outputs=search_table
                    )

                #with gr.TabItem("Удалить сохраненные продукты"):
                #    del_button = gr.Button("Удалить")
                #    process_button.click(fn=remover)

            demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
        demo.launch()
|