Spaces:
Sleeping
Sleeping
API and Changes
#2
by j-s-v - opened
- .gitattributes +35 -35
- .gitignore +0 -3
- README.md +12 -12
- api.py +0 -205
- app.py +31 -33
- constants/constants.py +4 -31
- preprocess/preprocess.py +224 -243
- preprocess/utils/common/utils.py +137 -164
- preprocess/utils/items/attrs.py +1 -1
- processor/matching.py +158 -301
- processor/processor.py +32 -30
- requirements.txt +6 -6
- search/matching_judge.py +0 -156
- search/search_by_id.py +23 -52
- tmp/prod.csv +1 -1
- tmp/service/prod.csv +1 -1
- tmp/utils.py +37 -48
- ui/gradio_ui.py +120 -177
.gitattributes
CHANGED
|
@@ -1,35 +1,35 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
.gitignore
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
*.pyc
|
| 2 |
-
.idea/*
|
| 3 |
-
_data/*
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Product Matching
|
| 3 |
-
emoji: 🏃
|
| 4 |
-
colorFrom: gray
|
| 5 |
-
colorTo: purple
|
| 6 |
-
sdk: gradio
|
| 7 |
-
sdk_version: 5.19.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Product Matching
|
| 3 |
+
emoji: 🏃
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 5.19.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
api.py
DELETED
|
@@ -1,205 +0,0 @@
|
|
| 1 |
-
import csv
|
| 2 |
-
import json
|
| 3 |
-
import os
|
| 4 |
-
import datetime
|
| 5 |
-
|
| 6 |
-
from processor.processor import Processor
|
| 7 |
-
from constants.constants import *
|
| 8 |
-
from search.search_by_id import Searcher
|
| 9 |
-
from fastapi import FastAPI, File, UploadFile, HTTPException
|
| 10 |
-
import uvicorn
|
| 11 |
-
from pydantic import BaseModel
|
| 12 |
-
import pandas as pd
|
| 13 |
-
from tmp.utils import update_products_csv
|
| 14 |
-
from search.matching_judge import compare_matching_with_manual
|
| 15 |
-
|
| 16 |
-
'''compare_matching_with_manual("C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New5)\\products.csv",
|
| 17 |
-
"C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\ws-items-for-test.csv",
|
| 18 |
-
"C:\\Projects (Mediterra)\\!TechLead\\WineMatching\m1-50-250325-133739.csv",
|
| 19 |
-
"C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\matching-20250318.csv")'''
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
processor=Processor(LONG_TYPES_LIST,
|
| 23 |
-
SHORT_TYPES_LIST,
|
| 24 |
-
SOUR,
|
| 25 |
-
WINE_TYPES,
|
| 26 |
-
GBS,
|
| 27 |
-
COLORS_FOR_TRIM,
|
| 28 |
-
GRAPES,
|
| 29 |
-
OTHER_WORDS,
|
| 30 |
-
SOUR_MERGE_DICT,
|
| 31 |
-
TYPES_WINES_DICT,
|
| 32 |
-
COLOR_MERGE_DICT)
|
| 33 |
-
|
| 34 |
-
searcher=Searcher()
|
| 35 |
-
|
| 36 |
-
class item_by_id(BaseModel):
|
| 37 |
-
result_file: str
|
| 38 |
-
id: str
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
class match_request(BaseModel):
|
| 42 |
-
items: str
|
| 43 |
-
threshold: int
|
| 44 |
-
items_first: int
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
def get_data_dir():
|
| 48 |
-
return "/home/user/app/_data/"
|
| 49 |
-
#return "_data"
|
| 50 |
-
|
| 51 |
-
def get_products_dir():
|
| 52 |
-
return os.path.join(get_data_dir(), "products")
|
| 53 |
-
|
| 54 |
-
def get_items_dir():
|
| 55 |
-
return os.path.join(get_data_dir(), "items")
|
| 56 |
-
|
| 57 |
-
def get_results_dir():
|
| 58 |
-
return os.path.join(get_data_dir(), "results")
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
app = FastAPI()
|
| 62 |
-
|
| 63 |
-
@app.get("/api/get_result_csv")
|
| 64 |
-
async def get_result_csv():
|
| 65 |
-
results = []
|
| 66 |
-
for file in os.listdir(get_results_dir()):
|
| 67 |
-
if file.endswith(".csv"):
|
| 68 |
-
results.append(file)
|
| 69 |
-
|
| 70 |
-
results_json = json.dumps(results)
|
| 71 |
-
return results_json
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
@app.post("/api/upload_result_csv")
|
| 75 |
-
async def upload_result_csv(file: UploadFile = File(...)):
|
| 76 |
-
try:
|
| 77 |
-
contents = file.file.read()
|
| 78 |
-
|
| 79 |
-
with open(os.path.join(get_results_dir(), file.filename), 'wb') as f:
|
| 80 |
-
f.write(contents)
|
| 81 |
-
except Exception:
|
| 82 |
-
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 83 |
-
finally:
|
| 84 |
-
file.file.close()
|
| 85 |
-
|
| 86 |
-
return {"message": f"Successfully uploaded {file.filename}"}
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
@app.post("/api/upload_products_csv")
|
| 90 |
-
async def upload_products_csv(file: UploadFile, overwrite_existing: int):
|
| 91 |
-
try:
|
| 92 |
-
datadir = get_products_dir()
|
| 93 |
-
if not os.path.exists(datadir):
|
| 94 |
-
os.makedirs(datadir)
|
| 95 |
-
|
| 96 |
-
tempfile = os.path.join(datadir, "products.csv_upload")
|
| 97 |
-
|
| 98 |
-
contents = file.file.read()
|
| 99 |
-
|
| 100 |
-
with open(tempfile, 'wb') as f:
|
| 101 |
-
f.write(contents)
|
| 102 |
-
|
| 103 |
-
fullfn = os.path.join(datadir, "products.csv")
|
| 104 |
-
update_products_csv(tempfile, fullfn, overwrite_existing)
|
| 105 |
-
|
| 106 |
-
os.remove(tempfile)
|
| 107 |
-
|
| 108 |
-
except Exception:
|
| 109 |
-
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 110 |
-
finally:
|
| 111 |
-
file.file.close()
|
| 112 |
-
|
| 113 |
-
return {"message": f"Successfully uploaded {file.filename}"}
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
#@app.post("/api/upload_items_csv")
|
| 117 |
-
def upload_items_csv(file: UploadFile):
|
| 118 |
-
try:
|
| 119 |
-
itemsdir = get_items_dir()
|
| 120 |
-
|
| 121 |
-
if not os.path.exists(itemsdir):
|
| 122 |
-
os.makedirs(itemsdir)
|
| 123 |
-
|
| 124 |
-
contents = file.file.read()
|
| 125 |
-
|
| 126 |
-
fullfn = os.path.join(itemsdir, file.filename)
|
| 127 |
-
with open(fullfn, 'wb') as f:
|
| 128 |
-
f.write(contents)
|
| 129 |
-
except Exception:
|
| 130 |
-
raise HTTPException(status_code=500, detail='Something went wrong')
|
| 131 |
-
finally:
|
| 132 |
-
file.file.close()
|
| 133 |
-
|
| 134 |
-
#return {"message": f"Successfully uploaded {file.filename}"}
|
| 135 |
-
return fullfn
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
@app.get("/api/get_items_csv")
|
| 139 |
-
async def get_items_csv():
|
| 140 |
-
itemsdir = get_items_dir()
|
| 141 |
-
|
| 142 |
-
results = []
|
| 143 |
-
for file in os.listdir(itemsdir):
|
| 144 |
-
if file.endswith(".csv"):
|
| 145 |
-
results.append(file)
|
| 146 |
-
|
| 147 |
-
results_json = json.dumps(results)
|
| 148 |
-
return results_json
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
@app.post("/api/match")
|
| 152 |
-
async def match(items_file: UploadFile, threshold: int, items_first: int):
|
| 153 |
-
prods_file = os.path.join(get_products_dir(), "products.csv")
|
| 154 |
-
if not os.path.isfile(prods_file):
|
| 155 |
-
return {"Status": "Error", "ErrorDesc": "File 'Products.csv' not found"}
|
| 156 |
-
|
| 157 |
-
items_fn = upload_items_csv(items_file)
|
| 158 |
-
#if len(r.items) == 0:
|
| 159 |
-
# return {"Status": "Error", "ErrorDesc": "Items file not specified"}
|
| 160 |
-
|
| 161 |
-
if not threshold:
|
| 162 |
-
threshold = 50
|
| 163 |
-
|
| 164 |
-
#items_fn = os.path.join(get_items_dir(), r.items)
|
| 165 |
-
#if not os.path.isfile(items_fn):
|
| 166 |
-
# return {"Status": "Error", "ErrorDesc": "Items file not found"}
|
| 167 |
-
|
| 168 |
-
row_items = pd.read_csv(items_fn, sep='\t')
|
| 169 |
-
os.remove(items_fn)
|
| 170 |
-
|
| 171 |
-
row_products = pd.read_csv(prods_file, sep='\t', on_bad_lines='skip')
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
df, items, products = processor.process(row_products, row_items, items_first, threshold)
|
| 175 |
-
|
| 176 |
-
results_dir = get_results_dir()
|
| 177 |
-
if not os.path.exists(results_dir):
|
| 178 |
-
os.makedirs(results_dir)
|
| 179 |
-
|
| 180 |
-
output_csv = "m1-" + str(threshold) + "-" + datetime.datetime.now().strftime('%y%m%d-%H%M%S') + ".csv"
|
| 181 |
-
df.to_csv(os.path.join(results_dir, output_csv), sep='\t', index=False)
|
| 182 |
-
|
| 183 |
-
return {"Status": "Success", "result_file" : output_csv}
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
@app.get("/api/get_matched_by_id")
|
| 187 |
-
async def get_matched_by_id(item: item_by_id):
|
| 188 |
-
fullfn = os.path.join(get_results_dir(), item.result_file)
|
| 189 |
-
if not os.path.isfile(fullfn):
|
| 190 |
-
return {"Status": "Error", "ErrorDesc": "Specified result CSV file not found"}
|
| 191 |
-
|
| 192 |
-
(df, is_alternative) = searcher.search(fullfn, int(item.id))
|
| 193 |
-
if df.empty:
|
| 194 |
-
return {"Status": "Success", "IsAlternative": False, "Data": ""}
|
| 195 |
-
|
| 196 |
-
return {"Status": "Success", "IsAlternative": is_alternative, "Data": df.to_json(orient='records')}
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
if __name__ == "__main__":
|
| 200 |
-
uvicorn.run(
|
| 201 |
-
app,
|
| 202 |
-
host="0.0.0.0",
|
| 203 |
-
port=8000,
|
| 204 |
-
log_level="debug"
|
| 205 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,33 +1,31 @@
|
|
| 1 |
-
from processor.processor import Processor
|
| 2 |
-
from constants.constants import *
|
| 3 |
-
from ui.gradio_ui import GradioUI
|
| 4 |
-
from search.search_by_id import Searcher
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
ui
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
| 1 |
+
from processor.processor import Processor
|
| 2 |
+
from constants.constants import *
|
| 3 |
+
from ui.gradio_ui import GradioUI
|
| 4 |
+
from search.search_by_id import Searcher
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
processor=Processor(LONG_TYPES_LIST,
|
| 9 |
+
SHORT_TYPES_LIST,
|
| 10 |
+
SOUR,
|
| 11 |
+
WINE_TYPES,
|
| 12 |
+
GBS,
|
| 13 |
+
COLORS_FOR_TRIM,
|
| 14 |
+
GRAPES,
|
| 15 |
+
OTHER_WORDS,
|
| 16 |
+
SOUR_MERGE_DICT,
|
| 17 |
+
TYPES_WINES_DICT,
|
| 18 |
+
COLOR_MERGE_DICT)
|
| 19 |
+
|
| 20 |
+
searcher=Searcher()
|
| 21 |
+
|
| 22 |
+
ui=GradioUI(processor, searcher)
|
| 23 |
+
ui.run_ui()
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
|
|
|
|
|
constants/constants.py
CHANGED
|
@@ -75,9 +75,7 @@ SOUR = [
|
|
| 75 |
'п/сл',
|
| 76 |
'п/с',
|
| 77 |
'сл',
|
| 78 |
-
'сл.',
|
| 79 |
'сух',
|
| 80 |
-
'сух.'
|
| 81 |
]
|
| 82 |
|
| 83 |
|
|
@@ -87,8 +85,7 @@ WINE_TYPES = [
|
|
| 87 |
'розовое',
|
| 88 |
'роз',
|
| 89 |
'кр',
|
| 90 |
-
'
|
| 91 |
-
'бел',
|
| 92 |
'розе',
|
| 93 |
'rosso',
|
| 94 |
'roso',
|
|
@@ -154,13 +151,11 @@ GBS = [
|
|
| 154 |
|
| 155 |
COLORS_FOR_TRIM = [
|
| 156 |
'красное',
|
| 157 |
-
'крас',
|
| 158 |
-
'кр',
|
| 159 |
'белое',
|
|
|
|
|
|
|
| 160 |
'бел',
|
| 161 |
-
'розовое',
|
| 162 |
'розе',
|
| 163 |
-
'rose',
|
| 164 |
'rosso',
|
| 165 |
'roso',
|
| 166 |
'roseto',
|
|
@@ -212,8 +207,6 @@ GRAPES = [
|
|
| 212 |
|
| 213 |
|
| 214 |
OTHER_WORDS=[
|
| 215 |
-
"Шампанское",
|
| 216 |
-
"Шампань",
|
| 217 |
"Игристое",
|
| 218 |
"Жемчужное",
|
| 219 |
"Газированный",
|
|
@@ -234,7 +227,6 @@ OTHER_WORDS=[
|
|
| 234 |
"Десертный",
|
| 235 |
"Вкус",
|
| 236 |
"Сорт",
|
| 237 |
-
"односолод."
|
| 238 |
]
|
| 239 |
|
| 240 |
|
|
@@ -244,14 +236,10 @@ SOUR_MERGE_DICT={
|
|
| 244 |
'sweet':'сладкое',
|
| 245 |
'сухое':'сухое',
|
| 246 |
'п/сух':'полусухое',
|
| 247 |
-
'п/сух.':'полусухое',
|
| 248 |
'п/сл':'полусладкое',
|
| 249 |
-
'п/сл.':'полусладкое',
|
| 250 |
'п/с':'полусухое',
|
| 251 |
'сл':'сладкое',
|
| 252 |
-
'сл.':'сладкое',
|
| 253 |
'сух':'сухое',
|
| 254 |
-
'сух.':'сухое',
|
| 255 |
None: 'unmatched',
|
| 256 |
}
|
| 257 |
|
|
@@ -265,8 +253,7 @@ TYPES_WINES_DICT={
|
|
| 265 |
'Сироп':'Сиропы',
|
| 266 |
'Арманьяк':'Коньяк',
|
| 267 |
'Бренди':'Коньяк',
|
| 268 |
-
'Ликер':'Ликер',
|
| 269 |
-
'Ликёр': 'Ликер',
|
| 270 |
'Граппа':'Водка',
|
| 271 |
'Настойка':'Водка',
|
| 272 |
'Конфеты':'Сладости',
|
|
@@ -276,13 +263,11 @@ TYPES_WINES_DICT={
|
|
| 276 |
'Винный напиток': "Вино",
|
| 277 |
"Игристое вино":'Шампанское',
|
| 278 |
"Самогон": "Водка",
|
| 279 |
-
None: 'unmatched'
|
| 280 |
}
|
| 281 |
|
| 282 |
|
| 283 |
COLOR_MERGE_DICT={
|
| 284 |
"кр":'красное',
|
| 285 |
-
"крас":'красное',
|
| 286 |
"red":"красное",
|
| 287 |
"бел":"белое",
|
| 288 |
"white":"белое",
|
|
@@ -298,15 +283,3 @@ COLOR_MERGE_DICT={
|
|
| 298 |
None: 'unmatched'
|
| 299 |
}
|
| 300 |
|
| 301 |
-
COUNTRY_LIST=[
|
| 302 |
-
"Франция",
|
| 303 |
-
"Испания",
|
| 304 |
-
"Италия",
|
| 305 |
-
"Шотландия",
|
| 306 |
-
]
|
| 307 |
-
|
| 308 |
-
NORMALIZED_NAMES_ALTERNATIVES_DICT={
|
| 309 |
-
"M&H" : ["em end ejch"],
|
| 310 |
-
"peats beast" : ["pits bist"],
|
| 311 |
-
"xo": ["ho"]
|
| 312 |
-
}
|
|
|
|
| 75 |
'п/сл',
|
| 76 |
'п/с',
|
| 77 |
'сл',
|
|
|
|
| 78 |
'сух',
|
|
|
|
| 79 |
]
|
| 80 |
|
| 81 |
|
|
|
|
| 85 |
'розовое',
|
| 86 |
'роз',
|
| 87 |
'кр',
|
| 88 |
+
'бел',
|
|
|
|
| 89 |
'розе',
|
| 90 |
'rosso',
|
| 91 |
'roso',
|
|
|
|
| 151 |
|
| 152 |
COLORS_FOR_TRIM = [
|
| 153 |
'красное',
|
|
|
|
|
|
|
| 154 |
'белое',
|
| 155 |
+
'розовое'
|
| 156 |
+
'кр',
|
| 157 |
'бел',
|
|
|
|
| 158 |
'розе',
|
|
|
|
| 159 |
'rosso',
|
| 160 |
'roso',
|
| 161 |
'roseto',
|
|
|
|
| 207 |
|
| 208 |
|
| 209 |
OTHER_WORDS=[
|
|
|
|
|
|
|
| 210 |
"Игристое",
|
| 211 |
"Жемчужное",
|
| 212 |
"Газированный",
|
|
|
|
| 227 |
"Десертный",
|
| 228 |
"Вкус",
|
| 229 |
"Сорт",
|
|
|
|
| 230 |
]
|
| 231 |
|
| 232 |
|
|
|
|
| 236 |
'sweet':'сладкое',
|
| 237 |
'сухое':'сухое',
|
| 238 |
'п/сух':'полусухое',
|
|
|
|
| 239 |
'п/сл':'полусладкое',
|
|
|
|
| 240 |
'п/с':'полусухое',
|
| 241 |
'сл':'сладкое',
|
|
|
|
| 242 |
'сух':'сухое',
|
|
|
|
| 243 |
None: 'unmatched',
|
| 244 |
}
|
| 245 |
|
|
|
|
| 253 |
'Сироп':'Сиропы',
|
| 254 |
'Арманьяк':'Коньяк',
|
| 255 |
'Бренди':'Коньяк',
|
| 256 |
+
'Ликер':'Ликеры',
|
|
|
|
| 257 |
'Граппа':'Водка',
|
| 258 |
'Настойка':'Водка',
|
| 259 |
'Конфеты':'Сладости',
|
|
|
|
| 263 |
'Винный напиток': "Вино",
|
| 264 |
"Игристое вино":'Шампанское',
|
| 265 |
"Самогон": "Водка",
|
|
|
|
| 266 |
}
|
| 267 |
|
| 268 |
|
| 269 |
COLOR_MERGE_DICT={
|
| 270 |
"кр":'красное',
|
|
|
|
| 271 |
"red":"красное",
|
| 272 |
"бел":"белое",
|
| 273 |
"white":"белое",
|
|
|
|
| 283 |
None: 'unmatched'
|
| 284 |
}
|
| 285 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
preprocess/preprocess.py
CHANGED
|
@@ -1,243 +1,224 @@
|
|
| 1 |
-
import json
|
| 2 |
-
from tqdm import tqdm
|
| 3 |
-
from preprocess.utils.items.attrs import *
|
| 4 |
-
from preprocess.utils.common.extracters import *
|
| 5 |
-
from preprocess.utils.common.brand_matching import *
|
| 6 |
-
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
-
from preprocess.utils.common.utils import *
|
| 8 |
-
from preprocess.utils.common.top_inserts import *
|
| 9 |
-
import pandas as pd
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
class Preprocessor():
|
| 14 |
-
|
| 15 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
-
sour_merge_dict, type_merge_dict, color_merge_dict
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
self.
|
| 21 |
-
self.
|
| 22 |
-
self.
|
| 23 |
-
self.
|
| 24 |
-
self.
|
| 25 |
-
self.
|
| 26 |
-
self.
|
| 27 |
-
self.
|
| 28 |
-
|
| 29 |
-
self.
|
| 30 |
-
self.
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
result['
|
| 71 |
-
#
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
result['
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
#
|
| 120 |
-
#
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
text
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
#
|
| 139 |
-
#
|
| 140 |
-
#
|
| 141 |
-
#
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
print('-----*-----
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
items
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
|
| 226 |
-
items['gb']=gb
|
| 227 |
-
items['sour']=sour
|
| 228 |
-
items['sour']=items['sour'].replace(self.sour_dict)
|
| 229 |
-
|
| 230 |
-
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 231 |
-
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 232 |
-
products['gb']=gb
|
| 233 |
-
products['sour']=sour
|
| 234 |
-
products['sour']=products['sour'].replace(self.sour_dict)
|
| 235 |
-
|
| 236 |
-
print('-----*-----Replacing product types-----*-----')
|
| 237 |
-
products['type']=products['type'].replace(self.type_dict)
|
| 238 |
-
|
| 239 |
-
return items, products
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
from preprocess.utils.items.attrs import *
|
| 4 |
+
from preprocess.utils.common.extracters import *
|
| 5 |
+
from preprocess.utils.common.brand_matching import *
|
| 6 |
+
from preprocess.utils.common.parallel_brand_matching import *
|
| 7 |
+
from preprocess.utils.common.utils import *
|
| 8 |
+
from preprocess.utils.common.top_inserts import *
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class Preprocessor():
|
| 14 |
+
|
| 15 |
+
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 16 |
+
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 17 |
+
sour_merge_dict, type_merge_dict, color_merge_dict):
|
| 18 |
+
|
| 19 |
+
self.long_types_list=long_types_list
|
| 20 |
+
self.short_types_list=short_types_list
|
| 21 |
+
self.sour=sour_list
|
| 22 |
+
self.type_wine=type_wine
|
| 23 |
+
self.gbs=gbs
|
| 24 |
+
self.colors_ft=colors_for_trim
|
| 25 |
+
self.grapes=grapes
|
| 26 |
+
self.other_words=other_words
|
| 27 |
+
self.types_n_others=long_types_list+other_words
|
| 28 |
+
self.sour_dict=sour_merge_dict
|
| 29 |
+
self.type_dict=type_merge_dict
|
| 30 |
+
self.color_merge_dict=color_merge_dict
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def process_items(self, df):
|
| 34 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 35 |
+
#counter=0
|
| 36 |
+
for idf, i in tqdm(zip(df['id'].values, df['attrs'].values)):
|
| 37 |
+
|
| 38 |
+
try:
|
| 39 |
+
i=json.loads(i)
|
| 40 |
+
result['id'].append(idf)
|
| 41 |
+
if 'brand' in i.keys():
|
| 42 |
+
result['brand'].append(i['brand'])
|
| 43 |
+
else: result['brand'].append(None)
|
| 44 |
+
result['name'].append(i['name'])
|
| 45 |
+
drink_type=get_type(i, self.long_types_list)
|
| 46 |
+
if drink_type is None:
|
| 47 |
+
drink_type=check_spark(i)
|
| 48 |
+
if drink_type is None:
|
| 49 |
+
drink_type=check_color_and_sour(i)
|
| 50 |
+
if drink_type is None:
|
| 51 |
+
drink_type=check_spark(i, col_name='type_wine')
|
| 52 |
+
if drink_type is None:
|
| 53 |
+
drink_type=check_color_and_sour(i, types=self.sour)
|
| 54 |
+
#if 'type' in i.keys():
|
| 55 |
+
result['type'].append(drink_type)#i['type'])
|
| 56 |
+
#else: dd['type'].append(None)
|
| 57 |
+
if 'volume' in i.keys():
|
| 58 |
+
result['volume'].append(i['volume'])
|
| 59 |
+
else:
|
| 60 |
+
vol=extract_volume_or_number(i['name'])
|
| 61 |
+
result['volume'].append(vol)
|
| 62 |
+
if 'year' in i.keys():
|
| 63 |
+
result['year'].append(i['year'])
|
| 64 |
+
else:
|
| 65 |
+
year=extract_production_year(i['name'])
|
| 66 |
+
result['year'].append(year)
|
| 67 |
+
alco=extract_alcohol_content(i['name'])
|
| 68 |
+
if 'type_wine' in i.keys():
|
| 69 |
+
result['type_wine'].append(i['type_wine'])
|
| 70 |
+
else: result['type_wine'].append(None)
|
| 71 |
+
#f alco is not None:
|
| 72 |
+
result['alco'].append(alco)
|
| 73 |
+
#else: dd['type_wine'].append(None)
|
| 74 |
+
except Exception as ex:
|
| 75 |
+
print(idf, ex)
|
| 76 |
+
return pd.DataFrame(result)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def process_products(self, products):
|
| 80 |
+
result={'id':[], 'brand':[], 'name':[], 'type':[], "type_wine":[], "volume":[], "year":[], 'alco':[]}#, 'embeddings':[]}
|
| 81 |
+
for idx, row in tqdm(products.iterrows()):
|
| 82 |
+
try:
|
| 83 |
+
result['id'].append(row['id'])
|
| 84 |
+
result['brand'].append(row['brand'])
|
| 85 |
+
result['type_wine'].append(row['category'])
|
| 86 |
+
result['type'].append(row['product_type'])
|
| 87 |
+
result['name'].append(row['name_long'])
|
| 88 |
+
vol=extract_volume_or_number(row['name'])
|
| 89 |
+
result['volume'].append(vol)
|
| 90 |
+
#year=extract_production_year(row['name'])
|
| 91 |
+
year=extract_production_year(str(row['name_postfix']))
|
| 92 |
+
result['year'].append(year)
|
| 93 |
+
#rr['year'].append(row['name_postfix'])
|
| 94 |
+
alco=extract_alcohol_content(row['name'])
|
| 95 |
+
#f alco is not None:
|
| 96 |
+
result['alco'].append(alco)
|
| 97 |
+
except Exception as ex:
|
| 98 |
+
print(ex)
|
| 99 |
+
return pd.DataFrame(result)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def prcess_text(self, text):
|
| 103 |
+
#text=''+origin
|
| 104 |
+
#text=str(split_russian_and_english(text))
|
| 105 |
+
gb=find_full_word(text, self.gbs)#get_GB(text)
|
| 106 |
+
if gb is not None:
|
| 107 |
+
text=text.replace(str(gb), '')
|
| 108 |
+
|
| 109 |
+
alcohol = extract_alcohol_content(text)
|
| 110 |
+
if alcohol is not None:
|
| 111 |
+
alco_w_comma=alcohol.replace('.', ',')
|
| 112 |
+
text=text.replace(str(alcohol), '').replace(str(alco_w_comma), '')
|
| 113 |
+
volume_or_number = extract_volume_or_number(text)
|
| 114 |
+
if volume_or_number is not None:
|
| 115 |
+
volume_with_comma=str(volume_or_number).replace('.', ',')
|
| 116 |
+
text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 117 |
+
test=clean_wine_name(text) #remove_l(text)
|
| 118 |
+
#text=text.replace(str(volume_or_number)+' л', '').replace(str(volume_with_comma)+' л', '')
|
| 119 |
+
# else:
|
| 120 |
+
# volume_or_number=re_extract_volume(text)
|
| 121 |
+
# if volume_or_number is not None:
|
| 122 |
+
# volume_with_comma=volume_or_number.replace('.', ',')
|
| 123 |
+
# text=text.replace(str(volume_or_number), '').replace(str(volume_with_comma), '')
|
| 124 |
+
years = extract_years(text)
|
| 125 |
+
if years is not None:
|
| 126 |
+
text=text.replace(str(years), '').replace(str('выдержка'), '').replace(str('Выдержка'), '').replace(str('aging'), '')
|
| 127 |
+
production_year = extract_production_year(text)
|
| 128 |
+
if production_year is not None:
|
| 129 |
+
text=text.replace(str(production_year), '')
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
color=find_full_word(text, self.colors_ft)
|
| 133 |
+
if color is not None:
|
| 134 |
+
text=text.replace(str(color), '')
|
| 135 |
+
sour=find_full_word(text, self.sour) #get_sour(text)
|
| 136 |
+
if sour is not None:
|
| 137 |
+
text=text.replace(str(sour), '')
|
| 138 |
+
# re_extracted_volume=re_extract_volume(text)
|
| 139 |
+
# if re_extracted_volume is not None:
|
| 140 |
+
# volume_with_comma=re_extracted_volume.replace('.', ',')
|
| 141 |
+
# text=text.replace(str(re_extracted_volume), '').replace(str(volume_with_comma), '')
|
| 142 |
+
|
| 143 |
+
# else:
|
| 144 |
+
# re_extracted_volume=re_extract_volume(str(volume_or_number))
|
| 145 |
+
# volume_or_number=re_extracted_volume
|
| 146 |
+
|
| 147 |
+
return remove_quotes(text), alcohol, volume_or_number, years, production_year, gb, color, sour
|
| 148 |
+
|
| 149 |
+
|
| 150 |
+
def process(self, products, items):
|
| 151 |
+
|
| 152 |
+
print('------*-----Prepare items catalogue-----*-----')
|
| 153 |
+
items=self.process_items(items.copy())
|
| 154 |
+
print('-----*-----Prepare products catalogue-----*-----')
|
| 155 |
+
products=self.process_products(products.copy())
|
| 156 |
+
|
| 157 |
+
items['brand']=items['brand'].apply(lambda x: str(x).strip().lower())
|
| 158 |
+
products['brand']=products['brand'].apply(lambda x: str(x).strip().lower())
|
| 159 |
+
|
| 160 |
+
print('-----*-----Split n match-----*-----')
|
| 161 |
+
splited=split_n_match(products, items)
|
| 162 |
+
items["brand"] = items["brand"].replace(splited)
|
| 163 |
+
|
| 164 |
+
print('-----*-----Fill brands in items-----*-----')
|
| 165 |
+
fill_brands_in_dataframe(products['brand'].unique(), items)
|
| 166 |
+
|
| 167 |
+
print('-----*-----Brand matching-----*-----')
|
| 168 |
+
comp_list, prod_brand_list, items_brand_list=get_same_brands(products, items)
|
| 169 |
+
out_prods=list(set(prod_brand_list)-set(comp_list))
|
| 170 |
+
out_items=list(set(items_brand_list)-set(comp_list))
|
| 171 |
+
brand_map_improved=match_brands_improved(out_items, list(products['brand'].unique()))
|
| 172 |
+
items["new_brand"] = items["new_brand"].replace(brand_map_improved)
|
| 173 |
+
|
| 174 |
+
items['type']=items['type'].replace(self.type_dict)
|
| 175 |
+
|
| 176 |
+
print('-----*-----Unwrap brend cats step 1-----*-----')
|
| 177 |
+
unwrap_b_match=unwrap_brands(products)
|
| 178 |
+
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 179 |
+
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 180 |
+
|
| 181 |
+
print('-----*-----Unwrap brend cats step 2-----*-----')
|
| 182 |
+
unwrap_b_match=unwrap_brands(products)
|
| 183 |
+
items["new_brand"] = items["new_brand"].replace(unwrap_b_match)
|
| 184 |
+
products["brand"] = products["brand"].replace(unwrap_b_match)
|
| 185 |
+
|
| 186 |
+
print('-----*-----Finding brands in names-----*-----')
|
| 187 |
+
items['new_brand']=items['new_brand'].replace('none', None)
|
| 188 |
+
i_brands=items[items['new_brand'].isna()]['name'].values
|
| 189 |
+
p_brands=[i for i in products['brand'].unique() if i is not None and len(i)>3]
|
| 190 |
+
new_found_brands=check_brands_in_strings_pqdm(i_brands, p_brands)
|
| 191 |
+
items.loc[items['name'].isin(new_found_brands.keys()), 'new_brand'] = items['name'].map(new_found_brands)
|
| 192 |
+
|
| 193 |
+
print('-----*-----Top inserts-----*-----')
|
| 194 |
+
process_unbrended_names(items, p_brands, self.prcess_text, self.short_types_list, #self.long_type_list
|
| 195 |
+
self.grapes, self.other_words)
|
| 196 |
+
|
| 197 |
+
print('-----*-----Adding service categories-----*-----')
|
| 198 |
+
merge_wine_type(items, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 199 |
+
merge_types(items, products)
|
| 200 |
+
merge_wine_type(products, colors=self.type_wine, color_merge_dict=self.color_merge_dict)
|
| 201 |
+
merge_types(products, products)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
print('-----*-----Name trimming-----*-----')
|
| 205 |
+
item_timed_names, gb, sour=name_trimmer(items, self.prcess_text, self.types_n_others)
|
| 206 |
+
#items['name']=items['id'].replace(item_timed_names)
|
| 207 |
+
items.loc[items['id'].isin(item_timed_names.keys()), 'name'] = items['id'].map(item_timed_names)
|
| 208 |
+
items['gb']=gb
|
| 209 |
+
items['sour']=sour
|
| 210 |
+
items['sour']=items['sour'].replace(self.sour_dict)
|
| 211 |
+
products_trimed_names, gb, sour=name_trimmer(products, self.prcess_text, self.types_n_others)
|
| 212 |
+
products.loc[products['id'].isin(products_trimed_names.keys()), 'name'] = products['id'].map(products_trimed_names)
|
| 213 |
+
products['gb']=gb
|
| 214 |
+
products['sour']=sour
|
| 215 |
+
products['sour']=products['sour'].replace(self.sour_dict)
|
| 216 |
+
|
| 217 |
+
print('-----*-----Replacing product types-----*-----')
|
| 218 |
+
products['type']=products['type'].replace(self.type_dict)
|
| 219 |
+
|
| 220 |
+
return items, products
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
preprocess/utils/common/utils.py
CHANGED
|
@@ -1,165 +1,138 @@
|
|
| 1 |
-
import re
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
return
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
def
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
# Используем re.escape, чтобы экранировать спецсимволы в словах.
|
| 139 |
-
pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
|
| 140 |
-
#print("Pattern: " + pattern)
|
| 141 |
-
|
| 142 |
-
# Заменяем найденные полные слова на пустую строку.
|
| 143 |
-
new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)
|
| 144 |
-
|
| 145 |
-
# Убираем лишние пробелы, возникающие после удаления слов.
|
| 146 |
-
new_text = re.sub(r'\s+', ' ', new_text).strip()
|
| 147 |
-
|
| 148 |
-
return new_text
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
def name_trimmer(df, prcess_text, types_and_others):
    """Build cleaned names for every row of ``df``.

    Each row's ``name`` is converted to ``str`` and parsed by ``prcess_text``,
    which must return an 8-tuple
    ``(text, alcohol, volume_or_number, years, production_year, gb, color, sour)``.
    The words in ``types_and_others`` are then removed from the parsed text
    with ``trim_name``, commas and dots are dropped, and the result is
    lower-cased and stripped.

    :param df: DataFrame with ``name`` and ``id`` columns.
    :param prcess_text: Callable that parses a raw name into the 8-tuple above.
    :param types_and_others: Words to remove from the parsed text.
    :return: Tuple ``(names_by_id, gbs, sours)``: a dict mapping each row
        ``id`` to its cleaned name, plus the ``gb`` and ``sour`` values in
        row order.
    """
    # One C-level pass that deletes both punctuation characters.
    drop_punctuation = str.maketrans('', '', ',.')

    names_by_id = {}
    gb_values = []
    sour_values = []
    for _, record in tqdm(df.iterrows()):
        parsed = prcess_text(str(record['name']))
        text, _alcohol, _volume, _years, _production_year, gb, _color, sour = parsed
        cleaned = trim_name(text, types_and_others).translate(drop_punctuation)
        names_by_id[record['id']] = cleaned.lower().strip()

        gb_values.append(gb)
        sour_values.append(sour)
    return names_by_id, gb_values, sour_values
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
from tqdm import tqdm
|
| 3 |
+
import csv
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def get_delimiter(file_path, sample_size=1024, encoding='utf-8'):
    """Detect the column delimiter of a CSV file.

    Reads the first ``sample_size`` characters of the file and lets
    ``csv.Sniffer`` guess the dialect from that sample.

    :param file_path: Path of the CSV file to inspect.
    :param sample_size: Number of characters read for the analysis.
    :param encoding: Text encoding used to decode the file.
    :return: The detected delimiter character.
    :raises csv.Error: If the sniffer cannot determine the delimiter.
    """
    # An explicit encoding keeps the result independent of the platform
    # locale. Undecodable bytes are replaced instead of raising, because only
    # the delimiter matters here, not the exact text.
    with open(file_path, 'r', encoding=encoding, errors='replace') as f:
        sample = f.read(sample_size)  # read only part of the file for analysis
    return csv.Sniffer().sniff(sample).delimiter
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def remove_quotes(text):
    """Return ``text`` with every single and double quote character removed."""
    quote_pattern = re.compile(r'["\']')
    return quote_pattern.sub('', text)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def remove_l(text):
    """Remove the stand-alone letter "л" (any case) from ``text``.

    Only occurrences delimited by word boundaries are removed; the letter
    inside longer words is left untouched.
    """
    without_letter = re.sub(r'\bл\b', '', text, flags=re.IGNORECASE)

    # The removal can leave runs of whitespace behind; squeeze them to one space.
    squeezed = re.sub(r'\s{2,}', ' ', without_letter)
    return squeezed.strip()
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def clean_wine_name(name):
    """
    Strip a trailing stand-alone single letter from ``name``.

    A one-letter word (Latin or Cyrillic) at the very end of the string is
    removed together with the whitespace around it; letters that belong to a
    longer word are kept. For example, "токай л" becomes "токай".
    """
    # Pattern pieces:
    #   \s+                - one or more whitespace characters;
    #   \b                 - word boundary;
    #   [A-Za-zА-ЯЁа-яё]   - exactly one Latin or Cyrillic letter;
    #   \b                 - word boundary;
    #   \s*$               - optional whitespace up to the end of the string.
    trailing_letter = re.compile(r'\s+\b[A-Za-zА-ЯЁа-яё]\b\s*$')
    return trailing_letter.sub('', name)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def find_full_word(text, word_list):
    """
    Return the first entry of ``word_list`` that occurs in ``text`` as a whole word.

    Entries are tried in list order and compared case-insensitively; an entry
    counts only when it is delimited by word boundaries. Returns ``None``
    when no entry matches.
    """
    def occurs_as_word(candidate):
        whole_word = r'\b' + re.escape(candidate) + r'\b'
        return re.search(whole_word, text, re.IGNORECASE)

    return next((word for word in word_list if occurs_as_word(word)), None)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def merge_wine_type(items, colors=None, color_merge_dict=None):
    """Add a ``new_type_wine`` column with the wine colour found for each row.

    For every row the colour is searched as a whole word in ``type_wine``
    first and, when nothing is found there, in ``name``. Rows where neither
    field contains a known colour, or where the lookup raises, get ``None``.
    The collected values are then mapped through ``color_merge_dict``.
    ``items`` is modified in place.

    :param items: DataFrame with ``type_wine`` and ``name`` columns.
    :param colors: Colour words to look for; ``None`` is treated as an empty list.
    :param color_merge_dict: Mapping handed to ``Series.replace``; the
        replacement step is skipped when it is ``None`` or empty.
    :return: None.
    """
    colors = [] if colors is None else colors

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            color = None
            type_wine = row['type_wine']
            # Missing values may arrive as NaN (a float) rather than None.
            # Anything that is not text is treated as "no wine type" so the
            # name fallback below still runs instead of failing in the regex.
            if isinstance(type_wine, str):
                color = find_full_word(type_wine, colors)
            if color is None:
                color = find_full_word(row['name'], colors)
            result.append(color)
        except Exception as ex:
            # Best effort: a malformed row must not abort the whole pass.
            print(ex)
            result.append(None)

    items['new_type_wine'] = result
    if color_merge_dict:
        items['new_type_wine'] = items['new_type_wine'].replace(color_merge_dict)
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def merge_types(items, products):
    """Add a ``new_type`` column with the alcohol type resolved for each row.

    The known types are the distinct, stripped and lower-cased values of
    ``products['type']`` plus 'ликёр'. For each row of ``items`` the type is
    looked up as a whole word in ``name`` first, then in the row's own
    ``type``; when neither contains a known type the row's ``type`` value is
    kept as is. Rows without a type, or rows where the lookup raises, end up
    as 'unmatched'. ``items`` is modified in place.

    :param items: DataFrame with ``name`` and ``type`` columns.
    :param products: DataFrame whose ``type`` column supplies the known types.
    :return: None.
    """
    alco_types = []
    for raw_type in products['type'].unique():
        # unique() also yields NaN/None for rows without a type; those have
        # no .strip() and would crash the whole function.
        if not isinstance(raw_type, str):
            continue
        cleaned = raw_type.strip().lower()
        # An empty type would build the pattern r'\b\b', which matches at any
        # word boundary and would be "found" in every name.
        if cleaned:
            alco_types.append(cleaned)
    alco_types.append('ликёр')

    result = []
    for _, row in tqdm(items.iterrows(), total=len(items)):
        try:
            found = find_full_word(row['name'], alco_types)
            if found is None:
                own_type = row['type']
                if own_type is not None:
                    found = find_full_word(own_type, alco_types)
                    if found is None:
                        # Unknown type: keep the original value unchanged.
                        found = own_type
            result.append(found)
        except Exception as ex:
            # Best effort: a malformed row must not abort the whole pass.
            print(ex)
            result.append(None)

    items['new_type'] = result
    items['new_type'] = items['new_type'].replace({'ликёр': 'ликер', None: 'unmatched'})
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def trim_name(text, words_to_remove):
    """
    Remove from ``text`` the words that fully match entries of ``words_to_remove``.

    Matching is case-insensitive and limited to whole words or phrases
    delimited by word boundaries. Whitespace left behind is collapsed to
    single spaces and the result is stripped.

    :param text: Source string.
    :param words_to_remove: Words or phrases to delete.
    :return: The string with the listed words removed.
    """
    # Longest entries first: regex alternation takes the first alternative
    # that fits, so "вино" listed before "вино игристое" would otherwise win
    # and leave "игристое" behind. Empty entries carry no information.
    candidates = sorted((word for word in words_to_remove if word), key=len, reverse=True)

    new_text = text
    if candidates:
        # re.escape neutralises regex metacharacters inside the words.
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in candidates) + r')\b'
        new_text = re.sub(pattern, '', text, flags=re.IGNORECASE)

    # Collapse the whitespace left where words were removed.
    return re.sub(r'\s+', ' ', new_text).strip()
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def name_trimmer(df, prcess_text, types_and_others):
    """Produce cleaned names plus the ``gb`` and ``sour`` values of every row.

    Each row's ``name`` is converted to ``str`` and parsed by ``prcess_text``,
    which must return an 8-tuple
    ``(text, alcohol, volume_or_number, years, production_year, gb, color, sour)``.
    The words in ``types_and_others`` are removed from the parsed text with
    ``trim_name``, commas and dots are dropped, and the result is lower-cased
    and stripped.

    :param df: DataFrame with ``name`` and ``id`` columns.
    :param prcess_text: Callable that parses a raw name into the 8-tuple above.
    :param types_and_others: Words to remove from the parsed text.
    :return: Tuple ``(trimmed_by_id, gbs, sours)``: a dict mapping each row
        ``id`` to its cleaned name, plus the ``gb`` and ``sour`` values in
        row order.
    """
    trimmed_by_id = {}
    gb_values = []
    sour_values = []
    for _, record in tqdm(df.iterrows()):
        parsed = prcess_text(str(record['name']))
        text, _alcohol, _volume, _years, _production_year, gb, _color, sour = parsed

        stripped = trim_name(text, types_and_others)
        for punctuation in (',', '.'):
            stripped = stripped.replace(punctuation, '')
        trimmed_by_id[record['id']] = stripped.lower().strip()

        gb_values.append(gb)
        sour_values.append(sour)
    return trimmed_by_id, gb_values, sour_values
|
preprocess/utils/items/attrs.py
CHANGED
|
@@ -6,7 +6,7 @@ def check_spark(row, col_name='name', types=['Игристое', 'игр']):
|
|
| 6 |
return None
|
| 7 |
|
| 8 |
|
| 9 |
-
def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное'
|
| 10 |
if col_name in row.keys():
|
| 11 |
for t in types:
|
| 12 |
if t.lower() in row[col_name].lower():
|
|
|
|
| 6 |
return None
|
| 7 |
|
| 8 |
|
| 9 |
+
def check_color_and_sour(row, col_name='type_wine', types=['Белое', 'Розовое', 'Красное']):
|
| 10 |
if col_name in row.keys():
|
| 11 |
for t in types:
|
| 12 |
if t.lower() in row[col_name].lower():
|
processor/matching.py
CHANGED
|
@@ -1,302 +1,159 @@
|
|
| 1 |
-
import
|
| 2 |
-
from
|
| 3 |
-
|
| 4 |
-
from
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
:param items_df: DataFrame с колонками '
|
| 68 |
-
:
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
:param products_df: DataFrame с колонками 'id', 'brand', 'type', 'name', 'volume', 'new_type_wine', 'sour', 'new_type'.
|
| 160 |
-
:param items_groups: Словарь, сформированный функцией prepare_groups_with_ids.
|
| 161 |
-
:param items_df: DataFrame итемов с колонками 'id', 'new_brand', 'new_type_wine', 'new_type', 'volume', 'name', 'sour'.
|
| 162 |
-
:param name_threshold: Порог сходства для fuzzy matching.
|
| 163 |
-
:return: DataFrame с добавленными столбцами 'matched_items' (список совпадений) и 'alternative' (альтернативные совпадения).
|
| 164 |
-
"""
|
| 165 |
-
results = []
|
| 166 |
-
no_match_products = [] # Список для хранения продуктов без совпадения в исходной группе
|
| 167 |
-
|
| 168 |
-
if name_threshold < 50:
|
| 169 |
-
name_threshold = 50
|
| 170 |
-
|
| 171 |
-
# Первый проход: поиск по группам (brand, type, volume, new_type_wine, sour)
|
| 172 |
-
for idx, product in tqdm(products_df.iterrows(), total=len(products_df)):
|
| 173 |
-
product_brand = product['brand']
|
| 174 |
-
product_type = product['type']
|
| 175 |
-
product_name = product['name']
|
| 176 |
-
product_volume = product['volume']
|
| 177 |
-
product_type_wine = product['new_type_wine']
|
| 178 |
-
product_sour = product['sour']
|
| 179 |
-
|
| 180 |
-
key = (product_brand, product_type, product_volume, product_type_wine, product_sour)
|
| 181 |
-
#print("Name: " + product_name)
|
| 182 |
-
#print("Key: " + str(key))
|
| 183 |
-
#print("Groups: " + str(items_groups))
|
| 184 |
-
items_data = items_groups.get(key, [])
|
| 185 |
-
if items_data:
|
| 186 |
-
# Распаковываем: id, оригинальное имя, нормализованное имя, volume, new_type_wine, sour
|
| 187 |
-
#print("Data: " + str(items_data))
|
| 188 |
-
items_ids, items_names, items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = zip(*items_data)
|
| 189 |
-
else:
|
| 190 |
-
#print("Data: No")
|
| 191 |
-
items_ids, items_names,items_full_names, items_norm_names, items_volumes, item_type_wine, items_sour, items_year = ([], [], [], [], [],[], [], [])
|
| 192 |
-
|
| 193 |
-
norm_product_name = normalize_name_ex(product_name)
|
| 194 |
-
matches = process.extract(
|
| 195 |
-
norm_product_name, list(items_norm_names), scorer=fuzz.ratio, score_cutoff=name_threshold, limit=20
|
| 196 |
-
)
|
| 197 |
-
|
| 198 |
-
matched_items = [
|
| 199 |
-
{
|
| 200 |
-
'item_id': items_ids[idx_candidate],
|
| 201 |
-
'brand': product_brand,
|
| 202 |
-
'item_name': items_full_names[idx_candidate],
|
| 203 |
-
#'item_name': items_names[idx_candidate],
|
| 204 |
-
'score': score,
|
| 205 |
-
'volume': items_volumes[idx_candidate],
|
| 206 |
-
'color': item_type_wine[idx_candidate],
|
| 207 |
-
'sour': items_sour[idx_candidate],
|
| 208 |
-
'year': items_year[idx_candidate],
|
| 209 |
-
}
|
| 210 |
-
for match, score, idx_candidate in matches
|
| 211 |
-
]
|
| 212 |
-
|
| 213 |
-
if matched_items:
|
| 214 |
-
matched_items = order_by_best_year(matched_items, product['year'])
|
| 215 |
-
matched_items = matched_items[:5]
|
| 216 |
-
else:
|
| 217 |
-
no_match_products.append((idx, product))
|
| 218 |
-
|
| 219 |
-
results.append({
|
| 220 |
-
'product_id': product['id'],
|
| 221 |
-
#"matched_top_id": top_matched_id,
|
| 222 |
-
'matched_items': matched_items,
|
| 223 |
-
#"alternative_top_id": "",
|
| 224 |
-
#'alternative': [] # Заполняется во втором проходе
|
| 225 |
-
})
|
| 226 |
-
|
| 227 |
-
if include_alternatives:
|
| 228 |
-
# Подготовка альтернативной группировки по (new_type_wine, new_type, volume, sour)
|
| 229 |
-
groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)
|
| 230 |
-
|
| 231 |
-
# Второй проход: для продуктов без совпадений ищем по альтернативным группам
|
| 232 |
-
for idx, product in tqdm(no_match_products):
|
| 233 |
-
#print("Product: " + str(product))
|
| 234 |
-
product_brand = product['brand']
|
| 235 |
-
product_type_wine = product['new_type_wine']
|
| 236 |
-
product_type = product['new_type']
|
| 237 |
-
product_volume = product['volume']
|
| 238 |
-
product_name = product['name']
|
| 239 |
-
product_sour = product['sour']
|
| 240 |
-
|
| 241 |
-
#alt_key = (product_type_wine, product_type, product_volume, product_sour)
|
| 242 |
-
alt_key = (product_type_wine, product_type, product_volume)
|
| 243 |
-
|
| 244 |
-
#print("AltName: " + str(product))
|
| 245 |
-
#print("AltKey: " + str(alt_key))
|
| 246 |
-
#print("AltGroups: " + str(groups_by_alternative_keys))
|
| 247 |
-
#print("AltGroups Keys: " + str(groups_by_alternative_keys.keys()))
|
| 248 |
-
type_items = groups_by_alternative_keys.get(alt_key, [])
|
| 249 |
-
#print("AltGroups2: " + str(type_items))
|
| 250 |
-
# Фильтруем, исключая итемы с исходным брендом
|
| 251 |
-
filtered_items = [item for item in type_items if item[1] != product_brand]
|
| 252 |
-
if filtered_items:
|
| 253 |
-
#print("AltData: " + str(filtered_items))
|
| 254 |
-
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = zip(*filtered_items)
|
| 255 |
-
else:
|
| 256 |
-
#print("AltData: No")
|
| 257 |
-
alt_ids, alt_brands, alt_names, alt_full_names, alt_norm_names, alt_volumes, alt_type_wine, alt_sour, alt_year = ([], [], [], [], [], [], [],[], [])
|
| 258 |
-
|
| 259 |
-
norm_product_name = normalize_name_ex(product_name)
|
| 260 |
-
#print("norm_product_name: " + str(norm_product_name))
|
| 261 |
-
#print("alt_norm_names: " + str(alt_norm_names))
|
| 262 |
-
alt_matches = process.extract(
|
| 263 |
-
norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=50
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
|
| 267 |
-
#alt_matches = compare_name_with_list(
|
| 268 |
-
# norm_product_name, list(alt_norm_names), scorer=fuzz.ratio, score_cutoff=70
|
| 269 |
-
#)
|
| 270 |
-
|
| 271 |
-
#print("alt_matches: " + str(alt_matches))
|
| 272 |
-
alt_matched_items = [
|
| 273 |
-
{
|
| 274 |
-
'item_id': alt_ids[idx_candidate],
|
| 275 |
-
'brand': alt_brands[idx_candidate],
|
| 276 |
-
#'item_name': alt_names[idx_candidate],
|
| 277 |
-
'item_name': alt_full_names[idx_candidate],
|
| 278 |
-
'score': score / 2,
|
| 279 |
-
'volume': alt_volumes[idx_candidate],
|
| 280 |
-
'color': alt_type_wine[idx_candidate],
|
| 281 |
-
'sour': alt_sour[idx_candidate],
|
| 282 |
-
'year': alt_year[idx_candidate],
|
| 283 |
-
}
|
| 284 |
-
for match, score, idx_candidate in alt_matches
|
| 285 |
-
]
|
| 286 |
-
|
| 287 |
-
alt_matched_items = order_by_best_year(alt_matched_items, product['year'])
|
| 288 |
-
alt_matched_items = alt_matched_items[:5]
|
| 289 |
-
|
| 290 |
-
results[idx]['matched_items'].extend(alt_matched_items)
|
| 291 |
-
|
| 292 |
-
for r in results:
|
| 293 |
-
r['matched_items'] = json.dumps(r['matched_items'], ensure_ascii=False)
|
| 294 |
-
|
| 295 |
-
#if alt_matched_items:
|
| 296 |
-
# results[idx]['alternative_top_id'] = alt_matched_items[0]["item_id"]
|
| 297 |
-
|
| 298 |
-
#results[idx]['alternative'] = alt_matched_items
|
| 299 |
-
|
| 300 |
-
results_df = pd.DataFrame(results)
|
| 301 |
-
merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
|
| 302 |
return merged_df
|
|
|
|
| 1 |
+
from tqdm import tqdm
|
| 2 |
+
from transliterate import translit, detect_language
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rapidfuzz import fuzz, process
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def normalize_name(name):
    """
    Normalise a name for comparison.

    When ``detect_language`` reports Russian, the name is transliterated to
    Latin script and lower-cased. In every other case, including any failure
    during detection or transliteration, the name is just lower-cased.
    """
    transliterated = None
    try:
        is_russian = detect_language(name) == 'ru'
        if is_russian:
            transliterated = translit(name, 'ru', reversed=True).lower()
    except Exception:
        # Deliberate best effort: fall back to plain lower-casing below.
        transliterated = None

    if transliterated is None:
        return name.lower()
    return transliterated
|
| 18 |
+
|
| 19 |
+
def prepare_groups_with_ids(items_df):
    """
    Pre-group items by (new_brand, type, volume, new_type_wine, sour).

    A ``norm_name`` column is computed once on a copy of the frame so that
    names do not have to be normalised again for every comparison. The input
    frame is not modified.

    :param items_df: DataFrame with columns 'new_brand', 'type', 'name', 'id',
        'volume', 'new_type_wine', 'sour', 'year'.
    :return: Dict {(new_brand, type, volume, new_type_wine, sour):
        [(id, name, norm_name, volume, new_type_wine, sour, year), ...]}.
    """
    items_df = items_df.copy()
    items_df['norm_name'] = items_df['name'].apply(normalize_name)

    group_keys = ['new_brand', 'type', 'volume', 'new_type_wine', 'sour']
    value_columns = ['id', 'name', 'norm_name', 'volume', 'new_type_wine', 'sour', 'year']

    # Iterate over the groups instead of using groupby(...).apply(...): each
    # sub-frame here always contains the grouping columns ('volume', 'sour', ...)
    # that the tuples need, whereas apply() on grouping columns is deprecated
    # in recent pandas. An empty frame also cleanly yields an empty dict.
    # NOTE(review): groupby drops rows with a missing value in any key column
    # by default - confirm that is intended.
    return {
        key: list(zip(*(group[column] for column in value_columns)))
        for key, group in items_df.groupby(group_keys)
    }
|
| 36 |
+
|
| 37 |
+
def prepare_groups_by_alternative_keys(items_df):
    """
    Group items by (new_type_wine, new_type, volume, sour), keeping the brand.

    A ``norm_name`` column is computed once on a copy of the frame; the input
    frame is not modified.

    :param items_df: DataFrame with columns 'new_brand', 'new_type_wine',
        'new_type', 'volume', 'name', 'id', 'sour', 'year'.
    :return: Dict {(new_type_wine, new_type, volume, sour):
        [(id, new_brand, name, norm_name, volume, new_type_wine, sour, year), ...]}.
    """
    items_df = items_df.copy()
    items_df['norm_name'] = items_df['name'].apply(normalize_name)

    group_keys = ['new_type_wine', 'new_type', 'volume', 'sour']
    value_columns = ['id', 'new_brand', 'name', 'norm_name', 'volume',
                     'new_type_wine', 'sour', 'year']

    # Iterate over the groups instead of using groupby(...).apply(...): each
    # sub-frame here always contains the grouping columns that the tuples
    # need, whereas apply() on grouping columns is deprecated in recent
    # pandas. An empty frame also cleanly yields an empty dict.
    # NOTE(review): groupby drops rows with a missing value in any key column
    # by default - confirm that is intended.
    return {
        key: list(zip(*(group[column] for column in value_columns)))
        for key, group in items_df.groupby(group_keys)
    }
|
| 52 |
+
|
| 53 |
+
def _score_candidates(product_name, candidates, name_threshold):
    """Fuzzy-match a product name against candidate item tuples.

    Each candidate is ``(id, name, norm_name, volume, new_type_wine, sour, year)``;
    the comparison uses ``norm_name`` and the output reports the original ``name``.
    """
    norm_product_name = normalize_name(product_name)
    hits = process.extract(
        norm_product_name,
        [candidate[2] for candidate in candidates],
        scorer=fuzz.ratio,
        score_cutoff=name_threshold,
    )

    matched = []
    for _, score, candidate_pos in hits:
        item_id, item_name, _norm_name, volume, type_wine, sour, year = candidates[candidate_pos]
        matched.append({
            'item_id': item_id,
            'item_name': item_name,
            'score': score,
            'volume': volume,
            'color': type_wine,
            'sour': sour,
            'year': year,
        })
    return matched


def new_find_matches_with_ids(products_df, items_groups, items_df, name_threshold=85):
    """
    Find matching items for every product, keeping the ids of the items found.

    Two passes are made:
    - first: search inside the group with the same
      (brand, type, volume, new_type_wine, sour);
    - second: for products without any match, search the alternative group
      (new_type_wine, new_type, volume, sour), excluding items of the
      product's own brand.

    Names are compared through their normalised form; the original item name
    is what gets reported.

    :param products_df: DataFrame with columns 'id', 'brand', 'type', 'name',
        'volume', 'new_type_wine', 'sour', 'new_type'.
    :param items_groups: Dict produced by prepare_groups_with_ids.
    :param items_df: Items DataFrame with columns 'id', 'new_brand',
        'new_type_wine', 'new_type', 'volume', 'name', 'sour', 'year'.
    :param name_threshold: Similarity cut-off for the fuzzy matching.
    :return: ``products_df`` merged with the columns 'matched_items' (list of
        matches) and 'alternative' (list of alternative matches).
    """
    results = []
    no_match_products = []  # (position in results, product) pairs without a first-pass match

    # First pass: search by group (brand, type, volume, new_type_wine, sour).
    rows = tqdm(products_df.iterrows(), total=len(products_df))
    for position, (_, product) in enumerate(rows):
        key = (
            product['brand'],
            product['type'],
            product['volume'],
            product['new_type_wine'],
            product['sour'],
        )
        matched_items = _score_candidates(
            product['name'], items_groups.get(key, []), name_threshold
        )

        if not matched_items:
            # Remember the position in `results`, not the DataFrame index
            # label: the label is only a valid list index when the frame has
            # a default 0..n-1 index.
            no_match_products.append((position, product))

        results.append({
            'product_id': product['id'],
            'matched_items': matched_items,
            'alternative': []  # filled in by the second pass
        })

    if no_match_products:
        # Alternative grouping by (new_type_wine, new_type, volume, sour);
        # built only when some product actually needs it.
        groups_by_alternative_keys = prepare_groups_by_alternative_keys(items_df)

        # Second pass: products without matches are searched in the alternative groups.
        for position, product in tqdm(no_match_products):
            product_brand = product['brand']
            alt_key = (
                product['new_type_wine'],
                product['new_type'],
                product['volume'],
                product['sour'],
            )
            # Drop items of the product's own brand, then remove the brand
            # field so the tuples have the layout _score_candidates expects.
            other_brand_items = [
                (item[0],) + tuple(item[2:])
                for item in groups_by_alternative_keys.get(alt_key, [])
                if item[1] != product_brand
            ]
            results[position]['alternative'] = _score_candidates(
                product['name'], other_brand_items, name_threshold
            )

    results_df = pd.DataFrame(results)
    merged_df = products_df.merge(results_df, left_on='id', right_on='product_id').drop(columns=['product_id'])
    return merged_df
|
processor/processor.py
CHANGED
|
@@ -1,30 +1,32 @@
|
|
| 1 |
-
from preprocess.preprocess import Preprocessor
|
| 2 |
-
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
class Processor():
|
| 6 |
-
def __init__(self, long_types_list, short_types_list, sour_list,
|
| 7 |
-
type_wine, gbs, colors_for_trim, grapes, other_words,
|
| 8 |
-
sour_merge_dict, type_merge_dict, color_merge_dict
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
|
|
|
|
|
| 1 |
+
from preprocess.preprocess import Preprocessor
|
| 2 |
+
from processor.matching import prepare_groups_with_ids,new_find_matches_with_ids
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class Processor():
    """Runs preprocessing and then fuzzy matching between products and items."""

    def __init__(self, long_types_list, short_types_list, sour_list,
                 type_wine, gbs, colors_for_trim, grapes, other_words,
                 sour_merge_dict, type_merge_dict, color_merge_dict):
        """Create the underlying ``Preprocessor``; all arguments are forwarded to it unchanged."""
        self.preprocessor = Preprocessor(
            long_types_list, short_types_list, sour_list,
            type_wine, gbs, colors_for_trim, grapes, other_words,
            sour_merge_dict, type_merge_dict, color_merge_dict,
        )

    def process(self, products, items, is_items_first=False, th=65):
        """Preprocess both frames and match them against each other.

        :param products: Products DataFrame.
        :param items: Items DataFrame.
        :param is_items_first: When true, items are the rows being matched and
            products are the candidates; otherwise the roles are reversed.
        :param th: Name-similarity threshold forwarded to the matcher.
        :return: Tuple ``(matches, items, products)`` where ``matches`` is the
            matcher output without the 'type', 'type_wine', 'alco' and 'gb'
            columns, and the two frames are the preprocessed inputs.
        """
        items, products = self.preprocessor.process(products, items)

        print('-----*-----Matching-----*-----')

        if is_items_first:
            # Mirror the brand columns so each frame can play the other's role.
            products['new_brand'] = products['brand']
            items['brand'] = items['new_brand']
            queries, candidates = items, products
        else:
            queries, candidates = products, items

        candidate_groups = prepare_groups_with_ids(candidates)
        res = new_find_matches_with_ids(queries, candidate_groups, candidates, name_threshold=th)

        dropped_columns = ['type', 'type_wine', 'alco', 'gb']
        return res.drop(dropped_columns, axis=1), items, products
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
|
requirements.txt
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
python-Levenshtein
|
| 2 |
-
transliterate
|
| 3 |
-
rapidfuzz
|
| 4 |
-
pyahocorasick
|
| 5 |
-
unidecode
|
| 6 |
-
pqdm
|
| 7 |
tqdm
|
|
|
|
| 1 |
+
python-Levenshtein
|
| 2 |
+
transliterate
|
| 3 |
+
rapidfuzz
|
| 4 |
+
pyahocorasick
|
| 5 |
+
unidecode
|
| 6 |
+
pqdm
|
| 7 |
tqdm
|
search/matching_judge.py
DELETED
|
@@ -1,156 +0,0 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import ast
|
| 4 |
-
import csv
|
| 5 |
-
|
| 6 |
-
def verify_csv(csv_file):
    """Copy the header and every line with unbalanced quotes to ``<csv_file>.1``.

    The first line of ``csv_file`` is always written to the side file; after
    that every line containing an odd number of double quotes is written as
    well (a first line with an odd count therefore appears twice).

    NOTE(review): the nesting of the original statements was reconstructed
    from a listing without indentation - confirm it against the repository.

    :param csv_file: Path of the UTF-8 encoded CSV file to check.
    :return: Always ``True``.
    """
    # Both handles live in one `with`, so the output file is closed even when
    # reading or writing fails part-way through.
    with open(csv_file, "r", encoding="utf-8") as source, \
            open(csv_file + ".1", "w", encoding="utf-8") as flagged:
        for line_number, line in enumerate(source, start=1):
            if line_number == 1:
                flagged.write(line)

            if line.count('"') % 2 == 1:
                flagged.write(line)

    return True
|
| 27 |
-
|
| 28 |
-
def _first_column_count(df):
    """Return the number of non-null values in the first column of *df*."""
    return int(df.count().iloc[0])


def _find_auto_match(match_df, item_id):
    """Return the raw ``matched_items`` text for *item_id*, or None if absent."""
    cells = match_df.loc[match_df["id"] == item_id, "matched_items"]
    if cells.empty:
        return None
    cell = cells.iloc[0]
    # An empty CSV cell is read as NaN (a float), which has no length or text.
    return cell if isinstance(cell, str) else None


def _find_manual_product(products_df, manual_df, item_id):
    """Return the product rows manually confirmed for *item_id*, or None."""
    states = manual_df.loc[manual_df["item_id"] == item_id, "state"]
    if states.empty or states.iloc[0] != 1:
        return None

    product_id = manual_df.loc[states.index[0], "product_id"]
    found = products_df[products_df["id"] == product_id]
    if found.empty:
        print("Manually matched product id=" + str(product_id) + " for item=" + str(item_id) + " not found")
        return None
    return found


def _manual_match_json(product):
    """Serialize the first row of *product* as a JSON object string."""
    return json.dumps(
        {
            "id": int(product["id"].values[0]),
            "brand": str(product["brand"].values[0]),
            "name": str(product["name_long"].values[0]),
            "volume": str(product["volume"].values[0]),
            "year": str(product["year"].values[0]),
        },
        ensure_ascii=False,
    )


def compare_matching_with_manual(products_file, items_file, match_result_file, manual_result_file,
                                 output_file="C:\\Projects (Mediterra)\\!TechLead\\WineMatching\\Data (New4)\\mjudge_new.csv"):
    """Compare automatic matching results with manually confirmed matches.

    For every item the function records which side produced a match
    (``no_match``, ``both``, ``only_auto`` or ``only_manual``) and, when both
    sides matched, a score: 1 when the manually chosen product is the first
    ``item_id`` in the automatic result, 0.5 when it appears later, and an
    empty string when it does not appear at all. The per-item rows are
    written to *output_file* as CSV.

    Args:
        products_file: Tab-separated products CSV (columns used: ``id``,
            ``brand``, ``name_long``, ``volume``, ``year``).
        items_file: Semicolon-separated items CSV (columns used: ``id``,
            ``attrs``).
        match_result_file: Tab-separated automatic result (columns used:
            ``id``, ``matched_items``).
        manual_result_file: Tab-separated manual result (columns used:
            ``item_id``, ``product_id``, ``state``).
        output_file: Destination of the per-item report. The default is the
            path that used to be hard-coded in this function.

    Returns:
        A dict with ``item_count``, ``product_count``, ``match_count`` and
        ``manual_count`` (non-null values in the first column of each file).

    Raises:
        Exception: If ``verify_csv`` rejects the products file.
    """
    if not verify_csv(products_file):
        raise Exception("Products file failed CSV verification: " + str(products_file))

    products_df = pd.read_csv(products_file, sep="\t")
    items_df = pd.read_csv(items_file, sep=";")
    match_df = pd.read_csv(match_result_file, sep="\t")
    manual_df = pd.read_csv(manual_result_file, sep="\t")

    results = {
        "item_count": _first_column_count(items_df),
        "product_count": _first_column_count(products_df),
        "match_count": _first_column_count(match_df),
        "manual_count": _first_column_count(manual_df),
    }

    result_list = []

    for _, row in items_df.iterrows():
        item_id = row["id"]
        result_data = {
            "id": item_id,
            "match_side": "no_match",
            "auto_score": "",
            "manual_score": "",
            "discuss": "",
        }

        auto_match = _find_auto_match(match_df, item_id)
        manual_match = _find_manual_product(products_df, manual_df, item_id)

        # The serialized list "[]" has length 2, so anything longer holds at
        # least one candidate.
        has_auto_match = auto_match is not None and len(auto_match) > 2

        if has_auto_match and manual_match is not None:
            result_data["match_side"] = "both"

            manual_id = int(manual_match["id"].values[0])
            compact = auto_match.replace(" ", "")
            first_any = compact.find("'item_id':")
            first_manual = compact.find("'item_id':" + str(manual_id) + ",")

            # Require an actual hit: two failed searches both return -1 and
            # must not be scored as a perfect match.
            if first_manual >= 0 and first_manual == first_any:
                result_data["auto_score"] = 1
                result_data["manual_score"] = 1
            elif first_manual >= 0:
                result_data["auto_score"] = 0.5
                result_data["manual_score"] = 0.5
        elif has_auto_match:
            result_data["match_side"] = "only_auto"
        elif manual_match is not None:
            result_data["match_side"] = "only_manual"

        result_data["item"] = row["attrs"]
        result_data["auto_match"] = auto_match
        result_data["manual_match"] = (
            _manual_match_json(manual_match) if manual_match is not None else ""
        )
        result_list.append(result_data)

    pd.DataFrame(result_list).to_csv(output_file)

    print(results)

    return results
|
| 156 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
search/search_by_id.py
CHANGED
|
@@ -1,53 +1,24 @@
|
|
| 1 |
-
import json
|
| 2 |
-
import pandas as pd
|
| 3 |
-
import ast
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
class Searcher():
|
| 7 |
-
def __init__(self):
|
| 8 |
-
self.df = None
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
def
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
def search(self, resultfn, query):
|
| 26 |
-
is_alternative_items = False
|
| 27 |
-
df_matched_items = pd.DataFrame()
|
| 28 |
-
|
| 29 |
-
matching_result = pd.read_csv(resultfn, sep='\t', on_bad_lines='skip')
|
| 30 |
-
self.set_df(matching_result)
|
| 31 |
-
|
| 32 |
-
items = self.df[self.df['id']==query]
|
| 33 |
-
matched_items = items['matched_items']
|
| 34 |
-
if (len(matched_items) != 0) and (len(matched_items.values[0])):
|
| 35 |
-
data = json.loads(json.dumps(matched_items.values[0]))
|
| 36 |
-
df_matched_items = pd.DataFrame(data)
|
| 37 |
-
is_alternative_items = False
|
| 38 |
-
else:
|
| 39 |
-
alter_items = items['alternative']
|
| 40 |
-
|
| 41 |
-
if (len(alter_items) != 0) and (len(alter_items.values[0])):
|
| 42 |
-
data = json.loads(json.dumps(alter_items.values[0]))
|
| 43 |
-
df_matched_items = pd.DataFrame(data)
|
| 44 |
-
is_alternative_items = True
|
| 45 |
-
|
| 46 |
-
return (df_matched_items, is_alternative_items)
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
def search_in_uploaded_file(self, path, query):
|
| 50 |
-
matching_result=pd.read_csv(path, sep='\t', on_bad_lines='skip')
|
| 51 |
-
self.set_df(matching_result)
|
| 52 |
-
result=self.search(query)
|
| 53 |
return result
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import ast
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class Searcher():
    """Look up the matches stored for one id in a matching-result table."""

    def __init__(self):
        # Table with "id" and "matched_items" columns; installed by set_df().
        self.df = None

    @staticmethod
    def _parse_cell(cell):
        """Turn the text form of one ``matched_items`` cell into a Python value.

        Non-string cells (already parsed lists, ``None``, NaN) are returned
        unchanged. A string that is not a valid Python literal is reported
        and also returned unchanged, so one bad cell cannot prevent the rest
        of the column from being parsed.
        """
        if not isinstance(cell, str):
            return cell
        try:
            return ast.literal_eval(cell)
        except (ValueError, SyntaxError) as e:
            print(e)
            return cell

    def set_df(self, df):
        """Install *df* as the search table and parse ``matched_items`` in place.

        Args:
            df: DataFrame holding the matching result. Its ``matched_items``
                column, when present, is replaced by the parsed values.
        """
        self.df = df
        if df is None or 'matched_items' not in df.columns:
            return
        self.df['matched_items'] = self.df['matched_items'].apply(self._parse_cell)

    def search(self, query):
        """Return the matches stored for the row whose ``id`` equals *query*.

        Args:
            query: Value compared against the ``id`` column.

        Returns:
            A DataFrame built from the row's ``matched_items`` value. It is
            empty when no table is loaded, the id is unknown, or the cell
            holds no parsed matches.
        """
        if self.df is None:
            return pd.DataFrame()

        cells = self.df.loc[self.df['id'] == query, 'matched_items']
        if cells.empty:
            return pd.DataFrame()

        matches = cells.iloc[0]
        if isinstance(matches, dict):
            matches = [matches]
        if not isinstance(matches, (list, tuple)):
            # None, NaN or text that could not be parsed.
            return pd.DataFrame()
        return pd.DataFrame(list(matches))

    def search_in_uploaded_file(self, path, query):
        """Load a tab-separated result file from *path* and search it.

        Args:
            path: Path of the CSV file with the matching result.
            query: Value compared against the ``id`` column.

        Returns:
            The DataFrame produced by :meth:`search`.
        """
        matching_result = pd.read_csv(path, sep='\t', on_bad_lines='skip')
        self.set_df(matching_result)
        return self.search(query)
|
tmp/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/service/prod.csv
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
|
|
|
| 1 |
+
id product_type brand category type_prefix name name_postfix name_long name_translit price year volume
|
tmp/utils.py
CHANGED
|
@@ -1,48 +1,37 @@
|
|
| 1 |
-
import pandas as pd
|
| 2 |
-
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
-
import
|
| 4 |
-
import
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
new_path = shutil.move(new_path, main_dir)
|
| 39 |
-
return new_path
|
| 40 |
-
else:
|
| 41 |
-
update_products_csv(main_path, new_path)
|
| 42 |
-
return main_path
|
| 43 |
-
|
| 44 |
-
def remover(data_path):
|
| 45 |
-
#path=is_csv_exist('/home/user/app/tmp/prod.csv')
|
| 46 |
-
#if path!=None:
|
| 47 |
-
os.remove(os.getcwd()+'/tmp/prod.csv')
|
| 48 |
-
shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')'''
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 3 |
+
from glob import glob
|
| 4 |
+
import shutil
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def update_products_csv(new_csv_path, main_csv_path='/home/user/app/tmp/prod.csv'):
    """Merge the rows of a new products CSV into the main products CSV.

    Both files are read with the delimiter reported by ``get_delimiter``.
    Rows are de-duplicated on ``id``; when an id occurs in both files the row
    from *new_csv_path* is kept. The result overwrites *main_csv_path* using
    the main file's delimiter.

    Args:
        new_csv_path: Path of the CSV with the rows to add or update.
        main_csv_path: Path of the CSV that is updated in place.
    """
    main_delimiter = get_delimiter(main_csv_path)
    frames = [
        pd.read_csv(main_csv_path, sep=main_delimiter),
        pd.read_csv(new_csv_path, sep=get_delimiter(new_csv_path)),
    ]

    merged = pd.concat(frames)
    # The new file comes last in the concatenation, so keep='last' lets its
    # rows win.
    merged = merged.drop_duplicates(subset='id', keep='last')
    merged = merged.reset_index(drop=True)
    merged.to_csv(main_csv_path, sep=main_delimiter, index=False)
|
| 15 |
+
|
| 16 |
+
def is_csv_exist(path):
    """Return the path of a CSV file located directly in directory *path*.

    Args:
        path: Directory to look in.

    Returns:
        The first ``*.csv`` path in sorted order, or ``None`` when the
        directory contains no CSV file (or does not exist).
    """
    # Sorting makes the choice deterministic; glob order is arbitrary.
    file_list = sorted(glob(os.path.join(path, '*.csv')))
    if file_list:
        return file_list[0]
    return None
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def uploader(new_path, main_dir='/home/user/app/tmp/prod.csv'):
    """Store an uploaded products CSV, merging it into an existing catalogue.

    Args:
        new_path: Path of the uploaded CSV file.
        main_dir: Either the catalogue file itself or a directory that may
            contain a catalogue CSV.

    Returns:
        The path of the catalogue after the upload: the existing catalogue
        when one was found and updated, otherwise the location the uploaded
        file was moved to.
    """
    # The default names a file, so check for the file before treating the
    # argument as a directory to scan.
    if os.path.isfile(main_dir):
        main_path = main_dir
    else:
        main_path = is_csv_exist(main_dir)

    if main_path is None:
        return shutil.move(new_path, main_dir)

    # update_products_csv takes (new_csv_path, main_csv_path): the uploaded
    # rows are merged INTO the existing catalogue.
    update_products_csv(new_path, main_path)
    return main_path
|
| 32 |
+
|
| 33 |
+
def remover():
    """Reset the products catalogue to the copy kept in ``tmp/service``.

    Deletes ``tmp/prod.csv`` under the current working directory (if it
    exists) and then copies ``/home/user/app/tmp/service/prod.csv`` over
    ``/home/user/app/tmp/prod.csv``.
    """
    try:
        os.remove(os.getcwd()+'/tmp/prod.csv')
    except FileNotFoundError:
        # Nothing to delete; the catalogue must still be restored below.
        pass
    shutil.copy2('/home/user/app/tmp/service/prod.csv', '/home/user/app/tmp/prod.csv')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ui/gradio_ui.py
CHANGED
|
@@ -1,178 +1,121 @@
|
|
| 1 |
-
import gradio as gr
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
from
|
| 5 |
-
import
|
| 6 |
-
import
|
| 7 |
-
import
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
class GradioUI():
|
| 11 |
-
|
| 12 |
-
def __init__(self, processor, searcher
|
| 13 |
-
self.processor=processor
|
| 14 |
-
self.searcher=searcher
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
#
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
#
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
toggle_alternative = gr.Checkbox(label="Включать в результаты альтернативные варианты", value=True)
|
| 122 |
-
|
| 123 |
-
threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
|
| 124 |
-
process_button = gr.Button("Загрузить файл с каталогом и сравнить")
|
| 125 |
-
output_file = gr.File(label="Скачать результат (CSV)")
|
| 126 |
-
process_button.click(
|
| 127 |
-
fn=self.process_items,
|
| 128 |
-
inputs=[file_items, toggle_input, threshold_input, toggle_alternative], #, search_number],
|
| 129 |
-
outputs=output_file
|
| 130 |
-
)
|
| 131 |
-
|
| 132 |
-
with gr.TabItem("Загрузка файла Products"):
|
| 133 |
-
with gr.Row():
|
| 134 |
-
prod_file_info1 = gr.Markdown("## Загрузка файла Products")
|
| 135 |
-
product_download_button = gr.DownloadButton(label="Скачать", value=os.path.join(self.get_products_dir(), "products.csv"), visible=True)
|
| 136 |
-
with gr.Row():
|
| 137 |
-
file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
|
| 138 |
-
with gr.Row():
|
| 139 |
-
toggle_input = gr.Checkbox(label="Перезаписать существующий файл Product", value=False)
|
| 140 |
-
upload_button = gr.Button("Загрузить файл")
|
| 141 |
-
upload_button.click(
|
| 142 |
-
fn=self.upload_products_file,
|
| 143 |
-
inputs=[file_input1, toggle_input],
|
| 144 |
-
#outputs=output_file
|
| 145 |
-
)
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
# Вкладка для поиска
|
| 149 |
-
with gr.TabItem("Поиск в обработанном csv"):
|
| 150 |
-
gr.Markdown("## Поиск")
|
| 151 |
-
search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
|
| 152 |
-
search_button = gr.Button("Найти")
|
| 153 |
-
search_table = gr.Dataframe(label="Результаты поиска")
|
| 154 |
-
search_button.click(
|
| 155 |
-
fn=self.searcher.search,
|
| 156 |
-
inputs=[search_number],
|
| 157 |
-
outputs=search_table
|
| 158 |
-
)
|
| 159 |
-
|
| 160 |
-
with gr.TabItem("Загрузка результат и поиск в нем"):
|
| 161 |
-
gr.Markdown("## Поиск")
|
| 162 |
-
with gr.Row():
|
| 163 |
-
input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
|
| 164 |
-
search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
|
| 165 |
-
search_button = gr.Button("Найти")
|
| 166 |
-
search_table = gr.Dataframe(label="Результаты поиска")
|
| 167 |
-
search_button.click(
|
| 168 |
-
fn=self.searcher.search_in_uploaded_file,
|
| 169 |
-
inputs=[input_path, search_number],
|
| 170 |
-
outputs=search_table
|
| 171 |
-
)
|
| 172 |
-
|
| 173 |
-
#with gr.TabItem("Удалить сохраненные продукты"):
|
| 174 |
-
# del_button = gr.Button("Удалить")
|
| 175 |
-
# process_button.click(fn=remover)
|
| 176 |
-
|
| 177 |
-
demo.load(fn=self.on_page_load, inputs=None, outputs=[prod_file_info1, prod_file_info2])
|
| 178 |
demo.launch()
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import tempfile
|
| 4 |
+
from preprocess.utils.common.utils import get_delimiter
|
| 5 |
+
from tmp.utils import uploader, remover, update_products_csv
|
| 6 |
+
from glob import glob
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class GradioUI():
    """Gradio front end: match two CSV files and browse the matching result."""

    def __init__(self, processor, searcher=None):
        """Keep the collaborators used by the UI callbacks.

        Args:
            processor: Object whose ``process(products, items,
                is_items_first, threshold)`` returns ``(df, items, products)``.
            searcher: Optional object with ``set_df``, ``search`` and
                ``search_in_uploaded_file``. Without it the search tabs are
                not built.
        """
        self.processor=processor
        self.searcher=searcher

    def process_files(self, file1, file2, is_items_first, threshold): #, q_id):
        """Merge uploaded products into the catalogue and run the matching.

        Args:
            file1: Path of an uploaded products CSV, or ``None`` to use the
                stored catalogue unchanged.
            file2: Path of the uploaded items CSV (required).
            is_items_first: Forwarded to ``processor.process``.
            threshold: Forwarded to ``processor.process``.

        Returns:
            Path of a temporary tab-separated CSV holding the result.

        Raises:
            gr.Error: On any failure, carrying the original error text.
        """
        try:
            print(file1)

            print()
            print(os.getcwd())
            print(os.path.dirname(os.path.abspath(__file__)))
            print()

            if file1 is not None:
                update_products_csv(file1)
            # Matching always runs against the stored catalogue.
            file1=os.getcwd()+'/tmp/prod.csv'

            if file2 is None:
                # Without this check the failure would surface later as an
                # unrelated NameError.
                raise ValueError("Items file was not provided")

            items_delimiter=get_delimiter(file2)
            print('items delimiter: '+items_delimiter)
            row_items=pd.read_csv(file2, sep=items_delimiter, on_bad_lines='skip')

            products_delimiter=get_delimiter(file1)
            print('products delimiter: '+products_delimiter)
            row_products=pd.read_csv(file1, sep=products_delimiter, on_bad_lines='skip')

            df, items, products= self.processor.process(row_products, row_items, is_items_first, threshold)

            if self.searcher is not None:
                self.searcher.set_df(df.copy())

            # Create a temporary CSV file for the result. It is written after
            # the handle is closed so the path is not held open while pandas
            # writes to it.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as tmp:
                output_csv = tmp.name
            df.to_csv(output_csv, sep='\t', index=False)
            return output_csv
        except Exception as ex:
            raise gr.Error("An error occurred 💥!"+"\n\n"+str(ex), duration=5) from ex

    def run_ui(self):
        """Build the tabbed interface and launch the Gradio app."""
        with gr.Blocks() as demo:
            with gr.Tabs():

                # Tab for processing CSV files
                with gr.TabItem("Обработка CSV файлов"):
                    gr.Markdown("## Обработка CSV файлов")
                    with gr.Row():
                        file_input1 = gr.File(label="Products", type="filepath", file_types=[".csv"])
                        file_input2 = gr.File(label="Items", type="filepath", file_types=[".csv"])
                    with gr.Row():
                        toggle_input = gr.Checkbox(label="Инвертировать поиск", value=False)
                    threshold_input = gr.Slider(minimum=0, maximum=100, step=1, label="Threshold", value=50)
                    process_button = gr.Button("Обработать файлы")
                    output_file = gr.File(label="Скачать результат (CSV)")
                    process_button.click(
                        fn=self.process_files,
                        inputs=[file_input1, file_input2, toggle_input, threshold_input],
                        outputs=output_file
                    )

                # The search tabs call methods of the searcher, so they can
                # only be built when one was supplied.
                if self.searcher is not None:
                    # Tab for searching the last processed result
                    with gr.TabItem("Поиск в обработанном csv"):
                        gr.Markdown("## Поиск")
                        search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                        search_button = gr.Button("Найти")
                        search_table = gr.Dataframe(label="Результаты поиска")
                        search_button.click(
                            fn=self.searcher.search,
                            inputs=[search_number],
                            outputs=search_table
                        )

                    # Tab for uploading a result file and searching in it
                    with gr.TabItem("Загрузка результат и поиск в нем"):
                        gr.Markdown("## Поиск")
                        with gr.Row():
                            input_path = gr.File(label="Matching result", type="filepath", file_types=[".csv"])
                            search_number = gr.Number(label="Введите ID продукта", value=0, precision=0)
                        search_button = gr.Button("Найти")
                        search_table = gr.Dataframe(label="Результаты поиска")
                        search_button.click(
                            fn=self.searcher.search_in_uploaded_file,
                            inputs=[input_path, search_number],
                            outputs=search_table
                        )

                # Tab for resetting the stored products catalogue
                with gr.TabItem("Удалить сохраненные продукты"):
                    del_button = gr.Button("Удалить")
                    # Bind the reset to its own button; binding it to
                    # process_button would wipe the catalogue on every
                    # processing run.
                    del_button.click(fn=remover)

        demo.launch()
|