Spaces:
Sleeping
Sleeping
delete fuzzy
Browse files
app.py
CHANGED
|
@@ -29,10 +29,6 @@ TABLE_NAME = "Feedback_search"
|
|
| 29 |
api = Api(AIRTABLE_API_KEY)
|
| 30 |
table = api.table(BASE_ID, TABLE_NAME)
|
| 31 |
|
| 32 |
-
# Load whitelist
|
| 33 |
-
with open("keyword_whitelist.pkl", "rb") as f:
|
| 34 |
-
keyword_whitelist = pickle.load(f)
|
| 35 |
-
|
| 36 |
# Preload Models
|
| 37 |
model = SentenceTransformer("BAAI/bge-m3")
|
| 38 |
collection_name = "product_bge-m3"
|
|
@@ -43,53 +39,13 @@ reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True)
|
|
| 43 |
# Utils
|
| 44 |
def is_non_thai(text):
|
| 45 |
return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
|
| 46 |
-
|
| 47 |
-
def join_corrected_tokens(corrected: list) -> str:
|
| 48 |
-
if corrected and is_non_thai("".join(corrected)):
|
| 49 |
-
return " ".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
|
| 50 |
-
else:
|
| 51 |
-
return "".join([w for w in corrected if len(w) > 1 or w in keyword_whitelist])
|
| 52 |
-
|
| 53 |
def normalize(text: str) -> str:
|
| 54 |
if is_non_thai(text):
|
| 55 |
return text.strip()
|
| 56 |
text = unicodedata.normalize("NFC", text)
|
| 57 |
return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
|
| 58 |
|
| 59 |
-
def smart_tokenize(text: str) -> list:
|
| 60 |
-
tokens = word_tokenize(text.strip(), engine="newmm")
|
| 61 |
-
return tokens if tokens and len("".join(tokens)) >= len(text.strip()) * 0.5 else [text.strip()]
|
| 62 |
-
|
| 63 |
-
def correct_query_merge_phrases(query: str, whitelist, threshold=80, max_ngram=3):
|
| 64 |
-
query_norm = normalize(query)
|
| 65 |
-
tokens = smart_tokenize(query_norm)
|
| 66 |
-
corrected = []
|
| 67 |
-
i = 0
|
| 68 |
-
while i < len(tokens):
|
| 69 |
-
matched = False
|
| 70 |
-
for n in range(min(max_ngram, len(tokens) - i), 0, -1):
|
| 71 |
-
phrase = "".join(tokens[i:i+n])
|
| 72 |
-
if phrase in whitelist:
|
| 73 |
-
corrected.append(phrase)
|
| 74 |
-
i += n
|
| 75 |
-
matched = True
|
| 76 |
-
break
|
| 77 |
-
match, score, _ = process.extractOne(
|
| 78 |
-
phrase,
|
| 79 |
-
whitelist,
|
| 80 |
-
scorer=fuzz.token_sort_ratio,
|
| 81 |
-
processor=lambda x: x.lower()
|
| 82 |
-
)
|
| 83 |
-
if score >= threshold:
|
| 84 |
-
corrected.append(match)
|
| 85 |
-
i += n
|
| 86 |
-
matched = True
|
| 87 |
-
break
|
| 88 |
-
if not matched:
|
| 89 |
-
corrected.append(tokens[i])
|
| 90 |
-
i += 1
|
| 91 |
-
return join_corrected_tokens(corrected)
|
| 92 |
-
|
| 93 |
# Global state
|
| 94 |
latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
|
| 95 |
|
|
@@ -100,7 +56,7 @@ def search_product(query):
|
|
| 100 |
start_time = time.time()
|
| 101 |
latest_query_result["raw_query"] = query
|
| 102 |
|
| 103 |
-
corrected_query = correct_query_merge_phrases(query, keyword_whitelist)
|
| 104 |
query_embed = model.encode(corrected_query)
|
| 105 |
|
| 106 |
try:
|
|
|
|
| 29 |
api = Api(AIRTABLE_API_KEY)
|
| 30 |
table = api.table(BASE_ID, TABLE_NAME)
|
| 31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
# Preload Models
|
| 33 |
model = SentenceTransformer("BAAI/bge-m3")
|
| 34 |
collection_name = "product_bge-m3"
|
|
|
|
| 39 |
# Utils
|
| 40 |
def is_non_thai(text):
|
| 41 |
return re.match(r'^[A-Za-z0-9&\-\s]+$', text) is not None
|
| 42 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
def normalize(text: str) -> str:
|
| 44 |
if is_non_thai(text):
|
| 45 |
return text.strip()
|
| 46 |
text = unicodedata.normalize("NFC", text)
|
| 47 |
return text.replace("เแ", "แ").replace("เเ", "แ").strip().lower()
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
# Global state
|
| 50 |
latest_query_result = {"query": "", "result": "", "raw_query": "", "time": ""}
|
| 51 |
|
|
|
|
| 56 |
start_time = time.time()
|
| 57 |
latest_query_result["raw_query"] = query
|
| 58 |
|
| 59 |
+
corrected_query = normalize(query)
|
| 60 |
query_embed = model.encode(corrected_query)
|
| 61 |
|
| 62 |
try:
|