stephenhoang committed on
Commit
5afc7ff
·
verified ·
1 Parent(s): e43c8b1

Upload 5 files

Browse files
Files changed (5) hide show
  1. app.py +184 -0
  2. bm25_model.pkl +3 -0
  3. df_products.pkl +3 -0
  4. requirements.txt +6 -0
  5. sbert_embeddings.npy +3 -0
app.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import numpy as np
5
+ import pickle
6
+ import faiss
7
+ import re
8
+ from rank_bm25 import BM25Okapi
9
+ from sentence_transformers import SentenceTransformer
10
+
11
# ============================================
# 1. UI SETUP & CONFIG
# ============================================
# Custom CSS injected into the page: page background, full-width red
# buttons, and a white rounded "metric-card" style.
_PAGE_CSS = """
<style>
.main {background-color: #f5f5f5;}
.stButton>button {width: 100%; background-color: #ff4b4b; color: white;}
.metric-card {background-color: white; padding: 15px; border-radius: 10px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);}
</style>
"""

st.set_page_config(
    page_title="H&M Semantic Search",
    page_icon="🛍️",
    layout="wide",
)

st.markdown(_PAGE_CSS, unsafe_allow_html=True)
23
+
24
# ============================================
# 2. LOAD MODELS (CACHED SO RERUNS STAY FAST)
# ============================================
@st.cache_resource
def load_models():
    """Load the search artifacts and the SBERT query encoder.

    The three artifact files are looked up first in ``models_best/`` (the
    original location) and then in the current working directory, so the app
    works both when the artifacts sit in that folder and when they are shipped
    next to ``app.py``.

    Returns:
        Tuple ``(df, bm25, embeddings, sbert_model)``: the unpickled product
        table, the unpickled BM25 model, the array loaded from
        ``sbert_embeddings.npy`` and a ``SentenceTransformer`` instance for
        ``all-MiniLM-L6-v2``.

    Raises:
        FileNotFoundError: If no candidate directory contains all three
            artifact files.
    """
    from pathlib import Path

    required_files = ("df_products.pkl", "bm25_model.pkl", "sbert_embeddings.npy")
    # Preferred location first; fall back to the directory the app runs from.
    candidate_dirs = (Path("models_best"), Path("."))
    model_dir = next(
        (
            directory
            for directory in candidate_dirs
            if all((directory / name).is_file() for name in required_files)
        ),
        None,
    )
    if model_dir is None:
        searched = ", ".join(f"'{directory}'" for directory in candidate_dirs)
        raise FileNotFoundError(
            f"Missing one of {required_files}; searched in: {searched}"
        )

    print("⏳ Loading Artifacts...")

    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # artifact files that were produced by this project.
    # Load DataFrame
    with open(model_dir / 'df_products.pkl', 'rb') as f:
        df = pickle.load(f)

    # Load BM25
    with open(model_dir / 'bm25_model.pkl', 'rb') as f:
        bm25 = pickle.load(f)

    # Load Embeddings
    embeddings = np.load(model_dir / 'sbert_embeddings.npy')

    # Load SBERT model (needed to encode the user's query at search time)
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

    return df, bm25, embeddings, sbert_model
49
+
50
@st.cache_resource
def _build_faiss_index(_embeddings):
    """Build (once per process) an inner-product FAISS index over the embeddings.

    The parameter name starts with an underscore so Streamlit does not try to
    hash the large array when computing the cache key; the index is therefore
    built on the first run and reused on every later script rerun.

    Args:
        _embeddings: 2-D array of document vectors, one row per document.

    Returns:
        A ``faiss.IndexFlatIP`` holding the L2-normalised vectors, so inner
        product equals cosine similarity.
    """
    # FAISS needs C-contiguous float32; this is a no-op view when the array
    # already satisfies that, in which case normalisation happens in place.
    vectors = np.ascontiguousarray(_embeddings, dtype='float32')
    faiss.normalize_L2(vectors)
    flat_index = faiss.IndexFlatIP(vectors.shape[1])
    flat_index.add(vectors)
    return flat_index


try:
    df, bm25, embeddings, sbert_model = load_models()

    # The index itself is not stored on disk; it is rebuilt from the
    # embeddings, but only once thanks to the cached helper above.
    index = _build_faiss_index(embeddings)

except Exception as e:
    # Top-level boundary: surface the failure in the UI and halt the script.
    st.error(f"❌ Không tìm thấy model ở thư mục 'models_best'. Lỗi: {e}")
    st.stop()
61
+
62
# ============================================
# 3. SEARCH ENGINE CLASS (HYBRID BM25 + SBERT)
# ============================================
class StreamlitSearchEngine:
    """Hybrid product search blending BM25 keyword scores with SBERT similarity.

    Assumes row ``i`` of ``df``, document ``i`` of ``bm25`` and vector ``i`` of
    ``index`` describe the same product -- TODO confirm against the code that
    produced the artifacts.
    """

    # Everything except lowercase letters, digits, whitespace, '-' and '%'
    # is replaced by a space before BM25 tokenisation.
    _NON_LEXICAL_CHARS = re.compile(r"[^a-z0-9\s\-\%]")

    def __init__(self, df, bm25, index, sbert_model):
        """Store the collaborators and the phrase-level synonym table.

        Args:
            df: Product table; rows are selected positionally via ``iloc``.
            bm25: Object exposing ``get_scores(tokens)``.
            index: Object exposing ``search(vectors, k)`` that returns
                ``(scores, ids)``.
            sbert_model: Object exposing ``encode(list_of_str)``.
        """
        self.df = df
        self.bm25 = bm25
        self.index = index
        self.sbert_model = sbert_model

        # Compact phrase -> synonyms dictionary used for query expansion.
        self.phrase_synonyms = {
            'running shoes': ['trainers', 'sneakers', 'runners', 'athletic footwear'],
            'running shoe': ['trainers', 'sneakers', 'runners'],
            'gym shoes': ['trainers', 'sneakers'],
            'joggers': ['sweatpants', 'track pants'],
            'denim jeans': ['blue jeans', 'denim'],
            'hoodie': ['sweatshirt', 'hooded'],
            'summer dress': ['sundress', 'floral dress'],
        }

    def _min_max_normalize(self, scores):
        """Rescale ``scores`` linearly to the range [0, 1].

        Args:
            scores: Array-like of numeric scores.

        Returns:
            A float array of the same shape. All zeros when every score is
            equal; an empty array when ``scores`` is empty.
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            # np.min / np.max raise on empty input; nothing to scale anyway.
            return scores
        min_s, max_s = np.min(scores), np.max(scores)
        if max_s - min_s == 0:
            return np.zeros_like(scores)
        return (scores - min_s) / (max_s - min_s)

    def _expand_query_phrase(self, query):
        """Lower-case ``query`` and append synonyms of any known phrase it contains.

        Phrases are matched as plain substrings of the lower-cased query.
        Synonyms are de-duplicated while keeping first-seen order, so the
        expanded query is identical from run to run.

        Args:
            query: The user's query; converted with ``str()``.

        Returns:
            The lower-cased query, followed by a space-separated list of
            synonyms when at least one phrase matched.
        """
        query_lower = str(query).lower()
        expansion_terms = []
        for phrase, synonyms in self.phrase_synonyms.items():
            if phrase in query_lower:
                expansion_terms.extend(synonyms)
        if not expansion_terms:
            return query_lower
        # dict.fromkeys removes duplicates but, unlike set(), keeps order.
        unique_terms = dict.fromkeys(expansion_terms)
        return query_lower + " " + " ".join(unique_terms)

    def search(self, query, top_k=10, alpha=0.5):
        """Rank products for ``query`` by a weighted mix of BM25 and SBERT scores.

        Args:
            query: Free-text query.
            top_k: Maximum number of rows to return; clamped to
                ``[0, len(self.df)]``.
            alpha: Weight of the normalised BM25 score; the normalised SBERT
                score gets ``1 - alpha``.

        Returns:
            Tuple ``(results, expanded_q)``. ``results`` is a copy of the
            best-scoring rows of ``self.df`` (best first) with added columns
            ``score``, ``bm25`` and ``sbert``; ``expanded_q`` is the expanded,
            lower-cased query that was used for the BM25 step.
        """
        n_docs = len(self.df)

        # 1. Expand
        expanded_q = self._expand_query_phrase(query)

        # 2. Lexical (BM25) on the expanded query
        q_lexical = self._NON_LEXICAL_CHARS.sub(" ", expanded_q).split()
        bm25_raw = self.bm25.get_scores(q_lexical)
        bm25_norm = self._min_max_normalize(bm25_raw)

        # 3. Semantic (SBERT) -- note this encodes the ORIGINAL query,
        #    not the expanded one.
        q_vec = self.sbert_model.encode([query]).astype('float32')
        faiss.normalize_L2(q_vec)
        D, I = self.index.search(q_vec, n_docs)

        # Scatter similarities back into document order. FAISS pads missing
        # hits with id -1, which would otherwise overwrite the last row.
        sbert_raw = np.zeros(n_docs)
        found = I[0] >= 0
        sbert_raw[I[0][found]] = D[0][found]
        sbert_norm = self._min_max_normalize(sbert_raw)

        # 4. Fusion
        final_scores = (alpha * bm25_norm) + ((1 - alpha) * sbert_norm)

        # 5. Result
        top_k = max(0, min(int(top_k), n_docs))
        top_indices = np.argsort(final_scores)[::-1][:top_k]
        results = self.df.iloc[top_indices].copy()

        results['score'] = final_scores[top_indices]
        results['bm25'] = bm25_norm[top_indices]
        results['sbert'] = sbert_norm[top_indices]

        return results, expanded_q
129
+
130
engine = StreamlitSearchEngine(df, bm25, index, sbert_model)

# ============================================
# 4. USER INTERFACE (UI)
# ============================================
st.title("🛍️ H&M AI Hybrid Search")
st.caption("Project Semantic Search - Demo")

with st.sidebar:
    st.header("⚙️ Cấu hình")
    alpha = st.slider("Trọng số Hybrid (Alpha)", 0.0, 1.0, 0.5, 0.1, help="0: Chỉ Semantic, 1: Chỉ Keyword")
    top_k = st.slider("Số lượng kết quả", 5, 20, 10)
    st.markdown("---")
    st.info("💡 **Mẹo:** Thử tìm *'Black running shoes'* để xem AI tự động hiểu là *'Sneakers'* như thế nào!")

# Search box
col1, col2 = st.columns([4, 1])
with col1:
    query = st.text_input("Nhập mô tả sản phẩm...", placeholder="Ví dụ: Black running shoes, Floral summer dress...")
with col2:
    # Presumably vertical padding so the button lines up with the text box.
    st.write("")
    st.write("")
    btn_search = st.button("🔍 Tìm kiếm")

has_query = bool(query.strip())

if btn_search and not has_query:
    # Clicking the button with nothing typed used to run a search on "".
    st.warning("⚠️ Vui lòng nhập mô tả sản phẩm để tìm kiếm.")
elif has_query:
    with st.spinner('AI đang phân tích & tìm kiếm...'):
        results, expanded_q = engine.search(query, top_k=top_k, alpha=alpha)

    # Debug panel: shows how the query was expanded before the BM25 step.
    with st.expander("🕵️‍♂️ Xem cơ chế hoạt động của AI (Debug Info)", expanded=True):
        st.write(f"**Query gốc:** `{query}`")
        if query.lower() != expanded_q:
            st.success(f"**✨ Query đã mở rộng (Expanded):** `{expanded_q}`")
            st.caption("👉 Hệ thống đã tự động thêm từ đồng nghĩa chuyên ngành để tìm chính xác hơn.")
        else:
            st.info("**Query không thay đổi** (Không tìm thấy cụm từ chuyên ngành cần mở rộng).")

    st.markdown(f"### Kết quả tìm thấy: {len(results)}")

    # iterrows() yields the DataFrame index label, not the result position,
    # so the rank shown to the user is taken from enumerate instead.
    for rank, (_, row) in enumerate(results.iterrows(), start=1):
        with st.container():
            c1, c2, c3 = st.columns([1, 6, 2])
            with c1:
                st.write(f"#{rank}")
                st.markdown("👕")  # Icon used in place of a product image
            with c2:
                st.subheader(row['prod_name'])
                st.markdown(f"**{row['colour_group_name']} | {row['product_type_name']}**")
                st.caption(f"_{row['detail_desc']}_")
                st.caption(f"📝 *Smart Text:* `{row['rich_source']}`")
            with c3:
                st.metric("Total Score", f"{row['score']:.3f}")
                # st.progress only accepts floats in [0.0, 1.0]; clamp to
                # guard against floating-point overshoot in the fused score.
                st.progress(float(min(max(row['score'], 0.0), 1.0)))
                st.caption(f"BM25: {row['bm25']:.2f} | SBERT: {row['sbert']:.2f}")
            st.divider()
bm25_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7afe6ac075a94bfd56eb0eb9d420f4affba53542204529961c45361c723fec25
3
+ size 19404667
df_products.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8842440d317123eb4a350a804abd8283102f73ff14ed63f723d72aeb253d1629
3
+ size 28109934
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ sentence-transformers
5
+ rank-bm25
6
+ faiss-cpu
sbert_embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab5aa7143148c6c730acbc3a1d2070414d6b8bf76cc2a87245441e7437a03b7b
3
+ size 162112640