Spaces:

stephenhoang
/

hm-semantic-search

Sleeping

App Files Files Community

stephenhoang committed on Dec 20, 2025

Commit

5afc7ff

verified ·

1 Parent(s): e43c8b1

Upload 5 files

Browse files

Files changed (5) hide show

app.py +184 -0
bm25_model.pkl +3 -0
df_products.pkl +3 -0
requirements.txt +6 -0
sbert_embeddings.npy +3 -0

app.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import streamlit as st
+import pandas as pd
+import numpy as np
+import pickle
+import faiss
+import re
+from rank_bm25 import BM25Okapi
+from sentence_transformers import SentenceTransformer
+# ============================================
+# 1. SETUP GIAO DIỆN & CONFIG
+# ============================================
# Page-level configuration: browser tab title/icon and a wide layout.
st.set_page_config(
    page_title="H&M Semantic Search",
    page_icon="🛍️",
    layout="wide",
)

# Custom CSS injected as raw HTML: page background, full-width red buttons,
# and a white rounded "metric-card" style with a soft drop shadow.
_CUSTOM_CSS = """
<style>
    .main {background-color: #f5f5f5;}
    .stButton>button {width: 100%; background-color: #ff4b4b; color: white;}
    .metric-card {background-color: white; padding: 15px; border-radius: 10px; box-shadow: 2px 2px 5px rgba(0,0,0,0.1);}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
+# ============================================
+# 2. LOAD MODEL (CACHING ĐỂ CHẠY NHANH)
+# ============================================
@st.cache_resource
def load_models():
    """Load the search artifacts once and cache them across Streamlit reruns.

    The three artifact files are looked up in ``models_best/`` first. If that
    folder does not contain all of them, the current working directory is
    used instead, which covers deployments where the files are uploaded next
    to ``app.py``. When neither location is complete, ``models_best/`` is
    used so the resulting error names the originally expected path.

    Returns:
        tuple: ``(df, bm25, embeddings, sbert_model)`` where ``df`` is the
        object unpickled from ``df_products.pkl``, ``bm25`` is the object
        unpickled from ``bm25_model.pkl``, ``embeddings`` is the array loaded
        from ``sbert_embeddings.npy``, and ``sbert_model`` is the
        ``all-MiniLM-L6-v2`` SentenceTransformer (needed to encode the
        user's query at search time).

    Raises:
        FileNotFoundError: If an artifact file cannot be opened.
    """
    from pathlib import Path

    artifact_names = ("df_products.pkl", "bm25_model.pkl", "sbert_embeddings.npy")
    candidate_dirs = (Path("models_best"), Path("."))
    # First directory holding every artifact wins; default to the original
    # location so a missing-file error still points at "models_best".
    model_dir = next(
        (
            folder
            for folder in candidate_dirs
            if all((folder / name).is_file() for name in artifact_names)
        ),
        candidate_dirs[0],
    )
    print("⏳ Loading Artifacts...")

    # NOTE(security): pickle.load can execute arbitrary code. Only load
    # artifact files that ship with this app; never user-supplied pickles.
    with open(model_dir / "df_products.pkl", "rb") as f:
        df = pickle.load(f)
    with open(model_dir / "bm25_model.pkl", "rb") as f:
        bm25 = pickle.load(f)

    # Precomputed document embeddings.
    embeddings = np.load(model_dir / "sbert_embeddings.npy")

    # The encoder itself is still required to embed the user's query.
    sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
    return df, bm25, embeddings, sbert_model
@st.cache_resource
def _build_faiss_index(_embeddings):
    """Build the inner-product FAISS index over L2-normalized embeddings.

    Cached with ``st.cache_resource`` so the index is built once per process
    instead of on every Streamlit rerun (each widget interaction re-executes
    the whole script). The leading underscore on the parameter tells
    Streamlit not to hash the (large) array when computing the cache key.

    Args:
        _embeddings: 2-D array of document embeddings, one row per document.

    Returns:
        A ``faiss.IndexFlatIP`` containing the normalized vectors, so inner
        product scores are cosine similarities.
    """
    # FAISS requires C-contiguous float32; this is a no-op (no copy) when the
    # array already has that layout, in which case it is normalized in place.
    vectors = np.ascontiguousarray(_embeddings, dtype=np.float32)
    faiss.normalize_L2(vectors)
    faiss_index = faiss.IndexFlatIP(vectors.shape[1])
    faiss_index.add(vectors)
    return faiss_index


try:
    df, bm25, embeddings, sbert_model = load_models()
    # The index is rebuilt from the embeddings rather than stored on disk.
    index = _build_faiss_index(embeddings)
except Exception as e:
    # Top-level boundary: surface the failure in the UI and halt the script.
    st.error(f"❌ Không tìm thấy model ở thư mục 'models_best'. Lỗi: {e}")
    st.stop()
+# ============================================
+# 3. SEARCH ENGINE CLASS (LOGIC MASTERPIECE)
+# ============================================
class StreamlitSearchEngine:
    """Hybrid lexical (BM25) + semantic (SBERT/FAISS) product search.

    Scores from both retrievers are min-max normalized to [0, 1] and blended
    with a weight ``alpha`` (1.0 = BM25 only, 0.0 = semantic only).
    """

    def __init__(self, df, bm25, index, sbert_model):
        """Store the retrieval components.

        Args:
            df: Product table; rows are addressed positionally via ``iloc``.
            bm25: Object exposing ``get_scores(tokens)`` with one score per row.
            index: FAISS index exposing ``search(vectors, k)``.
            sbert_model: Encoder exposing ``encode(list_of_str)``.
        """
        self.df = df
        self.bm25 = bm25
        self.index = index
        self.sbert_model = sbert_model
        # Phrase -> synonyms appended to the lexical query when the phrase
        # occurs (as a substring) in the lower-cased user query.
        self.phrase_synonyms = {
            'running shoes': ['trainers', 'sneakers', 'runners', 'athletic footwear'],
            'running shoe': ['trainers', 'sneakers', 'runners'],
            'gym shoes': ['trainers', 'sneakers'],
            'joggers': ['sweatpants', 'track pants'],
            'denim jeans': ['blue jeans', 'denim'],
            'hoodie': ['sweatshirt', 'hooded'],
            'summer dress': ['sundress', 'floral dress']
        }

    def _min_max_normalize(self, scores):
        """Rescale ``scores`` linearly to [0, 1].

        Returns an all-zero array when every score is equal, and an empty
        array for empty input (instead of raising).
        """
        scores = np.asarray(scores, dtype=float)
        if scores.size == 0:
            return scores
        lowest, highest = scores.min(), scores.max()
        spread = highest - lowest
        if spread == 0:
            return np.zeros_like(scores)
        return (scores - lowest) / spread

    def _expand_query_phrase(self, query):
        """Return the lower-cased query with synonyms of known phrases appended.

        Synonyms are de-duplicated while keeping first-seen order, so the
        expanded query is identical from run to run (a ``set`` would reorder
        the words depending on the process's string hash seed).
        """
        query_lower = str(query).lower()
        expansion_terms = [
            synonym
            for phrase, synonyms in self.phrase_synonyms.items()
            if phrase in query_lower
            for synonym in synonyms
        ]
        if not expansion_terms:
            return query_lower
        unique_terms = dict.fromkeys(expansion_terms)  # ordered de-duplication
        return query_lower + " " + " ".join(unique_terms)

    def search(self, query, top_k=10, alpha=0.5):
        """Run a hybrid search and return the best-scoring rows.

        Args:
            query: Free-text user query.
            top_k: Maximum number of rows to return.
            alpha: Weight of the BM25 score in the blend; the semantic score
                gets ``1 - alpha``.

        Returns:
            tuple: ``(results, expanded_q)``. ``results`` is a copy of the top
            rows of ``self.df`` with added ``score``, ``bm25`` and ``sbert``
            columns; ``expanded_q`` is the synonym-expanded, lower-cased query
            that was used for the lexical part.
        """
        n_docs = len(self.df)

        # 1. Expand the query with synonyms (lexical side only).
        expanded_q = self._expand_query_phrase(query)

        # 2. Lexical (BM25): keep lowercase alphanumerics, '-', '%' and split
        #    on whitespace.
        q_lexical = re.sub(r"[^a-z0-9\s\-\%]", " ", expanded_q).split()
        bm25_raw = self.bm25.get_scores(q_lexical)
        bm25_norm = self._min_max_normalize(bm25_raw)

        # 3. Semantic (SBERT): encode the original query, normalize it so the
        #    inner product is a cosine similarity, and score every document.
        q_vec = self.sbert_model.encode([query]).astype('float32')
        faiss.normalize_L2(q_vec)
        sims, ids = self.index.search(q_vec, n_docs)
        # FAISS pads missing neighbours with label -1; skip them so they do
        # not overwrite the last row through negative indexing.
        found = ids[0] >= 0
        sbert_raw = np.zeros(n_docs)
        sbert_raw[ids[0][found]] = sims[0][found]
        sbert_norm = self._min_max_normalize(sbert_raw)

        # 4. Fusion: convex combination of the two normalized scores.
        final_scores = (alpha * bm25_norm) + ((1 - alpha) * sbert_norm)

        # 5. Pick the top_k rows by descending fused score.
        top_indices = np.argsort(final_scores)[::-1][:top_k]
        results = self.df.iloc[top_indices].copy()
        results['score'] = final_scores[top_indices]
        results['bm25'] = bm25_norm[top_indices]
        results['sbert'] = sbert_norm[top_indices]
        return results, expanded_q
engine = StreamlitSearchEngine(df, bm25, index, sbert_model)
# ============================================
# 4. USER INTERFACE (UI)
# ============================================
st.title("🛍️ H&M AI Hybrid Search")
st.caption("Project Semantic Search - Demo")

# Sidebar: hybrid weight and result count.
with st.sidebar:
    st.header("⚙️ Cấu hình")
    alpha = st.slider("Trọng số Hybrid (Alpha)", 0.0, 1.0, 0.5, 0.1, help="0: Chỉ Semantic, 1: Chỉ Keyword")
    top_k = st.slider("Số lượng kết quả", 5, 20, 10)
    st.markdown("---")
    st.info("💡 **Mẹo:** Thử tìm *'Black running shoes'* để xem AI tự động hiểu là *'Sneakers'* như thế nào!")

# Search box with the button aligned to its right.
col1, col2 = st.columns([4, 1])
with col1:
    query = st.text_input("Nhập mô tả sản phẩm...", placeholder="Ví dụ: Black running shoes, Floral summer dress...")
with col2:
    # Two empty writes push the button down to line up with the text input.
    st.write("")
    st.write("")
    btn_search = st.button("🔍 Tìm kiếm")

if query.strip():
    with st.spinner('AI đang phân tích & tìm kiếm...'):
        results, expanded_q = engine.search(query, top_k=top_k, alpha=alpha)

    # Debug panel: show how the query was processed.
    with st.expander("🕵️‍♂️ Xem cơ chế hoạt động của AI (Debug Info)", expanded=True):
        st.write(f"**Query gốc:** `{query}`")
        if query.lower() != expanded_q:
            st.success(f"**✨ Query đã mở rộng (Expanded):** `{expanded_q}`")
            st.caption("👉 Hệ thống đã tự động thêm từ đồng nghĩa chuyên ngành để tìm chính xác hơn.")
        else:
            st.info("**Query không thay đổi** (Không tìm thấy cụm từ chuyên ngành cần mở rộng).")

    st.markdown(f"### Kết quả tìm thấy: {len(results)}")
    # Number results by rank; the DataFrame index holds the original row
    # labels of the product table, not the position in the result list.
    for rank, (_, row) in enumerate(results.iterrows(), start=1):
        with st.container():
            c1, c2, c3 = st.columns([1, 6, 2])
            with c1:
                st.write(f"#{rank}")
                st.markdown("👕")  # Icon placeholder in place of a product image
            with c2:
                st.subheader(row['prod_name'])
                st.markdown(f"**{row['colour_group_name']} | {row['product_type_name']}**")
                st.caption(f"_{row['detail_desc']}_")
                st.caption(f"📝 *Smart Text:* `{row['rich_source']}`")
            with c3:
                score = float(row['score'])
                st.metric("Total Score", f"{score:.3f}")
                # st.progress only accepts floats within [0.0, 1.0]; clamp to
                # absorb floating-point drift from the score blend.
                st.progress(min(max(score, 0.0), 1.0))
                st.caption(f"BM25: {row['bm25']:.2f} | SBERT: {row['sbert']:.2f}")
            st.divider()
elif btn_search:
    # Button pressed with an empty query: ask for input instead of searching.
    st.warning("⚠️ Vui lòng nhập mô tả sản phẩm trước khi tìm kiếm.")

bm25_model.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7afe6ac075a94bfd56eb0eb9d420f4affba53542204529961c45361c723fec25
+size 19404667

df_products.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8842440d317123eb4a350a804abd8283102f73ff14ed63f723d72aeb253d1629
+size 28109934

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+streamlit
+pandas
+numpy
+sentence-transformers
+rank-bm25
+faiss-cpu

sbert_embeddings.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab5aa7143148c6c730acbc3a1d2070414d6b8bf76cc2a87245441e7437a03b7b
+size 162112640