anhkhoiphan committed on
Commit
2959ef9
·
1 Parent(s): e781da4

Initial commit on HF Space

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /venv
2
+ .env
__pycache__/config.cpython-310.pyc ADDED
Binary file (1.33 kB). View file
 
__pycache__/data_helper.cpython-310.pyc ADDED
Binary file (3.8 kB). View file
 
__pycache__/data_indexing.cpython-310.pyc ADDED
Binary file (30.8 kB). View file
 
app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from data_indexing import create_gradio_interface
2
+
3
+ demo = create_gradio_interface()
4
+ demo.launch()
config.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dotenv import load_dotenv
2
+ import os
3
+
4
+ load_dotenv()
5
+
6
+ QDRANT_COLLECTION_NAME_SPCHIEUSANG = "spchieusang"
7
+ QDRANT_COLLECTION_NAME_SPCHUYENDUNG = "spchuyendung"
8
+ QDRANT_COLLECTION_NAME_SPNHATHONGMINH = "spnhathongminh"
9
+ QDRANT_COLLECTION_NAME_SPTHIETBIDIEN = "spthietbidien"
10
+ QDRANT_COLLECTION_NAME_SPPHICHNUOC = "spphichnuoc"
11
+ QDRANT_COLLECTION_NAME_GPHOCDUONG = "gphocduong"
12
+ QDRANT_COLLECTION_NAME_GPNHATHONGMINH = "gpnhathongminh"
13
+ QDRANT_COLLECTION_NAME_GPNGUNGHIEP = "gpngunghiep"
14
+ QDRANT_COLLECTION_NAME_GPNLMT = "gpnlmt"
15
+ QDRANT_COLLECTION_NAME_GPCANHQUAN = "gpcanhquan"
16
+ QDRANT_COLLECTION_NAME_GPNNCNC = "gpnongnghiepcnc"
17
+ QDRANT_COLLECTION_NAME_GPDUONGPHO = "gpduongpho"
18
+ QDRANT_COLLECTION_NAME_GPVPCS = "gpvpcs"
19
+ QDRANT_COLLECTION_NAME_GPNMCN = "gpnhamaycongnghiep"
20
+ QDRANT_COLLECTION_NAME_GPNOXH = "gpnhaoxahoi"
21
+
22
+ TEXT_EMBEDDING_MODEL = "keepitreal/vietnamese-sbert"
23
+ TEXT_EMBEDDING_SIZE = 768
24
+
25
+ IMAGE_EMBEDDING_MODEL = "google/efficientnet-b3"
26
+ IMAGE_EMBEDDING_SIZE = 1536
27
+
28
+ MONGODB_URI = os.getenv("LOGGING_URI", "mongodb://localhost:27017/")
29
+
30
+ QDRANT_HOST = os.getenv("QDRANT_HOST")
31
+ QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
data_helper.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+ import re
3
+
4
def extract_plugs_max_current(spec: str) -> Optional[int]:
    """Return the maximum socket current, in amperes, stated in ``spec``.

    The spec text is searched (case-insensitively) for the Vietnamese label
    "Dòng điện ổ cắm tối đa: <N>A".  ``None`` is returned when ``spec`` is
    empty, when the label is absent, or when searching fails for any reason
    (for example because ``spec`` is not a string).
    """
    # Both spellings are tried in order; the first one that matches wins.
    label_regexes = (
        r'dòng điện ổ cắm tối đa:\s*(\d+)\s*A',
        r'dòng điện ổ cắm tối đa:\s*(\d+)A',
    )
    try:
        if not spec:
            return None
        candidates = (re.search(rx, spec, re.IGNORECASE) for rx in label_regexes)
        hit = next((m for m in candidates if m), None)
        return int(hit.group(1)) if hit else None
    except Exception:
        return None
24
+
25
def extract_power(spec: str) -> Optional[int]:
    """Return the rated power, in watts, stated in ``spec``.

    The first "Công suất ..." / "Công suất danh định ..." label whose line
    does not also mention "tối đa", "chịu tải" or "đầu ra" is used, and the
    integer in front of the ``W`` unit is returned.  A rating written as
    "XW/Ym" yields X; it is not divided by the length.

    Args:
        spec: Free-text specification summary.

    Returns:
        The power as an ``int``, or ``None`` when no rating is found or
        ``spec`` is not searchable text (e.g. ``None``).
    """
    # The negative lookahead uses ``.`` (no newline), so the excluded
    # qualifiers are only checked on the same line as the label.
    power_pattern = r'(?:Công suất danh định|công suất danh định|Công suất|công suất)(?!.*(?:tối đa|chịu tải|đầu ra)).*?[:]\s*(\d+)\s*[Ww]'
    try:
        power_match = re.search(power_pattern, spec, re.IGNORECASE)
    except TypeError:
        # ``spec`` was None or some other non-text value.
        return None
    if power_match is None:
        return None
    return int(power_match.group(1))
42
+
43
+
44
def extract_ceiling_hole_diameter2(spec: str) -> Optional[int]:
    """Return the ceiling cut-out diameter stated in ``spec`` (lighting products).

    The patterns are tried in order and the first integer found after a
    matching label (on the same line) is returned.  The docstring of the
    original says the value is in millimetres; only the last pattern
    actually requires an explicit ``mm`` unit.

    Args:
        spec: Free-text specification summary.

    Returns:
        The diameter as an ``int``, or ``None`` when ``spec`` is empty, is
        not a string, or contains no matching label.
    """
    hole_patterns = [
        r'(?:[đĐ]ường kính lỗ khoét trần|đường kính lỗ khoét trần).*?(\d+)',
        r'(?:lỗ khoét|Lỗ khoét).*?(\d+)',
        r'(?:lỗ khoét trần).*?(\d+)',
        r"[kK]ích\s*thước\s*lỗ\s*khoét\s*trần\s*:\s*(\d+)\s*mm"
    ]

    # The sibling extractors tolerate missing/non-text specs; without this
    # guard a single bad record raised TypeError out of re.search.
    if not isinstance(spec, str) or not spec:
        return None

    for pattern in hole_patterns:
        hole_match = re.search(pattern, spec, re.IGNORECASE)
        if hole_match:
            return int(hole_match.group(1))

    return None
57
+
58
def extract_dong_danh_dinh(spec: str) -> Optional[float]:
    """Return the rated current, in amperes, stated in ``spec``.

    Recognises "Dòng (ngắn mạch) danh định: <N>A" and
    "Dòng điện định mức ... <N> A/Ampe/Amp".  The second form may carry a
    decimal part written with ``.`` or ``,``; previously such a value made
    ``int()`` raise and the function silently returned ``None``.

    Args:
        spec: Free-text specification summary.

    Returns:
        An ``int`` for whole-number ratings, a ``float`` for fractional
        ones, or ``None`` when nothing matches or ``spec`` is not a string.
    """
    patterns = [
        r"[dD]òng\s*(?:ngắn\s*mạch\s*)?danh\s*định\s*:\s*(\d+)A",
        r'(?:Dòng điện định mức|dòng điện định mức).*?(\d+(?:[.,]\d+)?)\s*(?:A|Ampe|Amp)'
    ]
    if not isinstance(spec, str):
        return None

    for p in patterns:
        match = re.search(p, spec, re.IGNORECASE)
        if not match:
            continue
        try:
            # Vietnamese specs commonly use a decimal comma.
            value = float(match.group(1).replace(",", "."))
        except ValueError:
            continue
        # Keep whole-number results as int, as before.
        return int(value) if value.is_integer() else value
    return None
72
+
73
def extract_cable_length(spec: str) -> Optional[float]:
    """Return the cable length, in metres, stated in ``spec``.

    Several Vietnamese phrasings ("Chiều dài dây", "Dây dài", ...) are
    tried in order.  The number may use ``.`` or ``,`` as decimal mark.

    The unit must be ``m``/``mét``/``meter`` and must NOT be followed by
    another ``m``: previously "1500mm" matched the bare ``m`` and was
    reported as 1500 metres.

    Args:
        spec: Free-text specification summary.

    Returns:
        The length as a ``float``, or ``None`` when nothing usable is found
        or ``spec`` is not a string.
    """
    length_patterns = [
        r'(?:Chiều dài dây|chiều dài dây).*?:?\s*([\d\.,]+)\s*(?:m|mét|meter)(?!m)',
        r'(?:Dây dài|dây dài).*?:?\s*([\d\.,]+)\s*(?:m|mét|meter)(?!m)',
        r'(?:Chiều dài|chiều dài).*?dây.*?:?\s*([\d\.,]+)\s*(?:m|mét|meter)(?!m)',
        r'(?:Dây|dây).*?(?:dài|chiều dài).*?:?\s*([\d\.,]+)\s*(?:m|mét|meter)(?!m)'
    ]
    if not isinstance(spec, str):
        return None

    for pattern in length_patterns:
        length_match = re.search(pattern, spec, re.IGNORECASE)
        if not length_match:
            continue
        try:
            # Convert a decimal comma to a dot before parsing.
            return float(length_match.group(1).replace(',', '.'))
        except ValueError:
            # e.g. a stray "," or "1.000,5": try the next phrasing instead
            # of giving up on the whole spec.
            continue
    return None
92
+
93
def extract_voltage(model: str) -> Optional[int]:
    """Return the voltage encoded in a product code.

    The first run of digits immediately followed by ``V`` (either case) in
    ``model`` is returned as an integer.  ``None`` is returned for an empty
    code, a code without such a token, or any value that cannot be searched.
    """
    try:
        if model:
            for voltage_regex in (r'(\d+)V',):
                found = re.search(voltage_regex, model, re.IGNORECASE)
                if found is not None:
                    return int(found.group(1))
    except Exception:
        pass
    return None
111
+
112
def extract_tinh_nang(model: str, name: str) -> Optional[str]:
    """Return the product feature implied by its code or name.

    * ``"đổi màu"`` (colour changing) when the product code contains "ĐM";
    * ``"xoay góc"`` (adjustable angle) when the product name contains
      "xoay góc";
    * ``None`` otherwise.

    The two checks are independent: a missing name no longer hides a
    feature visible in the code, and vice versa.  Non-string arguments are
    treated as missing.

    Args:
        model: Product code ("Mã Sản Phẩm").
        name: Product name ("Tên sản phẩm").
    """
    # The code check has priority, as in the original ordering.
    if isinstance(model, str) and "ĐM" in model.upper():
        return "đổi màu"

    if isinstance(name, str) and "xoay góc" in name.lower():
        return "xoay góc"

    return None
131
+
132
def extract_he_thong_hoa_luoi_pha(name: str) -> Optional[str]:
    """Return the phase type of a grid-tied system from the product name.

    Args:
        name: Product name ("Tên sản phẩm").

    Returns:
        ``"1 pha"`` or ``"3 pha"`` when that phrase occurs in ``name``
        (case-insensitive; "1 pha" is checked first), otherwise ``None``.
        Non-string input yields ``None``.
    """
    if not isinstance(name, str):
        return None

    name_lower = name.lower()
    for phase in ("1 pha", "3 pha"):
        if phase in name_lower:
            return phase
    return None
150
+
151
+
data_indexing.py ADDED
@@ -0,0 +1,1242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import os
3
+ import requests
4
+ import sys
5
+ # import tempfile
6
+ # import time
7
+ from typing import List, Dict, Tuple, Any, Optional
8
+ import uuid
9
+
10
+ # Add project root to Python path
11
+ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..'))
12
+ if project_root not in sys.path:
13
+ sys.path.insert(0, project_root)
14
+
15
+ from PIL import Image
16
+ from FlagEmbedding import BGEM3FlagModel
17
+ import gradio as gr
18
+ from langchain_core.documents import Document
19
+ from langchain_huggingface import HuggingFaceEmbeddings
20
+ import qdrant_client
21
+ from qdrant_client.http.models import Modifier, Distance, SparseVectorParams, VectorParams, SparseIndexParams
22
+ import torch
23
+ from transformers import EfficientNetModel, AutoImageProcessor
24
+ from pymongo import MongoClient
25
+
26
+ from config import (
27
+ QDRANT_COLLECTION_NAME_SPCHIEUSANG,
28
+ QDRANT_COLLECTION_NAME_SPCHUYENDUNG,
29
+ QDRANT_COLLECTION_NAME_SPPHICHNUOC,
30
+ QDRANT_COLLECTION_NAME_SPTHIETBIDIEN,
31
+ QDRANT_COLLECTION_NAME_SPNHATHONGMINH,
32
+ QDRANT_COLLECTION_NAME_GPNHATHONGMINH,
33
+ QDRANT_COLLECTION_NAME_GPHOCDUONG,
34
+ QDRANT_COLLECTION_NAME_GPNGUNGHIEP,
35
+ QDRANT_COLLECTION_NAME_GPCANHQUAN,
36
+ QDRANT_COLLECTION_NAME_GPNLMT,
37
+ QDRANT_COLLECTION_NAME_GPNNCNC,
38
+ QDRANT_COLLECTION_NAME_GPDUONGPHO,
39
+ QDRANT_COLLECTION_NAME_GPVPCS,
40
+ QDRANT_COLLECTION_NAME_GPNMCN,
41
+ QDRANT_COLLECTION_NAME_GPNOXH,
42
+ IMAGE_EMBEDDING_SIZE,
43
+ TEXT_EMBEDDING_SIZE,
44
+ IMAGE_EMBEDDING_MODEL,
45
+ TEXT_EMBEDDING_MODEL,
46
+ MONGODB_URI,
47
+ QDRANT_HOST,
48
+ QDRANT_API_KEY
49
+ )
50
+
51
+ from data_helper import *
52
+ # from src.utils.helper import client
53
+
54
+ client = qdrant_client.QdrantClient(
55
+ url=QDRANT_HOST,
56
+ api_key=QDRANT_API_KEY,
57
+ timeout=300.0
58
+ )
59
+
60
+ """=================SETTINGS========================"""
61
# Preferred compute backend: CUDA GPU, then Apple-silicon MPS, then CPU.
# ``torch.backends.mps.is_available()`` is the documented availability check;
# ``torch.mps.is_available`` is not present in every torch release.
device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)
66
+
67
+ product_vectors_config = {
68
+ "product": qdrant_client.http.models.VectorParams(
69
+ size=TEXT_EMBEDDING_SIZE,
70
+ distance=Distance.COSINE
71
+ ),
72
+ "image": qdrant_client.http.models.VectorParams(
73
+ size=IMAGE_EMBEDDING_SIZE,
74
+ distance=Distance.COSINE
75
+ ),
76
+ "product_bgem3_dense": qdrant_client.http.models.VectorParams(
77
+ size=1024,
78
+ distance=Distance.COSINE,
79
+ )
80
+ }
81
+
82
+ sparse_vectors_config={
83
+ "product_bgem3_sparse": SparseVectorParams(
84
+ index=SparseIndexParams(on_disk=False),
85
+ modifier = Modifier.IDF
86
+ )
87
+ }
88
+
89
+ product_collections = [
90
+ QDRANT_COLLECTION_NAME_SPCHIEUSANG,
91
+ QDRANT_COLLECTION_NAME_SPCHUYENDUNG,
92
+ QDRANT_COLLECTION_NAME_SPPHICHNUOC,
93
+ QDRANT_COLLECTION_NAME_SPTHIETBIDIEN,
94
+ QDRANT_COLLECTION_NAME_SPNHATHONGMINH
95
+ ]
96
+
97
+ product_types = [
98
+ "chieu_sang",
99
+ "chuyen_dung",
100
+ "phich_nuoc",
101
+ "thiet_bi_dien",
102
+ "nha_thong_minh"
103
+ ]
104
+
105
+ # MongoDB collections mapping for products
106
+ mongodb_product_collections = {
107
+ "chieu_sang": "sp_chieu_sang",
108
+ "chuyen_dung": "sp_chuyen_dung",
109
+ "phich_nuoc": "sp_phich_nuoc",
110
+ "thiet_bi_dien": "sp_thiet_bi_dien",
111
+ "nha_thong_minh": "sp_nha_thong_minh"
112
+ }
113
+
114
+ solution_collections = [
115
+ QDRANT_COLLECTION_NAME_GPCANHQUAN,
116
+ QDRANT_COLLECTION_NAME_GPDUONGPHO,
117
+ QDRANT_COLLECTION_NAME_GPHOCDUONG,
118
+ QDRANT_COLLECTION_NAME_GPNHATHONGMINH,
119
+ QDRANT_COLLECTION_NAME_GPNGUNGHIEP,
120
+ QDRANT_COLLECTION_NAME_GPNLMT,
121
+ QDRANT_COLLECTION_NAME_GPNNCNC,
122
+ QDRANT_COLLECTION_NAME_GPVPCS,
123
+ QDRANT_COLLECTION_NAME_GPNMCN,
124
+ QDRANT_COLLECTION_NAME_GPNOXH
125
+ ]
126
+
127
+ solution_types = [
128
+ "canh_quan",
129
+ "duong_pho",
130
+ "hoc_duong",
131
+ "nha_thong_minh",
132
+ "ngu_nghiep",
133
+ "nlmt",
134
+ "nong_nghiep_cnc",
135
+ "van_phong_cong_so",
136
+ "nha_may_cong_nghiep",
137
+ "nha_o_xa_hoi"
138
+ ]
139
+
140
+ # MongoDB collections mapping for solutions
141
+ mongodb_solution_collections = {
142
+ "canh_quan": "gp_canh_quan",
143
+ "duong_pho": "gp_duong_pho",
144
+ "hoc_duong": "gp_hoc_duong",
145
+ "nha_thong_minh": "gp_nha_thong_minh",
146
+ "ngu_nghiep": "gp_ngu_nghiep",
147
+ "nlmt": "gp_he_thong_dien_nlmt",
148
+ "nong_nghiep_cnc": "gp_nong_nghiep_cnc",
149
+ "van_phong_cong_so": "gp_van_phong_cong_so",
150
+ "nha_may_cong_nghiep": "gp_nha_may_cong_nghiep",
151
+ "nha_o_xa_hoi": "gp_nha_o_xa_hoi"
152
+ }
153
+
154
+
155
+ """=================MONGODB CONNECTION========================"""
156
class MongoDBConnection:
    """Small wrapper around a ``MongoClient`` bound to a single database.

    ``client`` and ``db`` are ``None`` until ``connect()`` succeeds and are
    reset to ``None`` again by a failed ``connect()`` or by ``close()``, so
    callers can rely on ``conn.db is None`` meaning "not connected".
    """

    def __init__(self, connection_string: Optional[str] = None, db_name: str = "product_database"):
        """
        Store the connection settings; no network access happens here.

        Args:
            connection_string: MongoDB connection string. Falls back to
                ``MONGODB_URI`` when ``None``.
            db_name: Database name.
        """
        self.connection_string = MONGODB_URI if connection_string is None else connection_string
        self.db_name = db_name
        self.client = None
        self.db = None

    def connect(self) -> bool:
        """Open the connection and verify it with a ``ping``.

        Returns:
            ``True`` on success, ``False`` on any failure (the error is
            printed, not raised).
        """
        try:
            self.client = MongoClient(self.connection_string)
            self.db = self.client[self.db_name]
            # Test connection
            self.client.admin.command('ping')
            print(f"✅ Connected to MongoDB: {self.db_name}")
            return True
        except Exception as e:
            print(f"❌ Failed to connect to MongoDB: {e}")
            # Do not leave a half-initialised handle behind: a set ``db``
            # would make later "is the connection established?" checks pass.
            if self.client is not None:
                self.client.close()
            self.client = None
            self.db = None
            return False

    def get_collection_data(self, collection_name: str) -> List[Dict]:
        """
        Retrieve all documents from a collection.

        Args:
            collection_name: Name of the MongoDB collection.

        Returns:
            List of documents with ``_id`` converted to ``str``; an empty
            list when not connected or when the query fails (the error is
            printed, not raised).
        """
        if self.db is None:
            print(f"❌ Error retrieving data from {collection_name}: MongoDB connection not established")
            return []
        try:
            collection = self.db[collection_name]
            data = list(collection.find({}))
            # Convert ObjectId to string
            for item in data:
                if '_id' in item:
                    item['_id'] = str(item['_id'])
            print(f"✅ Retrieved {len(data)} documents from {collection_name}")
            return data
        except Exception as e:
            print(f"❌ Error retrieving data from {collection_name}: {e}")
            return []

    def close(self):
        """Close the MongoDB connection (safe to call when not connected)."""
        if self.client is not None:
            self.client.close()
            print("✅ MongoDB connection closed")
        self.client = None
        self.db = None
208
+
209
+
210
+ """=================CLASS EMBEDDING========================"""
211
class DataEmbedding:
    """Shared text and image embedding helpers used by the indexing classes."""

    def __init__(self):
        # No state; models are instantiated inside each embedding call.
        pass

    def embed_text_batch(
        self,
        contents: List[str],
        batch_size: int = 32,
        hybrid_mode: bool = False
    ) -> List[Optional[Tuple[Any, Any, Any]]]:
        """Embed texts with ``TEXT_EMBEDDING_MODEL`` and, optionally, BGE-M3.

        Args:
            contents: Texts to embed; falsy entries are skipped.
            batch_size: Number of texts sent to the models per call.
            hybrid_mode: When true, also compute BGE-M3 dense and sparse
                (lexical weight) embeddings.

        Returns:
            A list aligned with ``contents``.  Each entry is ``None`` (falsy
            input) or a tuple ``(normal, bgem3_dense, bgem3_sparse)``; in
            non-hybrid mode the last two items are empty lists.  If any
            model call fails the error is printed and an EMPTY list is
            returned, so callers must bounds-check before indexing.
        """
        # Remember where each non-empty text came from so results can be
        # mapped back to the caller's ordering.
        valid_indices = [i for i, content in enumerate(contents) if content]
        if not valid_indices:
            return [None] * len(contents)
        valid_contents = [contents[i] for i in valid_indices]

        normal_embeddings, bgem3_dense_embeddings, bgem3_sparse_embeddings = [], [], []

        try:
            text_embedding_model = HuggingFaceEmbeddings(
                model_name=TEXT_EMBEDDING_MODEL,
                model_kwargs={'device': device},
                encode_kwargs={'normalize_embeddings': True}
            )
            hybrid_embedding_model = None
            if hybrid_mode:
                hybrid_embedding_model = BGEM3FlagModel(
                    "BAAI/bge-m3",
                    use_fp16=True,
                    devices=str(device)
                )

            for start in range(0, len(valid_contents), batch_size):
                batch_contents = valid_contents[start:start + batch_size]

                if hybrid_mode:
                    bgem3_embeddings = hybrid_embedding_model.encode(
                        sentences=batch_contents,
                        return_dense=True,
                        return_sparse=True
                    )
                    bgem3_dense_embeddings.extend(
                        torch.tensor(emb, dtype=torch.float32)
                        for emb in bgem3_embeddings['dense_vecs']
                    )
                    bgem3_sparse_embeddings.extend(bgem3_embeddings['lexical_weights'])

                normal_embeddings.extend(
                    torch.tensor(emb, dtype=torch.float32)
                    for emb in text_embedding_model.embed_documents(batch_contents)
                )

            # Map back to original order
            result: List[Optional[Tuple[Any, Any, Any]]] = [None] * len(contents)
            for pos, original_idx in enumerate(valid_indices):
                if hybrid_mode:
                    result[original_idx] = (
                        normal_embeddings[pos],
                        bgem3_dense_embeddings[pos],
                        bgem3_sparse_embeddings[pos],
                    )
                else:
                    result[original_idx] = (normal_embeddings[pos], [], [])

            return result

        except Exception as e:
            print(f"❌ Error in batch text embedding: {str(e)[:100]}...")
            return []

    def embed_images_batch(self, image_urls: List[str], batch_size: int = 32) -> List[Optional[torch.Tensor]]:
        """Download images and embed them with ``IMAGE_EMBEDDING_MODEL``.

        Args:
            image_urls: Image URLs; falsy entries are skipped.
            batch_size: Number of images per forward pass.

        Returns:
            A list aligned with ``image_urls`` holding an L2-normalised 1-D
            tensor per image, or ``None`` where the URL was missing, the
            download/decoding failed, or the batch could not be embedded.
        """
        all_embeddings: List[Optional[torch.Tensor]] = [None] * len(image_urls)

        # Download everything first, keeping each image's original position.
        images_to_process: List[Tuple[Any, int]] = []
        for i, url in enumerate(image_urls):
            if not url:
                continue
            try:
                response = requests.get(url, timeout=30)
                response.raise_for_status()
                image = Image.open(io.BytesIO(response.content)).convert('RGB')
                images_to_process.append((image, i))
            except requests.exceptions.RequestException as e:
                print(f"❌ HTTP error for url {url}: {e}")
            except Exception as e:
                print(f"❌ Error loading image {url}: {e}")

        if not images_to_process:
            return all_embeddings

        image_processor = AutoImageProcessor.from_pretrained(IMAGE_EMBEDDING_MODEL)
        image_embedding_model = EfficientNetModel.from_pretrained(IMAGE_EMBEDDING_MODEL).to(device)

        # Process images in batches
        for start in range(0, len(images_to_process), batch_size):
            batch_data = images_to_process[start:start + batch_size]
            batch_images = [d[0] for d in batch_data]
            batch_indices = [d[1] for d in batch_data]

            try:
                inputs = image_processor(images=batch_images, return_tensors="pt").to(device)

                with torch.no_grad():
                    outputs = image_embedding_model(**inputs)

                # Keep the batch axis explicitly.  A bare ``squeeze()`` also
                # removes it for a one-image batch, after which
                # ``normalize(dim=1)`` raised and the image was dropped.
                embeddings = outputs.pooler_output.reshape(len(batch_images), -1)
                normalized_embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
                for original_index, embedding in zip(batch_indices, normalized_embeddings):
                    all_embeddings[original_index] = embedding

            except Exception as e:
                # Best effort: a failed batch leaves its slots as None.
                print(f"❌ Error embedding image batch: {e}")

        return all_embeddings
327
+
328
+
329
class ProductEmbedding(DataEmbedding):
    """Build Qdrant-ready points (vectors + payload) for product documents."""

    # Dense size of the "product_bgem3_dense" vector (BAAI/bge-m3).
    BGEM3_DENSE_SIZE = 1024

    @staticmethod
    def _vector_or_zeros(vector, size: int) -> List[float]:
        """Return ``vector`` as a plain list, or ``size`` zeros when it is missing or empty."""
        if vector is None or len(vector) == 0:
            return [0.0] * size
        return vector.tolist() if hasattr(vector, "tolist") else list(vector)

    def run_embedding(self, product_type: str, mongodb_conn: MongoDBConnection,
                      batch_size: int = 32, hybrid_mode: bool = False):
        """
        Generate embeddings for a specific product type from MongoDB.

        Args:
            product_type: Type of product
            mongodb_conn: MongoDB connection object
            batch_size: Batch size for text embedding
            hybrid_mode: Whether to use hybrid text embedding (BGEM3)

        Returns:
            A list of dicts with keys ``point_id``, ``vectors``,
            ``sparse_vectors`` and ``payload``.  Missing dense/image
            embeddings are replaced by zero vectors; a missing sparse
            embedding yields an empty ``sparse_vectors`` dict.
        """
        embeddings = []

        processed_docs = self.prepare_docs(
            product_type=product_type,
            mongodb_conn=mongodb_conn
        )

        # Batch text embedding for speed
        text_contents = [doc.page_content for doc in processed_docs]
        text_embeddings = self.embed_text_batch(text_contents, batch_size, hybrid_mode)

        # Batch image embedding
        image_urls = [doc.metadata.get("image_url") for doc in processed_docs]
        image_embeddings = self.embed_images_batch(image_urls)

        for i, doc in enumerate(processed_docs):
            # embed_text_batch may return a shorter (empty) list on failure.
            if i < len(text_embeddings) and text_embeddings[i] is not None:
                normal_text_embedding, bgem3_dense_text_embedding, bgem3_sparse_text_embedding = text_embeddings[i]
            else:
                normal_text_embedding, bgem3_dense_text_embedding, bgem3_sparse_text_embedding = None, None, None

            image_embedding = image_embeddings[i] if i < len(image_embeddings) else None

            # In non-hybrid mode the BGE-M3 slots are empty lists rather than
            # None; calling ``.tolist()`` on them used to raise.
            vectors = {
                "product": self._vector_or_zeros(normal_text_embedding, TEXT_EMBEDDING_SIZE),
                "product_bgem3_dense": self._vector_or_zeros(bgem3_dense_text_embedding, self.BGEM3_DENSE_SIZE),
                "image": self._vector_or_zeros(image_embedding, IMAGE_EMBEDDING_SIZE)
            }

            if bgem3_sparse_text_embedding:
                sparse_vectors = {
                    "product_bgem3_sparse": {
                        # Lexical-weight keys may be token ids stored as
                        # strings; sparse indices must be integers.
                        "indices": [int(k) for k in bgem3_sparse_text_embedding.keys()],
                        "values": [float(v) for v in bgem3_sparse_text_embedding.values()]
                    }
                }
            else:
                # Omit the sparse vector entirely.  The old fallback key
                # "product_sparse" does not match the sparse vector name the
                # collections are created with ("product_bgem3_sparse").
                sparse_vectors = {}

            payload = {
                "product": doc.page_content,
                "metadata": dict(doc.metadata)
            }

            embeddings.append({
                "point_id": str(uuid.uuid4()),
                "vectors": vectors,
                "sparse_vectors": sparse_vectors,
                "payload": payload
            })

        print(f"Generated {len(embeddings)} embeddings for {product_type}")
        return embeddings

    def prepare_docs(self, product_type: str, mongodb_conn: MongoDBConnection):
        """
        Load one product collection from MongoDB and turn it into documents.

        Args:
            product_type: Type of product
            mongodb_conn: MongoDB connection object

        Returns:
            A list of ``Document`` objects whose metadata has nested dicts
            additionally flattened to ``"<key>_<sub_key>"`` entries
            (the ``tags`` dict is kept nested only).

        Raises:
            ValueError: If the connection is not established or the product
                type has no collection mapping.
        """
        if not mongodb_conn or mongodb_conn.db is None:
            raise ValueError("MongoDB connection not established")

        collection_name = mongodb_product_collections.get(product_type)
        if not collection_name:
            raise ValueError(f"No MongoDB collection mapping for product type: {product_type}")

        data = mongodb_conn.get_collection_data(collection_name)
        print(f"🗄️ Loaded data from MongoDB collection: {collection_name}")

        docs = []
        EXCLUDE_FROM_FLATTENING = {"tags"}
        for item in data:
            content = self.create_content(item)
            metadata = self.extract_metadata(item, product_type)
            # Create a flat metadata structure for indexing
            flat_metadata = {**metadata}
            for key, value in metadata.items():
                if isinstance(value, dict) and key not in EXCLUDE_FROM_FLATTENING:
                    flat_metadata.update({f"{key}_{sub_key}": sub_value for sub_key, sub_value in value.items()})

            docs.append(Document(page_content=content, metadata=flat_metadata))

        print(f"Prepared {len(docs)} documents")
        return docs

    def create_content(self, item: Dict) -> str:
        """Build the markdown-like text that is embedded for a product."""
        product_name = item.get("Tên sản phẩm", "")
        model = item.get("Mã Sản Phẩm", "")
        summary_specs = item.get("Tóm tắt TSKT", "")
        summary_advantages = item.get("Tóm tắt ưu điểm, tính năng", "")
        specs = item.get("Thông số kỹ thuật", "")
        advantages = item.get("Nội dung Ưu điểm SP\n(- File word/Excel\n- Đặt tên file theo mã SAP)", "")
        instruction = item.get("HDSD", "")
        content = (
            f"# Tên sản phẩm: {product_name}\n\n"
            f"## Mã sản phẩm: {model}\n\n"
            f"## Tóm tắt TSKT\n{summary_specs}\n\n"
            f"### Thông số kỹ thuật chi tiết\n{specs}\n\n"
            f"## Tóm tắt ưu điểm & tính năng\n{summary_advantages}\n\n"
            f"### Ưu điểm & tính năng chi tiết\n{advantages}\n"
            f"## Hướng dẫn sử dụng: \n{instruction}\n"
        )

        return content

    def extract_metadata(self, item: Dict, product_type: str) -> Dict:
        """Extract payload metadata from a product item.

        Entries from ``Tags`` are merged into the top level and may be
        overridden by values derived in ``process_additional_metadata``.
        """
        additional_info = self.process_additional_metadata(item, product_type)
        # ``Tags`` may be stored as null; treat that like "no tags".
        tags = item.get("Tags") or {}
        common_metadata = {
            "prod_id": item.get("Product_ID", None),
            "ten_san_pham": item.get("Tên sản phẩm", ""),
            "model": item.get("Mã Sản Phẩm", ""),
            "danh_muc_l1": item.get("category 1", ""),
            "danh_muc_l2": item.get("category 2", ""),
            "danh_muc_l3": item.get("category 3", ""),
            "url": str(item.get("Link sản phẩm", "")).strip(),
            "image_url": item.get("Link ảnh sản phẩm"),
            "buy_url": item.get("Link mua hàng online", ""),
            "gia": item.get("Giá", ""),
            "tags": tags,
            **tags,
            **additional_info
        }
        return common_metadata

    @staticmethod
    def process_additional_metadata(item: Dict[str, Any], product_type) -> Dict[str, Any]:
        """Derive extra metadata fields from the spec text, code and name.

        ``cong_suat`` is only extracted when the tags do not already carry a
        non-empty value; the remaining fields depend on ``product_type``.
        """
        tags = item.get("Tags") or {}
        spec_text = item.get("Tóm tắt TSKT", "")
        model = item.get("Mã Sản Phẩm", "")
        prod_name = item.get("Tên sản phẩm", "")
        additional_info = {}

        # Extract cong_suat
        if tags.get("cong_suat", "") == "":
            power = extract_power(spec_text)
            if power is not None:
                additional_info["cong_suat"] = power

        # Extract based on product type
        if product_type == "phich_nuoc":
            pass

        elif product_type == "chieu_sang":
            ceiling_hole_diameter = extract_ceiling_hole_diameter2(spec_text)
            if ceiling_hole_diameter is not None:
                additional_info["duong_kinh_lo_khoet_tran"] = ceiling_hole_diameter

            tinh_nang = extract_tinh_nang(model, prod_name)
            if tinh_nang is not None:
                additional_info["tinh_nang"] = tinh_nang

        elif product_type == "chuyen_dung":
            he_thong_hoa_luoi_pha = extract_he_thong_hoa_luoi_pha(prod_name)
            if he_thong_hoa_luoi_pha is not None:
                additional_info["he_thong_hoa_luoi_pha"] = he_thong_hoa_luoi_pha

        elif product_type == "thiet_bi_dien":
            dong_danh_dinh = extract_dong_danh_dinh(spec_text)
            if dong_danh_dinh is not None:
                additional_info["dong_danh_dinh"] = dong_danh_dinh

        elif product_type == "nha_thong_minh":
            cable_length = extract_cable_length(spec_text)
            if cable_length is not None:
                additional_info["chieu_dai_day"] = cable_length

            plugs_max_current = extract_plugs_max_current(spec_text)
            if plugs_max_current is not None:
                additional_info["dong_dien_o_cam_toi_da"] = plugs_max_current

            voltage = extract_voltage(model)
            if voltage is not None:
                additional_info["dien_ap"] = voltage

        return additional_info
526
+
527
+
528
class SolutionEmbedding(DataEmbedding):
    """Build Qdrant-ready points (single text vector + payload) for solution documents."""

    def run_embedding(self, solution_type: str, mongodb_conn: MongoDBConnection, batch_size: int = 32):
        """Generate embeddings for a specific solution type from MongoDB.

        Args:
            solution_type: Type of solution
            mongodb_conn: MongoDB connection object
            batch_size: Batch size for text embedding

        Returns:
            A list of dicts with keys ``point_id``, ``vectors`` (a plain list
            of floats, zeros when embedding failed) and ``payload``.
        """
        embeddings = []

        # ``processed_docs`` is what gets stored; ``docs_to_embed`` is the
        # position-aligned text that is actually embedded.
        processed_docs, docs_to_embed = self.prepare_docs(solution_type, mongodb_conn)

        embedding_contents = [doc.page_content for doc in docs_to_embed]
        text_embeddings = self.embed_text_batch(embedding_contents, batch_size)

        for i, doc in enumerate(processed_docs):
            # embed_text_batch may return a shorter (empty) list on failure.
            embedding_tuple = text_embeddings[i] if i < len(text_embeddings) else None
            text_embedding = embedding_tuple[0] if embedding_tuple is not None else None

            payload = {
                "content": doc.page_content,
                "metadata": dict(doc.metadata)
            }

            embeddings.append({
                "point_id": str(uuid.uuid4()),
                # Fall back to a zero vector of the configured text size
                # (previously a hard-coded 768).
                "vectors": text_embedding.tolist() if text_embedding is not None else [0.0] * TEXT_EMBEDDING_SIZE,
                "payload": payload
            })

        print(f"Generated {len(embeddings)} embeddings for {solution_type}")
        return embeddings

    def prepare_docs(self, solution_type: str, mongodb_conn: MongoDBConnection):
        """
        Load one solution collection from MongoDB and turn it into documents.

        Every top-level field except ``_id`` and ``san_pham`` is used as a
        category.  List fields produce one document per dict entry; dict
        fields produce one document per key/value pair.  For the ``faq``
        category only the question ("Câu hỏi") is embedded while the full
        entry is stored.

        Args:
            solution_type: Type of solution
            mongodb_conn: MongoDB connection object

        Returns:
            ``(docs, docs_to_embed)``: two position-aligned lists of
            ``Document`` objects (stored text, embedded text).

        Raises:
            ValueError: If the connection is not established or the solution
                type has no collection mapping.
        """
        if not mongodb_conn or mongodb_conn.db is None:
            raise ValueError("MongoDB connection not established")

        collection_name = mongodb_solution_collections.get(solution_type)
        if not collection_name:
            raise ValueError(f"No MongoDB collection mapping for solution type: {solution_type}")

        data = mongodb_conn.get_collection_data(collection_name)
        print(f"🗄️ Loaded solution data from MongoDB collection: {collection_name}")

        docs = []
        docs_to_embed = []
        skipped = 0

        for item in data:
            for key, val in item.items():
                if key in ["_id", "san_pham"]:  # Skip MongoDB _id and san_pham
                    continue

                if isinstance(val, list):
                    for d in val:
                        if not isinstance(d, dict):
                            # Entries are expected to be key/value records;
                            # anything else used to raise AttributeError.
                            skipped += 1
                            continue

                        page_content = ". ".join([f"{k}: {v}" for k, v in d.items()])
                        docs.append(Document(page_content=page_content, metadata={"category": key}))

                        if key == "faq":
                            embed_content = f"Câu hỏi: {d.get('Câu hỏi', '')}"
                        else:
                            embed_content = page_content
                        docs_to_embed.append(Document(page_content=embed_content, metadata={"category": key}))

                elif isinstance(val, dict):
                    for k, v in val.items():
                        docs_to_embed.append(Document(page_content=f"{k}: {v}", metadata={"category": key}))
                        docs.append(Document(page_content=f"{k}: {v}", metadata={"category": key}))

        if skipped:
            print(f"⚠️ Skipped {skipped} list entries that were not key/value records")
        print(f"Prepared {len(docs)} documents")
        return docs, docs_to_embed
618
+
619
+
620
+ """=================CLASS INDEXING========================"""
621
+ class ProductIndexing:
622
def __init__(self, vector_db_client=client):
    """Remember the Qdrant client to index into.

    ``mongodb_conn`` starts as ``None``; it is assigned by
    ``setup_mongodb``.
    """
    super().__init__()
    self.mongodb_conn = None
    self.client = vector_db_client
626
+
627
def setup_mongodb(self, connection_string: str = None):
    """Create the MongoDB connection wrapper and try to connect.

    The wrapper is stored on ``self.mongodb_conn``; the return value is
    whatever its ``connect()`` call returns.
    """
    connection = MongoDBConnection(connection_string)
    self.mongodb_conn = connection
    return connection.connect()
631
+
632
def index(
    self,
    embeddings: List[Dict],
    collection_name: str,
    batch_size: int = 100
):
    """Index embeddings to a Qdrant collection in batches.

    Each embedding dict must provide ``point_id``, ``vectors`` (a dict of
    named dense vectors including ``product`` and ``image``) and
    ``payload``; ``sparse_vectors`` is optional and merged into the named
    vectors.  A failing batch is reported and counted but does not stop
    the remaining batches.

    Args:
        embeddings: Points to upload.
        collection_name: Target Qdrant collection.
        batch_size: Points per upsert call; must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    total_docs = len(embeddings)
    success_count = 0
    error_count = 0

    print(f"Adding {total_docs} multimodal documents to '{collection_name}'...")

    for i in range(0, total_docs, batch_size):
        batch = embeddings[i:i+batch_size]
        points = []

        try:
            for embedding_data in batch:
                # Dense and sparse vectors share one name -> vector mapping.
                combined_vectors = {
                    **embedding_data["vectors"],
                    **(embedding_data.get("sparse_vectors") or {}),
                }

                point = qdrant_client.http.models.PointStruct(
                    id=embedding_data["point_id"],
                    vector=combined_vectors,
                    payload=embedding_data["payload"]
                )
                points.append(point)

            if points:
                self.client.upsert(collection_name=collection_name, points=points)
                success_count += len(batch)

                # An all-zero vector is the placeholder for "no embedding".
                text_count = sum(1 for p in points if any(v != 0 for v in p.vector["product"]))
                image_count = sum(1 for p in points if any(v != 0 for v in p.vector["image"]))

                print(f"✅ Batch {i//batch_size + 1}: {len(batch)} docs | {text_count} product | {image_count} images")
            else:
                print(f"⚠️ Batch {i//batch_size + 1}: No valid points to upload")

        except Exception as e:
            error_count += len(batch)
            print(f"❌ Batch {i//batch_size + 1} failed: {e}")

    print(f"\n📊 Final Results:")
    print(f" ✅ Successful: {success_count}")
    print(f" ❌ Failed: {error_count}")
    attempted = success_count + error_count
    if attempted:
        print(f" 📈 Success Rate: {success_count/attempted*100:.1f}%")
    else:
        # Nothing was attempted (e.g. empty input); the ratio used to
        # raise ZeroDivisionError here.
        print(" 📈 Success Rate: n/a (no documents)")
681
+
682
+ def run_indexing(self, reload: bool = True, hybrid_mode: bool = True):
683
+ """
684
+ Index all product data from MongoDB into Qdrant collections.
685
+ Args:
686
+ reload: Whether to recreate collections
687
+ hybrid_mode: Whether to use hybrid text embedding (BGEM3)
688
+ """
689
+ if reload:
690
+ try:
691
+ for collection in product_collections:
692
+ self.client.recreate_collection(
693
+ collection_name=collection,
694
+ vectors_config=product_vectors_config,
695
+ sparse_vectors_config=sparse_vectors_config
696
+ )
697
+ print("All product collections recreated.")
698
+ except Exception as e:
699
+ print(f"Error while recreating collections: {e}")
700
+ return
701
+
702
+ # Setup MongoDB connection
703
+ if not self.mongodb_conn:
704
+ if not self.setup_mongodb():
705
+ print("❌ Failed to connect to MongoDB. Aborting indexing.")
706
+ return
707
+
708
+ # Create embedding processor
709
+ embed_object = ProductEmbedding()
710
+
711
+ for collection, product_type in zip(product_collections, product_types):
712
+ print(f"\n🔄 Processing {product_type} data from MongoDB...")
713
+
714
+ # Generate embeddings for specific product type
715
+ embeddings = embed_object.run_embedding(
716
+ product_type=product_type,
717
+ mongodb_conn=self.mongodb_conn,
718
+ hybrid_mode=hybrid_mode
719
+ )
720
+
721
+ # Index embeddings to specific collection
722
+ self.index(embeddings, collection)
723
+ self._create_payload_indexes_for_product_type(product_type, collection)
724
+
725
+ # Close MongoDB connection
726
+ if self.mongodb_conn:
727
+ self.mongodb_conn.close()
728
+ self.mongodb_conn = None
729
+
730
+ def indexing_single_product_type(self, product_type: str, collection_name: str, hybrid_mode: bool = True) -> str:
731
+ """
732
+ Indexing a single product group into its Qdrant collection from MongoDB
733
+ Args:
734
+ product_type: Type of product
735
+ collection_name: Qdrant collection name
736
+ hybrid_mode: Whether to use hybrid text embedding (BGEM3)
737
+ """
738
+ buffer = io.StringIO()
739
+ sys.stdout = buffer
740
+
741
+ try:
742
+ self.client.recreate_collection(
743
+ collection_name=collection_name,
744
+ vectors_config=product_vectors_config,
745
+ sparse_vectors_config=sparse_vectors_config
746
+ )
747
+ print(f"Collection {collection_name} created")
748
+
749
+ # Setup MongoDB connection
750
+ if not self.mongodb_conn:
751
+ if not self.setup_mongodb():
752
+ print("❌ Failed to connect to MongoDB")
753
+ sys.stdout = sys.__stdout__
754
+ return buffer.getvalue()
755
+
756
+ # Create embedding processor
757
+ embed_object = ProductEmbedding()
758
+
759
+ print(f"\n🔄 Processing {product_type} data from MongoDB...")
760
+ embeddings = embed_object.run_embedding(
761
+ product_type=product_type,
762
+ mongodb_conn=self.mongodb_conn,
763
+ hybrid_mode=hybrid_mode
764
+ )
765
+ self.index(embeddings, collection_name)
766
+
767
+ # Close MongoDB connection
768
+ if self.mongodb_conn:
769
+ self.mongodb_conn.close()
770
+ self.mongodb_conn = None
771
+
772
+ except Exception as e:
773
+ print(f"Error while indexing product type {product_type}: {e}")
774
+
775
+ self._create_payload_indexes_for_product_type(product_type, collection_name)
776
+ sys.stdout = sys.__stdout__
777
+ return buffer.getvalue()
778
+
779
+ def _create_payload_indexes_for_product_type(self, product_type: str, collection_name: str):
780
+ """Create payload indexes based on product type field schemas"""
781
+
782
+ print(f"🔍 Creating payload indexes for {product_type}...")
783
+
784
+ try:
785
+ # Common fields across all product types
786
+ self.client.create_payload_index(
787
+ collection_name=collection_name,
788
+ field_name="metadata.danh_muc_l2",
789
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
790
+ )
791
+
792
+ self.client.create_payload_index(
793
+ collection_name=collection_name,
794
+ field_name="metadata.danh_muc_l3",
795
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
796
+ )
797
+
798
+ self.client.create_payload_index(
799
+ collection_name=collection_name,
800
+ field_name="metadata.gia",
801
+ field_schema=qdrant_client.http.models.IntegerIndexParams(type="integer")
802
+ )
803
+
804
+ self.client.create_payload_index(
805
+ collection_name=collection_name,
806
+ field_name="metadata.cong_suat",
807
+ field_schema=qdrant_client.http.models.FloatIndexParams(type="float")
808
+ )
809
+
810
+ # Product-specific fields
811
+ if product_type == "phich_nuoc":
812
+ self.client.create_payload_index(
813
+ collection_name=collection_name,
814
+ field_name="metadata.dung_tich",
815
+ field_schema=qdrant_client.http.models.FloatIndexParams(type="float")
816
+ )
817
+ self.client.create_payload_index(
818
+ collection_name=collection_name,
819
+ field_name="metadata.chat_lieu",
820
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
821
+ )
822
+ self.client.create_payload_index(
823
+ collection_name=collection_name,
824
+ field_name="metadata.tinh_nang",
825
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
826
+ )
827
+
828
+ elif product_type == "chieu_sang":
829
+ self.client.create_payload_index(
830
+ collection_name=collection_name,
831
+ field_name="metadata.kich_thuoc",
832
+ field_schema=qdrant_client.http.models.FloatIndexParams(type="float")
833
+ )
834
+ self.client.create_payload_index(
835
+ collection_name=collection_name,
836
+ field_name="metadata.duong_kinh_lo_khoet_tran",
837
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
838
+ )
839
+ self.client.create_payload_index(
840
+ collection_name=collection_name,
841
+ field_name="metadata.tinh_nang",
842
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
843
+ )
844
+
845
+ elif product_type == "chuyen_dung":
846
+ self.client.create_payload_index(
847
+ collection_name=collection_name,
848
+ field_name="metadata.nhiet_do_mau",
849
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
850
+ )
851
+ self.client.create_payload_index(
852
+ collection_name=collection_name,
853
+ field_name="metadata.dien_ap",
854
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
855
+ )
856
+ self.client.create_payload_index(
857
+ collection_name=collection_name,
858
+ field_name="metadata.cong_nghe_led",
859
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
860
+ )
861
+ self.client.create_payload_index(
862
+ collection_name=collection_name,
863
+ field_name="metadata.loai_den",
864
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
865
+ )
866
+ self.client.create_payload_index(
867
+ collection_name=collection_name,
868
+ field_name="metadata.he_thong_hoa_luoi",
869
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
870
+ )
871
+
872
+ elif product_type == "thiet_bi_dien":
873
+ self.client.create_payload_index(
874
+ collection_name=collection_name,
875
+ field_name="metadata.dong_danh_dinh",
876
+ field_schema=qdrant_client.http.models.FloatIndexParams(type="float")
877
+ )
878
+ self.client.create_payload_index(
879
+ collection_name=collection_name,
880
+ field_name="metadata.anh_sang",
881
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
882
+ )
883
+ self.client.create_payload_index(
884
+ collection_name=collection_name,
885
+ field_name="metadata.so_hat",
886
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
887
+ )
888
+ self.client.create_payload_index(
889
+ collection_name=collection_name,
890
+ field_name="metadata.so_cuc",
891
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
892
+ )
893
+ self.client.create_payload_index(
894
+ collection_name=collection_name,
895
+ field_name="metadata.modules",
896
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
897
+ )
898
+ self.client.create_payload_index(
899
+ collection_name=collection_name,
900
+ field_name="metadata.doi_tuong",
901
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
902
+ )
903
+ self.client.create_payload_index(
904
+ collection_name=collection_name,
905
+ field_name="metadata.cong_nghe",
906
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
907
+ )
908
+ self.client.create_payload_index(
909
+ collection_name=collection_name,
910
+ field_name="metadata.loai_den",
911
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
912
+ )
913
+ self.client.create_payload_index(
914
+ collection_name=collection_name,
915
+ field_name="metadata.san_pham",
916
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
917
+ )
918
+
919
+ elif product_type == "nha_thong_minh":
920
+ self.client.create_payload_index(
921
+ collection_name=collection_name,
922
+ field_name="metadata.chieu_dai_day",
923
+ field_schema=qdrant_client.http.models.FloatIndexParams(type="float")
924
+ )
925
+ self.client.create_payload_index(
926
+ collection_name=collection_name,
927
+ field_name="metadata.lo_khoet_tran",
928
+ field_schema=qdrant_client.http.models.IntegerIndexParams(type="integer")
929
+ )
930
+ self.client.create_payload_index(
931
+ collection_name=collection_name,
932
+ field_name="metadata.nut_bam",
933
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
934
+ )
935
+ self.client.create_payload_index(
936
+ collection_name=collection_name,
937
+ field_name="metadata.dong_dien_o_cam_toi_da",
938
+ field_schema=qdrant_client.http.models.IntegerIndexParams(type="integer")
939
+ )
940
+ self.client.create_payload_index(
941
+ collection_name=collection_name,
942
+ field_name="metadata.dien_ap",
943
+ field_schema=qdrant_client.http.models.IntegerIndexParams(type="integer")
944
+ )
945
+ self.client.create_payload_index(
946
+ collection_name=collection_name,
947
+ field_name="metadata.hinh_dang",
948
+ field_schema=qdrant_client.http.models.KeywordIndexParams(type="keyword")
949
+ )
950
+ self.client.create_payload_index(
951
+ collection_name=collection_name,
952
+ field_name="metadata.tinh_nang",
953
+ field_schema=qdrant_client.http.models.TextIndexParams(type="text")
954
+ )
955
+ self.client.create_payload_index(
956
+ collection_name=collection_name,
957
+ field_name="metadata.goc_chieu",
958
+ field_schema=qdrant_client.http.models.TextIndexParams(type="text")
959
+ )
960
+ self.client.create_payload_index(
961
+ collection_name=collection_name,
962
+ field_name="metadata.combo",
963
+ field_schema=qdrant_client.http.models.TextIndexParams(type="text")
964
+ )
965
+ self.client.create_payload_index(
966
+ collection_name=collection_name,
967
+ field_name="metadata.anh_sang",
968
+ field_schema=qdrant_client.http.models.TextIndexParams(type="text")
969
+ )
970
+
971
+ print(f"✅ All payload indexes created for {product_type}")
972
+
973
+ except Exception as e:
974
+ print(f"❌ Error creating payload indexes for {product_type}: {e}")
975
+
976
class SolutionIndexing:
    """Index solution embeddings from MongoDB into per-solution Qdrant collections.

    Each solution collection holds a single unnamed dense vector per point.
    Progress is reported with ``print`` so that callers can capture it by
    redirecting ``sys.stdout``.
    """

    def __init__(self, vector_db_client=client):
        """Store the Qdrant client; the MongoDB connection is opened on demand."""
        super().__init__()
        self.client = vector_db_client
        self.mongodb_conn = None

    def setup_mongodb(self, connection_string: str = None):
        """Open a MongoDB connection and remember it on the instance.

        Args:
            connection_string: Passed straight to ``MongoDBConnection``.

        Returns:
            Whatever ``MongoDBConnection.connect()`` returns (callers treat a
            falsy value as "connection failed").
        """
        self.mongodb_conn = MongoDBConnection(connection_string)
        connected = self.mongodb_conn.connect()
        if not connected:
            # Do not keep a dead connection object around: the
            # ``if not self.mongodb_conn`` checks below would otherwise skip
            # reconnecting on the next run and use the broken connection.
            self.mongodb_conn = None
        return connected

    def _close_mongodb(self):
        """Close the MongoDB connection, if any, and forget it."""
        if self.mongodb_conn:
            self.mongodb_conn.close()
            self.mongodb_conn = None

    def _recreate_collection(self, collection_name: str):
        """Drop and recreate one solution collection (768-dim, cosine distance)."""
        self.client.recreate_collection(
            collection_name=collection_name,
            vectors_config=qdrant_client.http.models.VectorParams(
                size=768,
                distance=qdrant_client.http.models.Distance.COSINE,
            )
        )

    def index(
        self,
        embeddings: List[Dict],
        collection_name: str,
        batch_size: int = 10
    ):
        """Index embeddings to a Qdrant collection in batches.

        Args:
            embeddings: One dict per point with the keys ``"point_id"``,
                ``"vectors"`` and ``"payload"``.
            collection_name: Target Qdrant collection.
            batch_size: Number of points sent per ``upsert`` call.

        A failing batch is reported and counted as failed; remaining batches
        are still processed.
        """
        total_docs = len(embeddings)
        success_count = 0
        error_count = 0

        print(f"Adding {total_docs} solution documents to '{collection_name}'...")

        for i in range(0, total_docs, batch_size):
            batch = embeddings[i:i+batch_size]
            points = []

            try:
                for embedding_data in batch:
                    # Create Qdrant point from embedding data
                    point = qdrant_client.http.models.PointStruct(
                        id=embedding_data["point_id"],
                        vector=embedding_data["vectors"],
                        payload=embedding_data["payload"]
                    )
                    points.append(point)

                # Upload batch to Qdrant
                if points:
                    self.client.upsert(collection_name=collection_name, points=points)
                    success_count += len(batch)

                    # A vector with at least one non-zero component is counted
                    # as a real embedding.
                    text_count = sum(1 for p in points if any(v != 0 for v in p.vector))

                    print(f"✅ Batch {i//batch_size + 1}: {len(batch)} docs | {text_count} contents")
                else:
                    print(f"⚠️ Batch {i//batch_size + 1}: No valid points to upload")

            except Exception as e:
                error_count += len(batch)
                print(f"❌ Batch {i//batch_size + 1} failed: {e}")

        # With no input there is nothing to divide by; report 0.0% instead of
        # raising ZeroDivisionError.
        processed = success_count + error_count
        success_rate = success_count / processed * 100 if processed else 0.0

        print(f"\n📊 Final Results:")
        print(f" ✅ Successful: {success_count}")
        print(f" ❌ Failed: {error_count}")
        print(f" 📈 Success Rate: {success_rate:.1f}%")

    def run_indexing(self, reload: bool = True):
        """Index all solution data from MongoDB into Qdrant collections.

        Args:
            reload: Whether to recreate every solution collection first.
        """
        if reload:
            try:
                for collection in solution_collections:
                    self._recreate_collection(collection)
                print("All solution collections recreated.")
            except Exception as e:
                print(f"Error while recreating collections: {e}")
                return

        # Setup MongoDB connection
        if not self.mongodb_conn:
            if not self.setup_mongodb():
                print("❌ Failed to connect to MongoDB. Aborting indexing.")
                return

        try:
            # Create embedding processor
            embed_object = SolutionEmbedding()

            for collection, solution_type in zip(solution_collections, solution_types):
                print(f"\n🔄 Processing {solution_type} data from MongoDB...")
                embeddings = embed_object.run_embedding(solution_type, self.mongodb_conn)
                self.index(embeddings, collection)
        finally:
            # Close the connection even when embedding/indexing raised, so a
            # failed run does not leak it.
            self._close_mongodb()

    def indexing_single_solution(self, solution: str, collection_name: str) -> str:
        """Indexing a single solution into its Qdrant collection from MongoDB.

        Args:
            solution: Solution type key passed to the embedding step.
            collection_name: Qdrant collection name (recreated before indexing).

        Returns:
            Everything printed while indexing, as a single string.
        """
        buffer = io.StringIO()
        # Remember the stream that is active right now; restoring
        # sys.__stdout__ would discard a redirection installed by the host
        # application.
        previous_stdout = sys.stdout
        sys.stdout = buffer

        try:
            try:
                self._recreate_collection(collection_name)
                print(f"Collection {collection_name} created")

                # Setup MongoDB connection
                if not self.mongodb_conn:
                    if not self.setup_mongodb():
                        print("❌ Failed to connect to MongoDB")
                        return buffer.getvalue()

                # Create embedding processor
                embed_object = SolutionEmbedding()

                print(f"\n🔄 Processing {solution} data from MongoDB...")
                embeddings = embed_object.run_embedding(solution, self.mongodb_conn)
                self.index(embeddings, collection_name)

            except Exception as e:
                print(f"Error while recreating collection and indexing solution {solution}: {e}")

            finally:
                # Close MongoDB connection on success and on failure alike
                self._close_mongodb()
        finally:
            sys.stdout = previous_stdout

        return buffer.getvalue()
1111
+
1112
+
1113
+ """=================GRADIO UI========================"""
1114
def create_gradio_interface():
    """Create Gradio interface for indexing from MongoDB.

    Builds a Blocks app with one button per solution / product type plus an
    "index everything" button for each group. Every handler returns the text
    printed during the run, which is shown in the shared "Logs" textbox.

    Returns:
        The ``gr.Blocks`` app (not yet launched).
    """
    product_indexing = ProductIndexing()
    solution_indexing = SolutionIndexing()

    def run_and_capture_logs(job, **kwargs):
        """Run ``job(**kwargs)`` and return everything it printed."""
        buffer = io.StringIO()
        previous_stdout = sys.stdout
        sys.stdout = buffer
        try:
            job(**kwargs)
        finally:
            # Always undo the redirection, otherwise one failed run would
            # leave the whole process printing into a dead buffer.
            sys.stdout = previous_stdout
        return buffer.getvalue()

    def index_all_solutions():
        # run_indexing() only prints and returns None, so it must be wrapped
        # for its output to reach the Logs textbox.
        return run_and_capture_logs(solution_indexing.run_indexing, reload=True)

    def index_all_products():
        return run_and_capture_logs(product_indexing.run_indexing, reload=True, hybrid_mode=True)

    # (button label, solution key, Qdrant collection), grouped by UI row.
    solution_rows = (
        (
            ("GP Ngư nghiệp", "ngu_nghiep", QDRANT_COLLECTION_NAME_GPNGUNGHIEP),
            ("GP Học đường", "hoc_duong", QDRANT_COLLECTION_NAME_GPHOCDUONG),
            ("GP Nhà thông minh", "nha_thong_minh", QDRANT_COLLECTION_NAME_GPNHATHONGMINH),
            ("GP Nông nghiệp CNC", "nong_nghiep_cnc", QDRANT_COLLECTION_NAME_GPNNCNC),
        ),
        (
            ("GP Cảnh quan", "canh_quan", QDRANT_COLLECTION_NAME_GPCANHQUAN),
            ("GP HTĐ NLMT", "nlmt", QDRANT_COLLECTION_NAME_GPNLMT),
            ("GP Đường phố", "duong_pho", QDRANT_COLLECTION_NAME_GPDUONGPHO),
            ("GP Văn phòng công sở", "van_phong_cong_so", QDRANT_COLLECTION_NAME_GPVPCS),
        ),
        (
            ("GP Nhà máy CN", "nha_may_cong_nghiep", QDRANT_COLLECTION_NAME_GPNMCN),
            ("GP Nhà ở xã hội", "nha_o_xa_hoi", QDRANT_COLLECTION_NAME_GPNOXH),
        ),
    )

    # (button label, product type key, Qdrant collection), all in one UI row.
    product_buttons = (
        ("SP Phích nước", "phich_nuoc", QDRANT_COLLECTION_NAME_SPPHICHNUOC),
        ("SP Chiếu sáng", "chieu_sang", QDRANT_COLLECTION_NAME_SPCHIEUSANG),
        ("SP Chuyên dụng", "chuyen_dung", QDRANT_COLLECTION_NAME_SPCHUYENDUNG),
        ("SP Nhà thông minh", "nha_thong_minh", QDRANT_COLLECTION_NAME_SPNHATHONGMINH),
        ("SP Thiết bị điện", "thiet_bi_dien", QDRANT_COLLECTION_NAME_SPTHIETBIDIEN),
    )

    with gr.Blocks(theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🗄️ Qdrant Data Indexing System (MongoDB)")
        gr.Markdown("Recreate Qdrant Collections and Index Data from MongoDB Atlas")

        output_box = gr.Textbox(lines=15, label="📋 Logs", interactive=False)

        gr.Markdown("---")
        gr.Markdown("## 🏢 Giải pháp (Solutions)")

        last_row = len(solution_rows) - 1
        for row_index, row in enumerate(solution_rows):
            with gr.Row():
                for label, solution, collection in row:
                    # Constants are passed through gr.State so the bound
                    # method can be used directly as the handler.
                    gr.Button(label).click(
                        solution_indexing.indexing_single_solution,
                        inputs=[gr.State(solution), gr.State(collection)],
                        outputs=output_box)

                # The "all solutions" button closes the last row.
                if row_index == last_row:
                    gr.Button("✨ Tất cả GP", variant="primary").click(
                        index_all_solutions,
                        outputs=output_box)

        gr.Markdown("---")
        gr.Markdown("## 📦 Sản phẩm (Products)")

        # Individual product buttons
        with gr.Row():
            for label, product_type, collection in product_buttons:
                gr.Button(label).click(
                    product_indexing.indexing_single_product_type,
                    inputs=[gr.State(product_type), gr.State(collection), gr.State(True)],
                    outputs=output_box)

        with gr.Row():
            btn_all_products = gr.Button("✨ Tất cả SP", variant="primary", scale=2)

        btn_all_products.click(
            index_all_products,
            outputs=output_box)

    return demo
1238
+
1239
+
1240
if __name__ == "__main__":
    # Executed as a script: build the indexing UI and start serving it.
    # The app is kept in a module-level name called ``demo``.
    demo = create_gradio_interface()
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers
3
+ sentence-transformers
4
+ FlagEmbedding
5
+ langchain-core
6
+ langchain-huggingface
7
+ qdrant-client
8
+ pymongo
9
+ Pillow
10
+ requests
11
+ gradio
12
+ python-dotenv