tanmaivan commited on
Commit
8ea32ac
·
1 Parent(s): 44009db

feat: improve query transformer and enable multi-language response

Browse files
Files changed (2) hide show
  1. data/jrg_data.json +1 -1
  2. data/save_to_vectordb.py +63 -30
data/jrg_data.json CHANGED
@@ -63,7 +63,7 @@
63
  }
64
  },
65
  {
66
- "content": "Anh Đỗ Phi Long (Long Do) là đại gia Long Thành Đồng Nai. Anh ấy hiện đang làm việc với vai trò Data Analyst tại Pizza Hut Vietnam.",
67
  "metadata": {
68
  "source": "Employee Directory",
69
  "category": "Employee_Directory"
 
63
  }
64
  },
65
  {
66
+ "content": "Anh Đỗ Phi Long (Long Do) là đại gia Biên Hòa Đồng Nai. Anh ấy hiện đang làm việc với vai trò Data Analyst tại Pizza Hut Vietnam.",
67
  "metadata": {
68
  "source": "Employee Directory",
69
  "category": "Employee_Directory"
data/save_to_vectordb.py CHANGED
@@ -1,5 +1,6 @@
1
  import json
2
  import os
 
3
  from dotenv import load_dotenv
4
  from qdrant_client import models
5
  from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
@@ -7,54 +8,86 @@ from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_core.documents import Document
8
  from langchain_text_splitters import RecursiveCharacterTextSplitter
9
 
10
- # Safely resolve absolute paths to avoid directory execution issues
11
  current_dir = os.path.dirname(os.path.abspath(__file__))
12
- env_path = os.path.join(current_dir, '..', '.env')
 
13
  load_dotenv(env_path)
14
 
15
  QDRANT_COLLECTION_NAME = os.getenv('QDRANT_COLLECTION_NAME', 'jrg_bot_collection')
16
  QDRANT_URL = os.getenv('QDRANT_URL')
17
  QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
18
 
19
def load_and_chunk_data(file_path):
    """Read a JSON file of records and split them into retrieval-sized chunks.

    Each record is expected to carry a 'content' string and a 'metadata' dict;
    missing keys fall back to an empty string / empty dict.
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        records = json.load(fh)

    documents = []
    for record in records:
        source = record.get('metadata', {}).get('source', 'Unknown')
        print(f"Loading document from source: {source} ...")
        documents.append(Document(
            page_content=record.get('content', ''),
            metadata=record.get('metadata', {}),
        ))

    # Chunking keeps retrieval context small as the dataset grows.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=50,
    )

    docs = text_splitter.split_documents(documents)
    print(f"Successfully loaded and split into {len(docs)} chunks.")
    return docs
 
43
 
44
  def main():
45
- json_path = os.path.join(current_dir, 'jrg_data.json')
46
- documents = load_and_chunk_data(json_path)
 
 
 
 
47
 
48
- print("Initializing Embedding Models (Dense & Sparse)...")
49
- # Dense Embedding for semantic search
 
 
 
 
 
 
50
  model_name = 'bkai-foundation-models/vietnamese-bi-encoder'
 
51
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
52
 
53
- # Sparse Embedding for keyword search (BM25)
 
54
  sparse_embeddings = FastEmbedSparse(model_name="Qdrant/BM25")
55
 
56
- print("Uploading data to Qdrant Cloud...")
57
- # Save to Qdrant using Hybrid Search
 
 
58
  vectorstore = QdrantVectorStore.from_documents(
59
  documents,
60
  embedding_model,
@@ -64,10 +97,10 @@ def main():
64
  api_key=QDRANT_API_KEY,
65
  collection_name=QDRANT_COLLECTION_NAME,
66
  distance=models.Distance.COSINE,
67
- force_recreate=True # Avoid duplicating data on multiple runs
68
  )
69
 
70
- print(f"SUCCESS: Data has been saved to Qdrant collection '{QDRANT_COLLECTION_NAME}'.")
71
 
72
  if __name__ == "__main__":
73
  main()
 
1
  import json
2
  import os
3
+ import glob
4
  from dotenv import load_dotenv
5
  from qdrant_client import models
6
  from langchain_qdrant import QdrantVectorStore, FastEmbedSparse, RetrievalMode
 
8
  from langchain_core.documents import Document
9
  from langchain_text_splitters import RecursiveCharacterTextSplitter
10
 
11
# 1. Setup paths
# Resolve everything relative to this file so the script works from any CWD.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Assuming .env is located in the parent directory (pizzahut/)
env_path = os.path.join(current_dir, '..', '.env')
load_dotenv(env_path)

# Qdrant connection settings; only the collection name has a local fallback —
# URL and API key must come from the environment (validated in main()).
QDRANT_COLLECTION_NAME = os.getenv('QDRANT_COLLECTION_NAME', 'jrg_bot_collection')
QDRANT_URL = os.getenv('QDRANT_URL')
QDRANT_API_KEY = os.getenv('QDRANT_API_KEY')
20
 
21
def load_all_json_files(data_dir):
    """Scan *data_dir* for .json files and return chunked Langchain Documents.

    Each file is expected to hold a top-level JSON list of records shaped like
    {"content": str, "metadata": dict}. Files with invalid JSON or an
    unexpected top-level shape are reported and skipped so one bad file cannot
    abort the whole ingestion run.
    """
    raw_documents = []

    # Find all .json files directly in the data directory (non-recursive)
    json_files = glob.glob(os.path.join(data_dir, "*.json"))

    print(f"--- STARTING DATA INGESTION: FOUND {len(json_files)} JSON FILES ---")

    for file_path in json_files:
        file_name = os.path.basename(file_path)
        print(f"Reading file: {file_name}...")

        # Keep the try body to the one statement that can raise JSONDecodeError.
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"ERROR: JSON syntax error in file: {file_name}")
            continue

        # A top-level dict (or other non-list) would make the loop below
        # iterate plain strings and crash on item.get() — skip such files.
        if not isinstance(data, list):
            print(f"WARNING: Skipping {file_name}: expected a top-level JSON list.")
            continue

        for item in data:
            if not isinstance(item, dict):
                continue  # ignore malformed records
            content = item.get('content', '').strip()
            if not content:
                continue  # empty documents would only pollute the index

            raw_documents.append(Document(
                page_content=content,
                metadata=item.get('metadata', {})
            ))

    print(f"Successfully loaded {len(raw_documents)} raw documents.")

    # Text Splitter: Crucial for optimizing context window and memory limits
    print("Initializing Text Splitter for chunking...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=100
    )

    final_docs = text_splitter.split_documents(raw_documents)
    print(f"Data has been split into {len(final_docs)} optimal chunks ready for VectorDB.")

    return final_docs
62
 
63
  def main():
64
+ if not QDRANT_URL or not QDRANT_API_KEY:
65
+ print("CRITICAL ERROR: QDRANT_URL or QDRANT_API_KEY not found in .env file.")
66
+ return
67
+
68
+ # 2. Process data
69
+ documents = load_all_json_files(current_dir)
70
 
71
+ if not documents:
72
+ print("WARNING: No data found to ingest. Please check your JSON files.")
73
+ return
74
+
75
+ # 3. Initialize AI Embedding Models
76
+ print("Initializing Hybrid Embedding Models (Dense & Sparse)...")
77
+
78
+ # Dense Model: For semantic search (Vietnamese optimized)
79
  model_name = 'bkai-foundation-models/vietnamese-bi-encoder'
80
+ print(f"Loading Dense Model: {model_name}...")
81
  embedding_model = HuggingFaceEmbeddings(model_name=model_name)
82
 
83
+ # Sparse Model: For exact keyword matching (BM25)
84
+ print("Loading Sparse Model: Qdrant/BM25...")
85
  sparse_embeddings = FastEmbedSparse(model_name="Qdrant/BM25")
86
 
87
+ # 4. Upload to Qdrant Cloud
88
+ print(f"Uploading vectors to Qdrant Collection: '{QDRANT_COLLECTION_NAME}'...")
89
+ print("This process may take a few minutes depending on the data size. Please do not close the terminal...")
90
+
91
  vectorstore = QdrantVectorStore.from_documents(
92
  documents,
93
  embedding_model,
 
97
  api_key=QDRANT_API_KEY,
98
  collection_name=QDRANT_COLLECTION_NAME,
99
  distance=models.Distance.COSINE,
100
+ force_recreate=True # WARNING: This will drop the existing collection and recreate it
101
  )
102
 
103
+ print("SUCCESS: The entire Knowledge Base has been uploaded to Qdrant Cloud!")
104
 
105
  if __name__ == "__main__":
106
  main()