dungeon29 committed on
Commit
9e66bad
·
verified ·
1 Parent(s): c800b50

Update rag_engine.py

Browse files
Files changed (1) hide show
  1. rag_engine.py +26 -52
rag_engine.py CHANGED
@@ -1,44 +1,33 @@
1
  import os
2
  import glob
3
  from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
4
- from langchain_community.vectorstores import Qdrant
5
- from qdrant_client import QdrantClient
6
- from qdrant_client.http import models
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_text_splitters import RecursiveCharacterTextSplitter
9
  from langchain_core.documents import Document
10
 
11
  class RAGEngine:
12
- def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./qdrant_db"):
13
  self.knowledge_base_dir = knowledge_base_dir
14
  self.persist_directory = persist_directory
15
- self.collection_name = "phishing_knowledge"
16
 
17
  # Initialize Embeddings (using same model as before)
18
  self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
19
 
20
- # Initialize Qdrant Client (Local mode)
21
- self.client = QdrantClient(path=self.persist_directory)
22
-
23
- # Initialize Vector Store wrapper
24
- self.vector_store = Qdrant(
25
- client=self.client,
26
- collection_name=self.collection_name,
27
- embeddings=self.embedding_fn
28
  )
29
 
30
- # Check if collection exists and has data
31
- try:
32
- count = self.client.count(collection_name=self.collection_name).count
33
- if count == 0:
34
- self._build_index()
35
- except:
36
- # Collection might not exist yet
37
  self._build_index()
38
 
39
  def _build_index(self):
40
  """Load documents and build index"""
41
- print("🔄 Building Knowledge Base Index (Qdrant)...")
42
 
43
  documents = self._load_documents()
44
  if not documents:
@@ -54,17 +43,9 @@ class RAGEngine:
54
  chunks = text_splitter.split_documents(documents)
55
 
56
  if chunks:
57
- # Re-create collection to ensure clean slate or add to it
58
- # For simplicity in local build, we use Qdrant.from_documents which creates/replaces
59
- self.vector_store = Qdrant.from_documents(
60
- chunks,
61
- self.embedding_fn,
62
- path=self.persist_directory,
63
- collection_name=self.collection_name,
64
- force_recreate=True
65
- )
66
- # Update the client reference after recreation
67
- self.client = self.vector_store.client
68
  print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")
69
  else:
70
  print("⚠️ No chunks created.")
@@ -114,30 +95,23 @@ class RAGEngine:
114
  def refresh_knowledge_base(self):
115
  """Force rebuild of the index"""
116
  print("♻️ Refreshing Knowledge Base...")
117
- # In Qdrant local, we can just rebuild with force_recreate=True which is handled in _build_index
 
 
 
 
 
 
 
118
  self._build_index()
119
  return "✅ Knowledge Base Refreshed!"
120
 
121
- def retrieve(self, query, n_results=3, use_mmr=True):
122
- """
123
- Retrieve relevant context
124
- Args:
125
- query: Câu truy vấn
126
- n_results: Số lượng kết quả trả về
127
- use_mmr: Sử dụng MMR (True) hay Similarity Search thường (False)
128
- """
129
- if use_mmr:
130
- results = self.vector_store.max_marginal_relevance_search(
131
- query,
132
- k=n_results,
133
- fetch_k=n_results*3,
134
- lambda_mult=0.6
135
- )
136
- else:
137
- # Standard Similarity Search
138
- results = self.vector_store.similarity_search(query, k=n_results)
139
 
140
  # Format results
141
  if results:
142
  return [doc.page_content for doc in results]
143
- return []
 
1
  import os
2
  import glob
3
  from langchain_community.document_loaders import DirectoryLoader, TextLoader, PyPDFLoader, JSONLoader
4
+ from langchain_community.vectorstores import Chroma
 
 
5
  from langchain_huggingface import HuggingFaceEmbeddings
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
 
9
  class RAGEngine:
10
+ def __init__(self, knowledge_base_dir="./knowledge_base", persist_directory="./chroma_db"):
11
  self.knowledge_base_dir = knowledge_base_dir
12
  self.persist_directory = persist_directory
 
13
 
14
  # Initialize Embeddings (using same model as before)
15
  self.embedding_fn = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
16
 
17
+ # Initialize Vector Store
18
+ self.vector_store = Chroma(
19
+ persist_directory=self.persist_directory,
20
+ embedding_function=self.embedding_fn,
21
+ collection_name="phishing_knowledge"
 
 
 
22
  )
23
 
24
+ # Build index if empty or on init
25
+ if not self.vector_store.get()['ids']:
 
 
 
 
 
26
  self._build_index()
27
 
28
  def _build_index(self):
29
  """Load documents and build index"""
30
+ print("🔄 Building Knowledge Base Index...")
31
 
32
  documents = self._load_documents()
33
  if not documents:
 
43
  chunks = text_splitter.split_documents(documents)
44
 
45
  if chunks:
46
+ # Add to vector store
47
+ self.vector_store.add_documents(chunks)
48
+ self.vector_store.persist()
 
 
 
 
 
 
 
 
49
  print(f"✅ Indexed {len(chunks)} chunks from {len(documents)} documents.")
50
  else:
51
  print("⚠️ No chunks created.")
 
95
  def refresh_knowledge_base(self):
96
  """Force rebuild of the index"""
97
  print("♻️ Refreshing Knowledge Base...")
98
+ # Clear existing collection
99
+ self.vector_store.delete_collection()
100
+ self.vector_store = Chroma(
101
+ persist_directory=self.persist_directory,
102
+ embedding_function=self.embedding_fn,
103
+ collection_name="phishing_knowledge"
104
+ )
105
+ # Rebuild
106
  self._build_index()
107
  return "✅ Knowledge Base Refreshed!"
108
 
109
+ def retrieve(self, query, n_results=3):
110
+ """Retrieve relevant context"""
111
+ # Search
112
+ results = self.vector_store.similarity_search(query, k=n_results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  # Format results
115
  if results:
116
  return [doc.page_content for doc in results]
117
+ return []