muhammadnoman76 committed on
Commit
e02b28a
·
1 Parent(s): fe1a3c4
app/services/chat_processor.py CHANGED
@@ -9,7 +9,16 @@ from app.services.environmental_condition import EnvironmentalData
9
  from app.services.prompts import *
10
  from app.services.vector_database_search import VectorDatabaseSearch
11
  import re
12
- vectordb = VectorDatabaseSearch()
 
 
 
 
 
 
 
 
 
13
 
14
  class ChatProcessor:
15
  def __init__(self, token: str, session_id: Optional[str] = None, num_results: int = 3, num_images: int = 3):
@@ -58,26 +67,32 @@ class ChatProcessor:
58
  name = profile['name']
59
  age = profile['age']
60
  self.chat_session.load_chat_history()
61
- self.chat_session.update_title(self.session_id,query)
62
  history = self.chat_session.format_history()
63
 
64
- history_based_prompt = HISTORY_BASED_PROMPT.format(history=history,query= query)
65
-
66
  enhanced_query = Model().send_message_openrouter(history_based_prompt)
67
 
68
  self.session_id = self.ensure_valid_session(title=enhanced_query)
69
  permission = self.chat_session.get_user_preferences()
70
- websearch_enabled = permission.get('websearch', False)
71
  env_recommendations = permission.get('environmental_recommendations', False)
72
  personalized_recommendations = permission.get('personalized_recommendations', False)
73
  keywords_permission = permission.get('keywords', False)
74
  reference_permission = permission.get('references', False)
75
  language = self.chat_session.get_language().lower()
76
 
 
77
 
78
- language_prompt = LANGUAGE_RESPONSE_PROMPT.format(language = language)
79
-
80
- if websearch_enabled :
 
 
 
 
 
81
  with ThreadPoolExecutor(max_workers=2) as executor:
82
  future_web = executor.submit(self.web_searcher.search, enhanced_query)
83
  future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
@@ -93,186 +108,165 @@ class ChatProcessor:
93
  references.append(result['link'])
94
 
95
  context = "\n".join(context_parts)
 
 
 
 
96
 
97
- if env_recommendations and personalized_recommendations:
98
- prompt = ENVIRONMENTAL_PERSONALIZED_PROMPT.format(
99
- user_name=name,
100
- user_age=age,
101
- history=history,
102
- user_details=self.chat_session.get_personalized_recommendation(),
103
- environmental_condition=self.environment_data.get_environmental_data(),
104
- previous_history=history,
105
- context=context,
106
- current_query=enhanced_query
107
- )
108
- elif personalized_recommendations:
109
- prompt = PERSONALIZED_PROMPT.format(
110
- user_name=name,
111
- user_age=age,
112
- user_details=self.chat_session.get_personalized_recommendation(),
113
- previous_history=history,
114
- context=context,
115
- current_query=enhanced_query
116
- )
117
- elif env_recommendations :
118
- prompt = ENVIRONMENTAL_PROMPT.format(
119
- user_name=name,
120
- user_age=age,
121
- environmental_condition=self.environment_data.get_environmental_data(),
122
- previous_history=history,
123
- context=context,
124
- current_query=enhanced_query
125
- )
126
- else:
127
- prompt = DEFAULT_PROMPT.format(
128
- previous_history=history,
129
- context=context,
130
- current_query=enhanced_query
131
- )
132
-
133
- prompt = prompt + language_prompt
134
-
135
- response = Model().llm(prompt,enhanced_query)
136
-
137
- keywords = ""
138
-
139
- if (keywords_permission):
140
- keywords = self.extract_keywords_yake(response, language=language)
141
- if (not reference_permission):
142
- references = ""
143
-
144
- chat_data = {
145
- "query": enhanced_query,
146
- "response": response,
147
- "references": references,
148
- "page_no": "",
149
- "keywords": keywords,
150
- "images": image_results,
151
- "context": context,
152
- "timestamp": datetime.now(timezone.utc).isoformat(),
153
- "session_id": self.chat_session.session_id
154
- }
155
-
156
- if not self.chat_session.save_chat(chat_data):
157
- raise ValueError("Failed to save chat message")
158
- return chat_data
159
-
160
  else:
 
161
  attach_image = False
162
 
163
- with ThreadPoolExecutor(max_workers=2) as executor:
164
  future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
165
  image_results = future_images.result()
166
 
167
  start_time = datetime.now(timezone.utc)
168
 
169
- results = vectordb.search( query=enhanced_query, top_k=3)
 
 
 
 
170
 
171
  context_parts = []
172
  references = []
173
- seen_pages = set()
174
 
175
  for result in results:
176
- confidence = result['confidence']
177
- if confidence > 60:
 
178
  context_parts.append(f"Content: {result['content']}")
179
- page = result['page']
180
- if page not in seen_pages: # Only append if page is not seen
181
- references.append(f"Source: {result['source']}, Page: {page}")
182
- seen_pages.add(page)
183
- attach_image = True
 
 
184
 
185
  context = "\n".join(context_parts)
186
 
187
- if not context or len(context) < 10:
188
- context = "There is no context found unfortunately"
189
-
190
- if env_recommendations and personalized_recommendations:
191
- prompt = ENVIRONMENTAL_PERSONALIZED_PROMPT.format(
192
- user_name=name,
193
- user_age = age,
194
- history=history,
195
- user_details=self.chat_session.get_personalized_recommendation(),
196
- environmental_condition=self.environment_data.get_environmental_data(),
197
- previous_history=history,
198
- context=context,
199
- current_query=enhanced_query
200
- )
201
- elif personalized_recommendations:
202
- prompt = PERSONALIZED_PROMPT.format(
203
- user_name=name,
204
- user_age=age,
205
- user_details=self.chat_session.get_personalized_recommendation(),
206
- previous_history=history,
207
- context=context,
208
- current_query=enhanced_query
209
- )
210
- elif env_recommendations :
211
- prompt = ENVIRONMENTAL_PROMPT.format(
212
- user_name=name,
213
- user_age=age,
214
- environmental_condition=self.environment_data.get_environmental_data(),
215
- previous_history=history,
216
- context=context,
217
- current_query=enhanced_query
218
- )
219
- else:
220
- prompt = DEFAULT_PROMPT.format(
221
- previous_history=history,
222
- context=context,
223
- current_query=enhanced_query
224
- )
225
-
226
- prompt = prompt + language_prompt
227
-
228
- response = Model().response = Model().llm(prompt,query)
229
 
230
  end_time = datetime.now(timezone.utc)
231
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  keywords = ""
233
 
234
- if (keywords_permission):
235
- keywords = self.extract_keywords_yake(response, language=language)
236
-
237
- if (not reference_permission):
238
- references = ""
239
-
240
- if not attach_image:
241
- image_results = ""
242
- keywords = ""
243
-
244
- chat_data = {
245
- "query": enhanced_query,
246
- "response": response,
247
- "references": references,
248
- "page_no": "",
249
- "keywords": keywords,
250
- "images": image_results,
251
- "context": context,
252
- "timestamp": datetime.now(timezone.utc).isoformat(),
253
- "session_id": self.chat_session.session_id
254
- }
255
  match = re.search(r'(## Personal Recommendations|## Environmental Considerations)', response)
256
- if match:
257
- truncated_response = response[:match.start()].strip()
258
- else:
259
- truncated_response = response
260
- if not self.chat_session.save_details(session_id=self.session_id , context= context , query= enhanced_query , response=truncated_response , rag_start_time=start_time , rag_end_time=end_time ):
261
- raise ValueError("Failed to save the RAG details")
262
- if not self.chat_session.save_chat(chat_data):
263
- raise ValueError("Failed to save chat message")
264
- return chat_data
 
 
 
 
 
 
 
 
265
 
266
  except Exception as e:
 
267
  return {
268
  "error": str(e),
269
  "query": query,
270
- "response": "Sorry, there was an error processing your request.",
271
  "timestamp": datetime.now(timezone.utc).isoformat()
272
  }
273
 
274
  def web_search(self, query: str) -> Dict[str, Any]:
275
- if self.session_id and len(self.session_id) > 5:
276
- return self.process_chat(query=query)
277
- else:
278
- return self.process_chat(query=query)
 
9
  from app.services.prompts import *
10
  from app.services.vector_database_search import VectorDatabaseSearch
11
  import re
12
+ import logging
13
+
14
+ logger = logging.getLogger(__name__)
15
+
# Initialize the shared vector database once at import time. A failure here
# must not stop the module from importing: ``vectordb`` is left as None so the
# code that uses it can test for that and fall back.
try:
    vectordb = VectorDatabaseSearch()
except Exception as e:
    # logger.exception logs at ERROR level like logger.error did, but also
    # records the traceback, which is what makes a startup failure diagnosable.
    logger.exception("Failed to initialize vector database: %s", e)
    vectordb = None
22
 
23
  class ChatProcessor:
24
  def __init__(self, token: str, session_id: Optional[str] = None, num_results: int = 3, num_images: int = 3):
 
67
  name = profile['name']
68
  age = profile['age']
69
  self.chat_session.load_chat_history()
70
+ self.chat_session.update_title(self.session_id, query)
71
  history = self.chat_session.format_history()
72
 
73
+ # Enhanced query generation
74
+ history_based_prompt = HISTORY_BASED_PROMPT.format(history=history, query=query)
75
  enhanced_query = Model().send_message_openrouter(history_based_prompt)
76
 
77
  self.session_id = self.ensure_valid_session(title=enhanced_query)
78
  permission = self.chat_session.get_user_preferences()
79
+ websearch_enabled = permission.get('websearch', False)
80
  env_recommendations = permission.get('environmental_recommendations', False)
81
  personalized_recommendations = permission.get('personalized_recommendations', False)
82
  keywords_permission = permission.get('keywords', False)
83
  reference_permission = permission.get('references', False)
84
  language = self.chat_session.get_language().lower()
85
 
86
+ language_prompt = LANGUAGE_RESPONSE_PROMPT.format(language=language)
87
 
88
+ # Check if vector database is available when websearch is disabled
89
+ vector_db_available = vectordb and vectordb.is_available() if not websearch_enabled else False
90
+
91
+ # If websearch is disabled and vector DB is not available, enable websearch as fallback
92
+ use_websearch = websearch_enabled or not vector_db_available
93
+
94
+ if use_websearch:
95
+ logger.info("Using web search for context")
96
  with ThreadPoolExecutor(max_workers=2) as executor:
97
  future_web = executor.submit(self.web_searcher.search, enhanced_query)
98
  future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
 
108
  references.append(result['link'])
109
 
110
  context = "\n".join(context_parts)
111
+
112
+ # If web search returns no results, provide a helpful context
113
+ if not context:
114
+ context = "No specific information found. Please provide general dermatological advice based on your expertise."
115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116
  else:
117
+ logger.info("Using vector database for context")
118
  attach_image = False
119
 
120
+ with ThreadPoolExecutor(max_workers=1) as executor:
121
  future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
122
  image_results = future_images.result()
123
 
124
  start_time = datetime.now(timezone.utc)
125
 
126
+ # Search vector database
127
+ if vectordb:
128
+ results = vectordb.search(query=enhanced_query, top_k=5) # Increased top_k for better results
129
+ else:
130
+ results = []
131
 
132
  context_parts = []
133
  references = []
134
+ seen_pages = set()
135
 
136
  for result in results:
137
+ confidence = result.get('confidence', 0)
138
+ # Lowered confidence threshold for better recall
139
+ if confidence > 30:
140
  context_parts.append(f"Content: {result['content']}")
141
+ source = result.get('source', 'Unknown')
142
+ page = result.get('page', 0)
143
+ page_key = f"{source}_{page}"
144
+ if page_key not in seen_pages:
145
+ references.append(f"Source: {source}, Page: {page}")
146
+ seen_pages.add(page_key)
147
+ attach_image = True
148
 
149
  context = "\n".join(context_parts)
150
 
151
+ # Provide more helpful context when vector search returns nothing
152
+ if not context or len(context) < 50:
153
+ logger.warning("Vector database returned insufficient context")
154
+ # Fall back to web search if available
155
+ if self.web_searcher:
156
+ logger.info("Falling back to web search due to insufficient vector results")
157
+ web_results = self.web_searcher.search(enhanced_query)
158
+ context_parts = []
159
+ references = []
160
+ for idx, result in enumerate(web_results[:3], 1):
161
+ if result['text']:
162
+ context_parts.append(f"From Source {idx}: {result['text']}\n")
163
+ references.append(result['link'])
164
+ context = "\n".join(context_parts)
165
+
166
+ if not context:
167
+ context = "Based on general dermatological knowledge and best practices."
168
+ attach_image = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
 
170
  end_time = datetime.now(timezone.utc)
171
 
172
+ # Generate appropriate prompt based on user preferences
173
+ if env_recommendations and personalized_recommendations:
174
+ prompt = ENVIRONMENTAL_PERSONALIZED_PROMPT.format(
175
+ user_name=name,
176
+ user_age=age,
177
+ history=history,
178
+ user_details=self.chat_session.get_personalized_recommendation(),
179
+ environmental_condition=self.environment_data.get_environmental_data(),
180
+ previous_history=history,
181
+ context=context,
182
+ current_query=enhanced_query
183
+ )
184
+ elif personalized_recommendations:
185
+ prompt = PERSONALIZED_PROMPT.format(
186
+ user_name=name,
187
+ user_age=age,
188
+ user_details=self.chat_session.get_personalized_recommendation(),
189
+ previous_history=history,
190
+ context=context,
191
+ current_query=enhanced_query
192
+ )
193
+ elif env_recommendations:
194
+ prompt = ENVIRONMENTAL_PROMPT.format(
195
+ user_name=name,
196
+ user_age=age,
197
+ environmental_condition=self.environment_data.get_environmental_data(),
198
+ previous_history=history,
199
+ context=context,
200
+ current_query=enhanced_query
201
+ )
202
+ else:
203
+ prompt = DEFAULT_PROMPT.format(
204
+ previous_history=history,
205
+ context=context,
206
+ current_query=enhanced_query
207
+ )
208
+
209
+ prompt = prompt + "\n" + language_prompt
210
+
211
+ # Generate response
212
+ response = Model().llm(prompt, enhanced_query)
213
+
214
+ # Extract keywords if enabled
215
+ keywords = ""
216
+ if keywords_permission:
217
+ keywords = self.extract_keywords_yake(response, language=language)
218
+
219
+ if not reference_permission:
220
+ references = ""
221
+
222
+ # Prepare images
223
+ if not use_websearch and not attach_image:
224
+ image_results = ""
225
  keywords = ""
226
 
227
+ # Prepare chat data
228
+ chat_data = {
229
+ "query": enhanced_query,
230
+ "response": response,
231
+ "references": references,
232
+ "page_no": "",
233
+ "keywords": keywords,
234
+ "images": image_results if 'image_results' in locals() else "",
235
+ "context": context,
236
+ "timestamp": datetime.now(timezone.utc).isoformat(),
237
+ "session_id": self.chat_session.session_id
238
+ }
239
+
240
+ # Save RAG details if using vector database
241
+ if not use_websearch and 'start_time' in locals() and 'end_time' in locals():
 
 
 
 
 
 
242
  match = re.search(r'(## Personal Recommendations|## Environmental Considerations)', response)
243
+ truncated_response = response[:match.start()].strip() if match else response
244
+
245
+ if not self.chat_session.save_details(
246
+ session_id=self.session_id,
247
+ context=context,
248
+ query=enhanced_query,
249
+ response=truncated_response,
250
+ rag_start_time=start_time,
251
+ rag_end_time=end_time
252
+ ):
253
+ logger.warning("Failed to save RAG details")
254
+
255
+ # Save chat
256
+ if not self.chat_session.save_chat(chat_data):
257
+ raise ValueError("Failed to save chat message")
258
+
259
+ return chat_data
260
 
261
  except Exception as e:
262
+ logger.error(f"Error in process_chat: {str(e)}")
263
  return {
264
  "error": str(e),
265
  "query": query,
266
+ "response": "I apologize, but I'm experiencing technical difficulties. Please try again or enable web search in your preferences for better results.",
267
  "timestamp": datetime.now(timezone.utc).isoformat()
268
  }
269
 
def web_search(self, query: str) -> Dict[str, Any]:
    """Entry point for the web search endpoint.

    Forwards ``query`` unchanged to ``process_chat`` and returns whatever
    that call produces.
    """
    chat_result = self.process_chat(query=query)
    return chat_result
 
 
app/services/vector_database_search.py CHANGED
@@ -5,33 +5,81 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
5
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
  from langchain_qdrant import Qdrant
7
  from qdrant_client import QdrantClient, models
 
8
  from dotenv import load_dotenv
 
9
 
10
  load_dotenv()
11
 
 
 
 
 
12
  os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
13
  QDRANT_URL = os.getenv("QDRANT_URL")
14
  QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
15
- QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME")
16
 
17
  class VectorDatabaseSearch:
18
  def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
19
  self.collection_name = collection_name
20
  self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
21
- self.client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)
22
- self._initialize_collection()
 
 
 
 
23
 
24
- self.vectorstore = Qdrant(
25
- client=self.client,
26
- collection_name=collection_name,
27
- embeddings=self.embeddings
28
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
  def _initialize_collection(self):
31
  """Initialize Qdrant collection if it doesn't exist"""
 
 
 
32
  try:
33
  collections = self.client.get_collections()
34
- if not any(c.name == self.collection_name for c in collections.collections):
 
 
35
  self.client.create_collection(
36
  collection_name=self.collection_name,
37
  vectors_config=models.VectorParams(
@@ -39,12 +87,22 @@ class VectorDatabaseSearch:
39
  distance=models.Distance.COSINE
40
  )
41
  )
42
- print(f"Created collection: {self.collection_name}")
 
 
 
 
 
43
  except Exception as e:
44
- print(f"Error initializing collection: {e}")
 
45
 
46
  def add_pdf(self, pdf_path):
47
  """Add PDF to vector database"""
 
 
 
 
48
  try:
49
  loader = PyPDFLoader(pdf_path)
50
  docs = loader.load()
@@ -52,75 +110,107 @@ class VectorDatabaseSearch:
52
  split_docs = splitter.split_documents(docs)
53
 
54
  book_name = os.path.splitext(os.path.basename(pdf_path))[0]
55
- print(f"Processing {book_name} with {len(split_docs)} chunks")
56
 
57
  for doc in split_docs:
58
- # Ensure metadata is stored in a consistent way
59
  doc.metadata = {
60
  "source": book_name,
61
  "page": doc.metadata.get('page', 1),
62
  "id": str(uuid.uuid4())
63
  }
64
 
65
- # Add documents to vector store
66
  self.vectorstore.add_documents(split_docs)
67
- print(f"Added {len(split_docs)} chunks from {book_name}")
68
  return True
 
69
  except Exception as e:
70
- print(f"Error adding PDF: {e}")
71
  return False
72
 
73
  def search(self, query, top_k=5):
74
  """Search documents based on query"""
 
 
 
 
75
  try:
 
 
 
 
 
 
 
76
  results = self.vectorstore.similarity_search_with_score(query, k=top_k)
77
 
78
  formatted = []
79
  for doc, score in results:
 
 
 
80
  formatted.append({
81
- "source": doc.metadata['source'],
82
- "page": doc.metadata['page'],
83
  "content": doc.page_content[:500],
84
- "confidence": round(score * 100, 2)
85
  })
 
 
86
  return formatted
 
87
  except Exception as e:
88
- print(f"Search error: {e}")
89
  return []
90
 
91
  def get_book_info(self):
92
  """Retrieve list of unique book sources in the collection"""
 
 
 
 
93
  try:
94
- # First check if the collection exists
95
  collections = self.client.get_collections()
96
  if not any(c.name == self.collection_name for c in collections.collections):
97
- print(f"Collection {self.collection_name} does not exist yet")
98
  return []
99
-
100
- # Get all points with payload from the collection
 
 
 
 
 
 
101
  points = self.client.scroll(
102
  collection_name=self.collection_name,
103
- limit=1000,
104
  with_payload=True,
105
- with_vectors=False # We don't need vector data
106
  )[0]
107
 
108
- # Debug information
109
- print(f"Retrieved {len(points)} points from collection")
110
-
111
- # Extract unique book sources from payloads
112
  books = set()
113
  for point in points:
114
- # Check if payload exists and has 'metadata' field with 'source'
115
  if hasattr(point, 'payload') and point.payload:
116
- # Check different possible payload structures
117
  if 'metadata' in point.payload and 'source' in point.payload['metadata']:
118
  books.add(point.payload['metadata']['source'])
119
  elif 'source' in point.payload:
120
  books.add(point.payload['source'])
121
-
122
- print(f"Found {len(books)} unique books")
123
  return list(books)
 
124
  except Exception as e:
125
- print(f"Error retrieving book info: {e}")
126
- return []
 
 
 
 
 
 
 
 
 
 
 
 
5
  from langchain_google_genai import GoogleGenerativeAIEmbeddings
6
  from langchain_qdrant import Qdrant
7
  from qdrant_client import QdrantClient, models
8
+ from qdrant_client.http.exceptions import UnexpectedResponse
9
  from dotenv import load_dotenv
10
+ import logging
11
 
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Mirror GEMINI_API_KEY into GOOGLE_API_KEY (presumably the variable the Google
# embeddings client reads -- confirm against langchain_google_genai).
# os.environ only accepts str values, so assigning the None that os.getenv
# returns for an unset variable would raise TypeError and make this module
# unimportable; skip the assignment in that case and leave any existing
# GOOGLE_API_KEY untouched.
_gemini_api_key = os.getenv("GEMINI_API_KEY")
if _gemini_api_key:
    os.environ["GOOGLE_API_KEY"] = _gemini_api_key
else:
    logger.warning("GEMINI_API_KEY is not set; GOOGLE_API_KEY was left unchanged.")

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "dermatology_docs")
22
 
23
  class VectorDatabaseSearch:
def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
    """Create the embedding model and try to connect to Qdrant.

    Args:
        collection_name: Name of the Qdrant collection to use.

    The connection attempt is delegated to ``_initialize_connection``;
    ``is_initialized`` reports afterwards whether it succeeded.
    """
    # Start in the "not connected" state; the connection step below is the
    # only place these are filled in.
    self.is_initialized = False
    self.vectorstore = None
    self.client = None

    self.collection_name = collection_name
    self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    self._initialize_connection()
33
 
def _initialize_connection(self):
    """Connect to Qdrant and build the vector store.

    Sets ``self.client`` and ``self.vectorstore`` and flips
    ``self.is_initialized`` to True only when every step succeeded.
    Missing credentials and any error are logged, never raised, and leave
    ``is_initialized`` False.
    """
    # Assume failure until the very last step has completed.
    self.is_initialized = False

    try:
        # Check if credentials are available
        if not QDRANT_URL or not QDRANT_API_KEY:
            logger.warning("Qdrant credentials not found. Vector search will be disabled.")
            return

        self.client = QdrantClient(
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            timeout=30,
        )

        # Test connection
        self.client.get_collections()

        self._initialize_collection()

        # _initialize_collection() logs and swallows its own errors, so a
        # failed create would otherwise go unnoticed here. Fetching the
        # collection raises when it is not usable, which routes the failure
        # into the handlers below instead of reporting a false success.
        self.client.get_collection(self.collection_name)

        self.vectorstore = Qdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=self.embeddings,
        )

        self.is_initialized = True
        logger.info(f"Successfully connected to Qdrant collection: {self.collection_name}")

    except UnexpectedResponse as e:
        # Raised for any non-success HTTP reply, not only rejected credentials.
        logger.error(f"Unexpected response from Qdrant (check URL, API key and collection): {e}")
    except Exception as e:
        logger.error(f"Error initializing Qdrant connection: {e}")
72
 
73
  def _initialize_collection(self):
74
  """Initialize Qdrant collection if it doesn't exist"""
75
+ if not self.client:
76
+ return
77
+
78
  try:
79
  collections = self.client.get_collections()
80
+ collection_exists = any(c.name == self.collection_name for c in collections.collections)
81
+
82
+ if not collection_exists:
83
  self.client.create_collection(
84
  collection_name=self.collection_name,
85
  vectors_config=models.VectorParams(
 
87
  distance=models.Distance.COSINE
88
  )
89
  )
90
+ logger.info(f"Created new collection: {self.collection_name}")
91
+ else:
92
+ # Check if collection has data
93
+ collection_info = self.client.get_collection(self.collection_name)
94
+ logger.info(f"Collection {self.collection_name} exists with {collection_info.points_count} points")
95
+
96
  except Exception as e:
97
+ logger.error(f"Error initializing collection: {e}")
98
+ self.is_initialized = False
99
 
100
  def add_pdf(self, pdf_path):
101
  """Add PDF to vector database"""
102
+ if not self.is_initialized:
103
+ logger.error("Vector database not initialized. Cannot add PDF.")
104
+ return False
105
+
106
  try:
107
  loader = PyPDFLoader(pdf_path)
108
  docs = loader.load()
 
110
  split_docs = splitter.split_documents(docs)
111
 
112
  book_name = os.path.splitext(os.path.basename(pdf_path))[0]
113
+ logger.info(f"Processing {book_name} with {len(split_docs)} chunks")
114
 
115
  for doc in split_docs:
 
116
  doc.metadata = {
117
  "source": book_name,
118
  "page": doc.metadata.get('page', 1),
119
  "id": str(uuid.uuid4())
120
  }
121
 
 
122
  self.vectorstore.add_documents(split_docs)
123
+ logger.info(f"Successfully added {len(split_docs)} chunks from {book_name}")
124
  return True
125
+
126
  except Exception as e:
127
+ logger.error(f"Error adding PDF: {e}")
128
  return False
129
 
def search(self, query, top_k=5):
    """Search documents based on query.

    Args:
        query: Text to match against the stored document chunks.
        top_k: Maximum number of results to request.

    Returns:
        A list of dicts with the keys "source", "page", "content" (first
        500 characters of the chunk) and "confidence" (0-100, higher is a
        closer match). The list is empty when the database is not
        initialized, the collection holds no points, or the search fails.
    """
    if not self.is_initialized:
        logger.warning("Vector database not initialized. Returning empty results.")
        return []

    try:
        # Check if collection has any data
        collection_info = self.client.get_collection(self.collection_name)
        if collection_info.points_count == 0:
            logger.warning(f"Collection {self.collection_name} is empty. No documents to search.")
            return []

        results = self.vectorstore.similarity_search_with_score(query, k=top_k)

        formatted = []
        for doc, score in results:
            # The collection is created with cosine distance, for which
            # Qdrant reports a similarity (higher = closer match), so the
            # score maps straight onto a percentage. Inverting it would rank
            # the best matches lowest. Clamp so negative similarities and
            # float noise stay inside 0-100.
            confidence = max(0.0, min(100.0, score * 100))

            formatted.append({
                "source": doc.metadata.get('source', 'Unknown'),
                "page": doc.metadata.get('page', 0),
                "content": doc.page_content[:500],
                "confidence": round(confidence, 2),
            })

        logger.info(f"Found {len(formatted)} results for query: {query[:50]}...")
        return formatted

    except Exception as e:
        logger.error(f"Search error: {e}")
        return []
164
 
def get_book_info(self):
    """Retrieve list of unique book sources in the collection.

    Returns:
        A list of distinct source names in arbitrary order. The list is
        empty when the database is not initialized, the collection does not
        exist or holds no points, or the lookup fails.
    """
    if not self.is_initialized:
        logger.warning("Vector database not initialized.")
        return []

    try:
        # Check if collection exists
        collections = self.client.get_collections()
        if not any(c.name == self.collection_name for c in collections.collections):
            logger.info(f"Collection {self.collection_name} does not exist yet")
            return []

        collection_info = self.client.get_collection(self.collection_name)
        if collection_info.points_count == 0:
            logger.info("Collection is empty")
            return []

        books = set()
        offset = None
        # Walk every page: a single scroll call returns at most ``limit``
        # points, so stopping after the first page would miss sources that
        # only occur later in a large collection.
        while True:
            points, offset = self.client.scroll(
                collection_name=self.collection_name,
                limit=1000,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )

            for point in points:
                payload = getattr(point, 'payload', None)
                if not payload:
                    continue
                # The source may be nested under "metadata" or stored at the
                # top level of the payload; a null "metadata" counts as absent.
                metadata = payload.get('metadata') or {}
                if 'source' in metadata:
                    books.add(metadata['source'])
                elif 'source' in payload:
                    books.add(payload['source'])

            # A None offset marks the last page; the empty-page check guards
            # against looping forever on an unexpected reply.
            if offset is None or not points:
                break

        logger.info(f"Found {len(books)} unique books in collection")
        return list(books)

    except Exception as e:
        logger.error(f"Error retrieving book info: {e}")
        return []
206
+
def is_available(self):
    """Check if vector database is available and has data.

    Returns:
        True only when the database is initialized and the collection
        reports at least one point. False when it is not initialized, the
        collection is empty, or the collection lookup fails.
    """
    if not self.is_initialized:
        return False

    try:
        collection_info = self.client.get_collection(self.collection_name)
        return collection_info.points_count > 0
    except Exception as e:
        # Catch Exception rather than everything so KeyboardInterrupt and
        # SystemExit still propagate, and log the reason so an unavailable
        # database can be diagnosed.
        logger.warning(f"Vector database availability check failed: {e}")
        return False