Spaces:
Sleeping
Sleeping
Commit
·
e02b28a
1
Parent(s):
fe1a3c4
update
Browse files- app/services/chat_processor.py +152 -158
- app/services/vector_database_search.py +126 -36
app/services/chat_processor.py
CHANGED
|
@@ -9,7 +9,16 @@ from app.services.environmental_condition import EnvironmentalData
|
|
| 9 |
from app.services.prompts import *
|
| 10 |
from app.services.vector_database_search import VectorDatabaseSearch
|
| 11 |
import re
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
class ChatProcessor:
|
| 15 |
def __init__(self, token: str, session_id: Optional[str] = None, num_results: int = 3, num_images: int = 3):
|
|
@@ -58,26 +67,32 @@ class ChatProcessor:
|
|
| 58 |
name = profile['name']
|
| 59 |
age = profile['age']
|
| 60 |
self.chat_session.load_chat_history()
|
| 61 |
-
self.chat_session.update_title(self.session_id,query)
|
| 62 |
history = self.chat_session.format_history()
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
enhanced_query = Model().send_message_openrouter(history_based_prompt)
|
| 67 |
|
| 68 |
self.session_id = self.ensure_valid_session(title=enhanced_query)
|
| 69 |
permission = self.chat_session.get_user_preferences()
|
| 70 |
-
websearch_enabled
|
| 71 |
env_recommendations = permission.get('environmental_recommendations', False)
|
| 72 |
personalized_recommendations = permission.get('personalized_recommendations', False)
|
| 73 |
keywords_permission = permission.get('keywords', False)
|
| 74 |
reference_permission = permission.get('references', False)
|
| 75 |
language = self.chat_session.get_language().lower()
|
| 76 |
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 82 |
future_web = executor.submit(self.web_searcher.search, enhanced_query)
|
| 83 |
future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
|
|
@@ -93,186 +108,165 @@ class ChatProcessor:
|
|
| 93 |
references.append(result['link'])
|
| 94 |
|
| 95 |
context = "\n".join(context_parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
|
| 97 |
-
if env_recommendations and personalized_recommendations:
|
| 98 |
-
prompt = ENVIRONMENTAL_PERSONALIZED_PROMPT.format(
|
| 99 |
-
user_name=name,
|
| 100 |
-
user_age=age,
|
| 101 |
-
history=history,
|
| 102 |
-
user_details=self.chat_session.get_personalized_recommendation(),
|
| 103 |
-
environmental_condition=self.environment_data.get_environmental_data(),
|
| 104 |
-
previous_history=history,
|
| 105 |
-
context=context,
|
| 106 |
-
current_query=enhanced_query
|
| 107 |
-
)
|
| 108 |
-
elif personalized_recommendations:
|
| 109 |
-
prompt = PERSONALIZED_PROMPT.format(
|
| 110 |
-
user_name=name,
|
| 111 |
-
user_age=age,
|
| 112 |
-
user_details=self.chat_session.get_personalized_recommendation(),
|
| 113 |
-
previous_history=history,
|
| 114 |
-
context=context,
|
| 115 |
-
current_query=enhanced_query
|
| 116 |
-
)
|
| 117 |
-
elif env_recommendations :
|
| 118 |
-
prompt = ENVIRONMENTAL_PROMPT.format(
|
| 119 |
-
user_name=name,
|
| 120 |
-
user_age=age,
|
| 121 |
-
environmental_condition=self.environment_data.get_environmental_data(),
|
| 122 |
-
previous_history=history,
|
| 123 |
-
context=context,
|
| 124 |
-
current_query=enhanced_query
|
| 125 |
-
)
|
| 126 |
-
else:
|
| 127 |
-
prompt = DEFAULT_PROMPT.format(
|
| 128 |
-
previous_history=history,
|
| 129 |
-
context=context,
|
| 130 |
-
current_query=enhanced_query
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
prompt = prompt + language_prompt
|
| 134 |
-
|
| 135 |
-
response = Model().llm(prompt,enhanced_query)
|
| 136 |
-
|
| 137 |
-
keywords = ""
|
| 138 |
-
|
| 139 |
-
if (keywords_permission):
|
| 140 |
-
keywords = self.extract_keywords_yake(response, language=language)
|
| 141 |
-
if (not reference_permission):
|
| 142 |
-
references = ""
|
| 143 |
-
|
| 144 |
-
chat_data = {
|
| 145 |
-
"query": enhanced_query,
|
| 146 |
-
"response": response,
|
| 147 |
-
"references": references,
|
| 148 |
-
"page_no": "",
|
| 149 |
-
"keywords": keywords,
|
| 150 |
-
"images": image_results,
|
| 151 |
-
"context": context,
|
| 152 |
-
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 153 |
-
"session_id": self.chat_session.session_id
|
| 154 |
-
}
|
| 155 |
-
|
| 156 |
-
if not self.chat_session.save_chat(chat_data):
|
| 157 |
-
raise ValueError("Failed to save chat message")
|
| 158 |
-
return chat_data
|
| 159 |
-
|
| 160 |
else:
|
|
|
|
| 161 |
attach_image = False
|
| 162 |
|
| 163 |
-
with ThreadPoolExecutor(max_workers=
|
| 164 |
future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
|
| 165 |
image_results = future_images.result()
|
| 166 |
|
| 167 |
start_time = datetime.now(timezone.utc)
|
| 168 |
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
context_parts = []
|
| 172 |
references = []
|
| 173 |
-
seen_pages = set()
|
| 174 |
|
| 175 |
for result in results:
|
| 176 |
-
confidence = result
|
| 177 |
-
|
|
|
|
| 178 |
context_parts.append(f"Content: {result['content']}")
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
| 184 |
|
| 185 |
context = "\n".join(context_parts)
|
| 186 |
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
user_details=self.chat_session.get_personalized_recommendation(),
|
| 206 |
-
previous_history=history,
|
| 207 |
-
context=context,
|
| 208 |
-
current_query=enhanced_query
|
| 209 |
-
)
|
| 210 |
-
elif env_recommendations :
|
| 211 |
-
prompt = ENVIRONMENTAL_PROMPT.format(
|
| 212 |
-
user_name=name,
|
| 213 |
-
user_age=age,
|
| 214 |
-
environmental_condition=self.environment_data.get_environmental_data(),
|
| 215 |
-
previous_history=history,
|
| 216 |
-
context=context,
|
| 217 |
-
current_query=enhanced_query
|
| 218 |
-
)
|
| 219 |
-
else:
|
| 220 |
-
prompt = DEFAULT_PROMPT.format(
|
| 221 |
-
previous_history=history,
|
| 222 |
-
context=context,
|
| 223 |
-
current_query=enhanced_query
|
| 224 |
-
)
|
| 225 |
-
|
| 226 |
-
prompt = prompt + language_prompt
|
| 227 |
-
|
| 228 |
-
response = Model().response = Model().llm(prompt,query)
|
| 229 |
|
| 230 |
end_time = datetime.now(timezone.utc)
|
| 231 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 232 |
keywords = ""
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
"keywords": keywords,
|
| 250 |
-
"images": image_results,
|
| 251 |
-
"context": context,
|
| 252 |
-
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 253 |
-
"session_id": self.chat_session.session_id
|
| 254 |
-
}
|
| 255 |
match = re.search(r'(## Personal Recommendations|## Environmental Considerations)', response)
|
| 256 |
-
if match
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
|
| 264 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
except Exception as e:
|
|
|
|
| 267 |
return {
|
| 268 |
"error": str(e),
|
| 269 |
"query": query,
|
| 270 |
-
"response": "
|
| 271 |
"timestamp": datetime.now(timezone.utc).isoformat()
|
| 272 |
}
|
| 273 |
|
| 274 |
def web_search(self, query: str) -> Dict[str, Any]:
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
else:
|
| 278 |
-
return self.process_chat(query=query)
|
|
|
|
| 9 |
from app.services.prompts import *
|
| 10 |
from app.services.vector_database_search import VectorDatabaseSearch
|
| 11 |
import re
|
| 12 |
+
import logging
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
# Initialize vector database with error handling
|
| 17 |
+
try:
|
| 18 |
+
vectordb = VectorDatabaseSearch()
|
| 19 |
+
except Exception as e:
|
| 20 |
+
logger.error(f"Failed to initialize vector database: {e}")
|
| 21 |
+
vectordb = None
|
| 22 |
|
| 23 |
class ChatProcessor:
|
| 24 |
def __init__(self, token: str, session_id: Optional[str] = None, num_results: int = 3, num_images: int = 3):
|
|
|
|
| 67 |
name = profile['name']
|
| 68 |
age = profile['age']
|
| 69 |
self.chat_session.load_chat_history()
|
| 70 |
+
self.chat_session.update_title(self.session_id, query)
|
| 71 |
history = self.chat_session.format_history()
|
| 72 |
|
| 73 |
+
# Enhanced query generation
|
| 74 |
+
history_based_prompt = HISTORY_BASED_PROMPT.format(history=history, query=query)
|
| 75 |
enhanced_query = Model().send_message_openrouter(history_based_prompt)
|
| 76 |
|
| 77 |
self.session_id = self.ensure_valid_session(title=enhanced_query)
|
| 78 |
permission = self.chat_session.get_user_preferences()
|
| 79 |
+
websearch_enabled = permission.get('websearch', False)
|
| 80 |
env_recommendations = permission.get('environmental_recommendations', False)
|
| 81 |
personalized_recommendations = permission.get('personalized_recommendations', False)
|
| 82 |
keywords_permission = permission.get('keywords', False)
|
| 83 |
reference_permission = permission.get('references', False)
|
| 84 |
language = self.chat_session.get_language().lower()
|
| 85 |
|
| 86 |
+
language_prompt = LANGUAGE_RESPONSE_PROMPT.format(language=language)
|
| 87 |
|
| 88 |
+
# Check if vector database is available when websearch is disabled
|
| 89 |
+
vector_db_available = vectordb and vectordb.is_available() if not websearch_enabled else False
|
| 90 |
+
|
| 91 |
+
# If websearch is disabled and vector DB is not available, enable websearch as fallback
|
| 92 |
+
use_websearch = websearch_enabled or not vector_db_available
|
| 93 |
+
|
| 94 |
+
if use_websearch:
|
| 95 |
+
logger.info("Using web search for context")
|
| 96 |
with ThreadPoolExecutor(max_workers=2) as executor:
|
| 97 |
future_web = executor.submit(self.web_searcher.search, enhanced_query)
|
| 98 |
future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
|
|
|
|
| 108 |
references.append(result['link'])
|
| 109 |
|
| 110 |
context = "\n".join(context_parts)
|
| 111 |
+
|
| 112 |
+
# If web search returns no results, provide a helpful context
|
| 113 |
+
if not context:
|
| 114 |
+
context = "No specific information found. Please provide general dermatological advice based on your expertise."
|
| 115 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
else:
|
| 117 |
+
logger.info("Using vector database for context")
|
| 118 |
attach_image = False
|
| 119 |
|
| 120 |
+
with ThreadPoolExecutor(max_workers=1) as executor:
|
| 121 |
future_images = executor.submit(self.web_searcher.search_images, enhanced_query)
|
| 122 |
image_results = future_images.result()
|
| 123 |
|
| 124 |
start_time = datetime.now(timezone.utc)
|
| 125 |
|
| 126 |
+
# Search vector database
|
| 127 |
+
if vectordb:
|
| 128 |
+
results = vectordb.search(query=enhanced_query, top_k=5) # Increased top_k for better results
|
| 129 |
+
else:
|
| 130 |
+
results = []
|
| 131 |
|
| 132 |
context_parts = []
|
| 133 |
references = []
|
| 134 |
+
seen_pages = set()
|
| 135 |
|
| 136 |
for result in results:
|
| 137 |
+
confidence = result.get('confidence', 0)
|
| 138 |
+
# Lowered confidence threshold for better recall
|
| 139 |
+
if confidence > 30:
|
| 140 |
context_parts.append(f"Content: {result['content']}")
|
| 141 |
+
source = result.get('source', 'Unknown')
|
| 142 |
+
page = result.get('page', 0)
|
| 143 |
+
page_key = f"{source}_{page}"
|
| 144 |
+
if page_key not in seen_pages:
|
| 145 |
+
references.append(f"Source: {source}, Page: {page}")
|
| 146 |
+
seen_pages.add(page_key)
|
| 147 |
+
attach_image = True
|
| 148 |
|
| 149 |
context = "\n".join(context_parts)
|
| 150 |
|
| 151 |
+
# Provide more helpful context when vector search returns nothing
|
| 152 |
+
if not context or len(context) < 50:
|
| 153 |
+
logger.warning("Vector database returned insufficient context")
|
| 154 |
+
# Fall back to web search if available
|
| 155 |
+
if self.web_searcher:
|
| 156 |
+
logger.info("Falling back to web search due to insufficient vector results")
|
| 157 |
+
web_results = self.web_searcher.search(enhanced_query)
|
| 158 |
+
context_parts = []
|
| 159 |
+
references = []
|
| 160 |
+
for idx, result in enumerate(web_results[:3], 1):
|
| 161 |
+
if result['text']:
|
| 162 |
+
context_parts.append(f"From Source {idx}: {result['text']}\n")
|
| 163 |
+
references.append(result['link'])
|
| 164 |
+
context = "\n".join(context_parts)
|
| 165 |
+
|
| 166 |
+
if not context:
|
| 167 |
+
context = "Based on general dermatological knowledge and best practices."
|
| 168 |
+
attach_image = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
|
| 170 |
end_time = datetime.now(timezone.utc)
|
| 171 |
|
| 172 |
+
# Generate appropriate prompt based on user preferences
|
| 173 |
+
if env_recommendations and personalized_recommendations:
|
| 174 |
+
prompt = ENVIRONMENTAL_PERSONALIZED_PROMPT.format(
|
| 175 |
+
user_name=name,
|
| 176 |
+
user_age=age,
|
| 177 |
+
history=history,
|
| 178 |
+
user_details=self.chat_session.get_personalized_recommendation(),
|
| 179 |
+
environmental_condition=self.environment_data.get_environmental_data(),
|
| 180 |
+
previous_history=history,
|
| 181 |
+
context=context,
|
| 182 |
+
current_query=enhanced_query
|
| 183 |
+
)
|
| 184 |
+
elif personalized_recommendations:
|
| 185 |
+
prompt = PERSONALIZED_PROMPT.format(
|
| 186 |
+
user_name=name,
|
| 187 |
+
user_age=age,
|
| 188 |
+
user_details=self.chat_session.get_personalized_recommendation(),
|
| 189 |
+
previous_history=history,
|
| 190 |
+
context=context,
|
| 191 |
+
current_query=enhanced_query
|
| 192 |
+
)
|
| 193 |
+
elif env_recommendations:
|
| 194 |
+
prompt = ENVIRONMENTAL_PROMPT.format(
|
| 195 |
+
user_name=name,
|
| 196 |
+
user_age=age,
|
| 197 |
+
environmental_condition=self.environment_data.get_environmental_data(),
|
| 198 |
+
previous_history=history,
|
| 199 |
+
context=context,
|
| 200 |
+
current_query=enhanced_query
|
| 201 |
+
)
|
| 202 |
+
else:
|
| 203 |
+
prompt = DEFAULT_PROMPT.format(
|
| 204 |
+
previous_history=history,
|
| 205 |
+
context=context,
|
| 206 |
+
current_query=enhanced_query
|
| 207 |
+
)
|
| 208 |
+
|
| 209 |
+
prompt = prompt + "\n" + language_prompt
|
| 210 |
+
|
| 211 |
+
# Generate response
|
| 212 |
+
response = Model().llm(prompt, enhanced_query)
|
| 213 |
+
|
| 214 |
+
# Extract keywords if enabled
|
| 215 |
+
keywords = ""
|
| 216 |
+
if keywords_permission:
|
| 217 |
+
keywords = self.extract_keywords_yake(response, language=language)
|
| 218 |
+
|
| 219 |
+
if not reference_permission:
|
| 220 |
+
references = ""
|
| 221 |
+
|
| 222 |
+
# Prepare images
|
| 223 |
+
if not use_websearch and not attach_image:
|
| 224 |
+
image_results = ""
|
| 225 |
keywords = ""
|
| 226 |
|
| 227 |
+
# Prepare chat data
|
| 228 |
+
chat_data = {
|
| 229 |
+
"query": enhanced_query,
|
| 230 |
+
"response": response,
|
| 231 |
+
"references": references,
|
| 232 |
+
"page_no": "",
|
| 233 |
+
"keywords": keywords,
|
| 234 |
+
"images": image_results if 'image_results' in locals() else "",
|
| 235 |
+
"context": context,
|
| 236 |
+
"timestamp": datetime.now(timezone.utc).isoformat(),
|
| 237 |
+
"session_id": self.chat_session.session_id
|
| 238 |
+
}
|
| 239 |
+
|
| 240 |
+
# Save RAG details if using vector database
|
| 241 |
+
if not use_websearch and 'start_time' in locals() and 'end_time' in locals():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
match = re.search(r'(## Personal Recommendations|## Environmental Considerations)', response)
|
| 243 |
+
truncated_response = response[:match.start()].strip() if match else response
|
| 244 |
+
|
| 245 |
+
if not self.chat_session.save_details(
|
| 246 |
+
session_id=self.session_id,
|
| 247 |
+
context=context,
|
| 248 |
+
query=enhanced_query,
|
| 249 |
+
response=truncated_response,
|
| 250 |
+
rag_start_time=start_time,
|
| 251 |
+
rag_end_time=end_time
|
| 252 |
+
):
|
| 253 |
+
logger.warning("Failed to save RAG details")
|
| 254 |
+
|
| 255 |
+
# Save chat
|
| 256 |
+
if not self.chat_session.save_chat(chat_data):
|
| 257 |
+
raise ValueError("Failed to save chat message")
|
| 258 |
+
|
| 259 |
+
return chat_data
|
| 260 |
|
| 261 |
except Exception as e:
|
| 262 |
+
logger.error(f"Error in process_chat: {str(e)}")
|
| 263 |
return {
|
| 264 |
"error": str(e),
|
| 265 |
"query": query,
|
| 266 |
+
"response": "I apologize, but I'm experiencing technical difficulties. Please try again or enable web search in your preferences for better results.",
|
| 267 |
"timestamp": datetime.now(timezone.utc).isoformat()
|
| 268 |
}
|
| 269 |
|
| 270 |
def web_search(self, query: str) -> Dict[str, Any]:
    """Serve the web search endpoint by running the regular chat pipeline.

    Args:
        query: The user's query, forwarded unchanged.

    Returns:
        Whatever ``process_chat`` returns for this query.
    """
    chat_result = self.process_chat(query=query)
    return chat_result
|
|
|
|
|
|
app/services/vector_database_search.py
CHANGED
|
@@ -5,33 +5,81 @@ from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
| 5 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 6 |
from langchain_qdrant import Qdrant
|
| 7 |
from qdrant_client import QdrantClient, models
|
|
|
|
| 8 |
from dotenv import load_dotenv
|
|
|
|
| 9 |
|
| 10 |
load_dotenv()
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
|
| 13 |
QDRANT_URL = os.getenv("QDRANT_URL")
|
| 14 |
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
|
| 15 |
-
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME")
|
| 16 |
|
| 17 |
class VectorDatabaseSearch:
|
| 18 |
def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
|
| 19 |
self.collection_name = collection_name
|
| 20 |
self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
|
| 21 |
-
self.client =
|
| 22 |
-
self.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
def _initialize_collection(self):
|
| 31 |
"""Initialize Qdrant collection if it doesn't exist"""
|
|
|
|
|
|
|
|
|
|
| 32 |
try:
|
| 33 |
collections = self.client.get_collections()
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
self.client.create_collection(
|
| 36 |
collection_name=self.collection_name,
|
| 37 |
vectors_config=models.VectorParams(
|
|
@@ -39,12 +87,22 @@ class VectorDatabaseSearch:
|
|
| 39 |
distance=models.Distance.COSINE
|
| 40 |
)
|
| 41 |
)
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
except Exception as e:
|
| 44 |
-
|
|
|
|
| 45 |
|
| 46 |
def add_pdf(self, pdf_path):
|
| 47 |
"""Add PDF to vector database"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
try:
|
| 49 |
loader = PyPDFLoader(pdf_path)
|
| 50 |
docs = loader.load()
|
|
@@ -52,75 +110,107 @@ class VectorDatabaseSearch:
|
|
| 52 |
split_docs = splitter.split_documents(docs)
|
| 53 |
|
| 54 |
book_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 55 |
-
|
| 56 |
|
| 57 |
for doc in split_docs:
|
| 58 |
-
# Ensure metadata is stored in a consistent way
|
| 59 |
doc.metadata = {
|
| 60 |
"source": book_name,
|
| 61 |
"page": doc.metadata.get('page', 1),
|
| 62 |
"id": str(uuid.uuid4())
|
| 63 |
}
|
| 64 |
|
| 65 |
-
# Add documents to vector store
|
| 66 |
self.vectorstore.add_documents(split_docs)
|
| 67 |
-
|
| 68 |
return True
|
|
|
|
| 69 |
except Exception as e:
|
| 70 |
-
|
| 71 |
return False
|
| 72 |
|
| 73 |
def search(self, query, top_k=5):
|
| 74 |
"""Search documents based on query"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
results = self.vectorstore.similarity_search_with_score(query, k=top_k)
|
| 77 |
|
| 78 |
formatted = []
|
| 79 |
for doc, score in results:
|
|
|
|
|
|
|
|
|
|
| 80 |
formatted.append({
|
| 81 |
-
"source": doc.metadata
|
| 82 |
-
"page": doc.metadata
|
| 83 |
"content": doc.page_content[:500],
|
| 84 |
-
"confidence": round(
|
| 85 |
})
|
|
|
|
|
|
|
| 86 |
return formatted
|
|
|
|
| 87 |
except Exception as e:
|
| 88 |
-
|
| 89 |
return []
|
| 90 |
|
| 91 |
def get_book_info(self):
|
| 92 |
"""Retrieve list of unique book sources in the collection"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
try:
|
| 94 |
-
#
|
| 95 |
collections = self.client.get_collections()
|
| 96 |
if not any(c.name == self.collection_name for c in collections.collections):
|
| 97 |
-
|
| 98 |
return []
|
| 99 |
-
|
| 100 |
-
# Get
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
points = self.client.scroll(
|
| 102 |
collection_name=self.collection_name,
|
| 103 |
-
limit=1000,
|
| 104 |
with_payload=True,
|
| 105 |
-
with_vectors=False
|
| 106 |
)[0]
|
| 107 |
|
| 108 |
-
# Debug information
|
| 109 |
-
print(f"Retrieved {len(points)} points from collection")
|
| 110 |
-
|
| 111 |
-
# Extract unique book sources from payloads
|
| 112 |
books = set()
|
| 113 |
for point in points:
|
| 114 |
-
# Check if payload exists and has 'metadata' field with 'source'
|
| 115 |
if hasattr(point, 'payload') and point.payload:
|
| 116 |
-
# Check different possible payload structures
|
| 117 |
if 'metadata' in point.payload and 'source' in point.payload['metadata']:
|
| 118 |
books.add(point.payload['metadata']['source'])
|
| 119 |
elif 'source' in point.payload:
|
| 120 |
books.add(point.payload['source'])
|
| 121 |
-
|
| 122 |
-
|
| 123 |
return list(books)
|
|
|
|
| 124 |
except Exception as e:
|
| 125 |
-
|
| 126 |
-
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from langchain_google_genai import GoogleGenerativeAIEmbeddings
|
| 6 |
from langchain_qdrant import Qdrant
|
| 7 |
from qdrant_client import QdrantClient, models
|
| 8 |
+
from qdrant_client.http.exceptions import UnexpectedResponse
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
+
import logging
|
| 11 |
|
| 12 |
load_dotenv()
|
| 13 |
|
| 14 |
+
# Configure logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
os.environ["GOOGLE_API_KEY"] = os.getenv("GEMINI_API_KEY")
|
| 19 |
QDRANT_URL = os.getenv("QDRANT_URL")
|
| 20 |
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
|
| 21 |
+
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "dermatology_docs")
|
| 22 |
|
| 23 |
class VectorDatabaseSearch:
|
| 24 |
def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
    """Set up the embedding model and try to connect to Qdrant.

    Args:
        collection_name: Qdrant collection to use; defaults to the
            module-level ``QDRANT_COLLECTION_NAME``.
    """
    # Start in a "disconnected" state; _initialize_connection() fills these
    # in and flips is_initialized only when the connection succeeds.
    self.is_initialized = False
    self.client = None
    self.vectorstore = None

    self.collection_name = collection_name
    self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

    self._initialize_connection()
|
| 33 |
|
| 34 |
+
def _initialize_connection(self):
    """Connect to Qdrant and build the vector store without ever raising.

    On success ``self.client`` and ``self.vectorstore`` are populated and
    ``self.is_initialized`` becomes ``True``. On any failure (missing
    credentials, an error talking to the server, or a collection that could
    not be prepared) the problem is logged and ``self.is_initialized`` stays
    ``False`` so the other methods return their empty fallbacks.
    """
    # Only a fully successful run below flips this flag to True.
    self.is_initialized = False

    if not QDRANT_URL or not QDRANT_API_KEY:
        logger.warning("Qdrant credentials not found. Vector search will be disabled.")
        return

    try:
        self.client = QdrantClient(
            url=QDRANT_URL,
            api_key=QDRANT_API_KEY,
            timeout=30,
        )

        # Listing collections doubles as a connectivity check.
        self.client.get_collections()

        self._initialize_collection()

        # _initialize_collection() logs and swallows its own errors, so a
        # failed creation would otherwise go unnoticed here and the instance
        # would be marked usable. Confirm the collection really exists.
        collections = self.client.get_collections()
        if not any(c.name == self.collection_name for c in collections.collections):
            logger.error(
                f"Collection {self.collection_name} is not available. "
                "Vector search will be disabled."
            )
            return

        self.vectorstore = Qdrant(
            client=self.client,
            collection_name=self.collection_name,
            embeddings=self.embeddings,
        )

        self.is_initialized = True
        logger.info(f"Successfully connected to Qdrant collection: {self.collection_name}")

    except UnexpectedResponse as e:
        # Raised for any unexpected HTTP reply, not only rejected credentials.
        logger.error(f"Unexpected response from Qdrant (check URL and API key): {e}")
        self.is_initialized = False
    except Exception as e:
        logger.error(f"Error initializing Qdrant connection: {e}")
        self.is_initialized = False
|
| 72 |
|
| 73 |
def _initialize_collection(self):
|
| 74 |
"""Initialize Qdrant collection if it doesn't exist"""
|
| 75 |
+
if not self.client:
|
| 76 |
+
return
|
| 77 |
+
|
| 78 |
try:
|
| 79 |
collections = self.client.get_collections()
|
| 80 |
+
collection_exists = any(c.name == self.collection_name for c in collections.collections)
|
| 81 |
+
|
| 82 |
+
if not collection_exists:
|
| 83 |
self.client.create_collection(
|
| 84 |
collection_name=self.collection_name,
|
| 85 |
vectors_config=models.VectorParams(
|
|
|
|
| 87 |
distance=models.Distance.COSINE
|
| 88 |
)
|
| 89 |
)
|
| 90 |
+
logger.info(f"Created new collection: {self.collection_name}")
|
| 91 |
+
else:
|
| 92 |
+
# Check if collection has data
|
| 93 |
+
collection_info = self.client.get_collection(self.collection_name)
|
| 94 |
+
logger.info(f"Collection {self.collection_name} exists with {collection_info.points_count} points")
|
| 95 |
+
|
| 96 |
except Exception as e:
|
| 97 |
+
logger.error(f"Error initializing collection: {e}")
|
| 98 |
+
self.is_initialized = False
|
| 99 |
|
| 100 |
def add_pdf(self, pdf_path):
|
| 101 |
"""Add PDF to vector database"""
|
| 102 |
+
if not self.is_initialized:
|
| 103 |
+
logger.error("Vector database not initialized. Cannot add PDF.")
|
| 104 |
+
return False
|
| 105 |
+
|
| 106 |
try:
|
| 107 |
loader = PyPDFLoader(pdf_path)
|
| 108 |
docs = loader.load()
|
|
|
|
| 110 |
split_docs = splitter.split_documents(docs)
|
| 111 |
|
| 112 |
book_name = os.path.splitext(os.path.basename(pdf_path))[0]
|
| 113 |
+
logger.info(f"Processing {book_name} with {len(split_docs)} chunks")
|
| 114 |
|
| 115 |
for doc in split_docs:
|
|
|
|
| 116 |
doc.metadata = {
|
| 117 |
"source": book_name,
|
| 118 |
"page": doc.metadata.get('page', 1),
|
| 119 |
"id": str(uuid.uuid4())
|
| 120 |
}
|
| 121 |
|
|
|
|
| 122 |
self.vectorstore.add_documents(split_docs)
|
| 123 |
+
logger.info(f"Successfully added {len(split_docs)} chunks from {book_name}")
|
| 124 |
return True
|
| 125 |
+
|
| 126 |
except Exception as e:
|
| 127 |
+
logger.error(f"Error adding PDF: {e}")
|
| 128 |
return False
|
| 129 |
|
| 130 |
def search(self, query, top_k=5):
    """Search the collection for chunks similar to ``query``.

    Args:
        query: Text to match against the stored documents.
        top_k: Maximum number of hits to request from the vector store.

    Returns:
        list[dict]: One entry per hit with ``source``, ``page``, ``content``
        (first 500 characters of the chunk) and ``confidence`` (0-100, higher
        means a closer match). Empty when the database is not initialized,
        the collection is empty, or the search fails.
    """
    if not self.is_initialized:
        logger.warning("Vector database not initialized. Returning empty results.")
        return []

    try:
        # Avoid running a similarity search against an empty collection.
        collection_info = self.client.get_collection(self.collection_name)
        if collection_info.points_count == 0:
            logger.warning(f"Collection {self.collection_name} is empty. No documents to search.")
            return []

        results = self.vectorstore.similarity_search_with_score(query, k=top_k)

        formatted = []
        for doc, score in results:
            # Assumes the collection uses cosine distance (as configured when
            # this class creates it). Qdrant then reports a similarity score
            # where higher means closer, so it maps directly to a percentage;
            # the previous "1 - score" inverted the ranking. Clamp so negative
            # or slightly-over-1 scores stay inside 0-100.
            confidence = max(0.0, min(1.0, score)) * 100

            formatted.append({
                "source": doc.metadata.get('source', 'Unknown'),
                "page": doc.metadata.get('page', 0),
                "content": doc.page_content[:500],
                "confidence": round(confidence, 2),
            })

        logger.info(f"Found {len(formatted)} results for query: {query[:50]}...")
        return formatted

    except Exception as e:
        logger.error(f"Search error: {e}")
        return []
|
| 164 |
|
| 165 |
def get_book_info(self):
    """Retrieve the unique book sources stored in the collection.

    Returns:
        list: Distinct ``source`` values found in the point payloads, in no
        particular order. Empty when the database is not initialized, the
        collection is missing or empty, or the lookup fails.
    """
    if not self.is_initialized:
        logger.warning("Vector database not initialized.")
        return []

    try:
        collections = self.client.get_collections()
        if not any(c.name == self.collection_name for c in collections.collections):
            logger.info(f"Collection {self.collection_name} does not exist yet")
            return []

        collection_info = self.client.get_collection(self.collection_name)
        if collection_info.points_count == 0:
            logger.info("Collection is empty")
            return []

        books = set()
        offset = None
        # Page through the whole collection. Reading only the first 1000
        # points silently dropped books whose chunks were stored later.
        while True:
            points, offset = self.client.scroll(
                collection_name=self.collection_name,
                limit=1000,
                offset=offset,
                with_payload=True,
                with_vectors=False,
            )

            for point in points:
                payload = getattr(point, 'payload', None)
                if not payload:
                    continue
                # Two payload layouts are accepted: the source nested under
                # 'metadata', or stored at the top level of the payload.
                if 'metadata' in payload and 'source' in payload['metadata']:
                    books.add(payload['metadata']['source'])
                elif 'source' in payload:
                    books.add(payload['source'])

            # A None offset marks the last page; an empty page is treated as
            # the end as well so the loop cannot spin.
            if offset is None or not points:
                break

        logger.info(f"Found {len(books)} unique books in collection")
        return list(books)

    except Exception as e:
        logger.error(f"Error retrieving book info: {e}")
        return []
|
| 206 |
+
|
| 207 |
+
def is_available(self):
    """Report whether the vector database is connected and holds data.

    Returns:
        bool: ``True`` only when the connection was initialized and the
        collection reports at least one stored point; ``False`` otherwise,
        including when the collection lookup fails.
    """
    if not self.is_initialized:
        return False

    try:
        collection_info = self.client.get_collection(self.collection_name)
        # Guard against a missing (None) count; treat it as "no data".
        return (collection_info.points_count or 0) > 0
    except Exception as e:
        # This is a best-effort probe, so report "unavailable" rather than
        # raise. Catching Exception (not a bare except) keeps the reason in
        # the log and lets KeyboardInterrupt / SystemExit propagate.
        logger.warning(f"Vector database availability check failed: {e}")
        return False
|