File size: 8,574 Bytes
75e2b6c
 
 
 
 
 
 
e02b28a
75e2b6c
e02b28a
75e2b6c
 
 
e02b28a
 
 
 
75e2b6c
 
 
e02b28a
75e2b6c
 
 
 
 
e02b28a
 
 
 
 
 
75e2b6c
e02b28a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75e2b6c
 
 
e02b28a
 
 
75e2b6c
 
e02b28a
 
 
75e2b6c
 
 
 
 
 
 
e02b28a
 
 
 
 
 
75e2b6c
e02b28a
 
75e2b6c
 
 
e02b28a
 
 
 
75e2b6c
 
 
 
 
 
 
e02b28a
d2d11be
75e2b6c
 
 
 
 
 
 
 
e02b28a
75e2b6c
e02b28a
75e2b6c
e02b28a
75e2b6c
 
 
 
e02b28a
 
 
 
75e2b6c
e02b28a
 
 
 
 
 
 
75e2b6c
 
 
 
e02b28a
 
 
75e2b6c
e02b28a
 
75e2b6c
e02b28a
75e2b6c
e02b28a
 
75e2b6c
e02b28a
75e2b6c
e02b28a
75e2b6c
 
 
 
e02b28a
 
 
 
75e2b6c
e02b28a
d2d11be
 
e02b28a
d2d11be
e02b28a
 
 
 
 
 
 
 
75e2b6c
 
e02b28a
d2d11be
e02b28a
75e2b6c
 
d2d11be
 
 
 
 
 
 
e02b28a
 
75e2b6c
e02b28a
75e2b6c
e02b28a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
import os
import uuid
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_qdrant import Qdrant
from qdrant_client import QdrantClient, models
from qdrant_client.http.exceptions import UnexpectedResponse
from dotenv import load_dotenv
import logging

# Load variables from a local .env file (if any) before reading the environment.
load_dotenv()

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# The project stores its Google key as GEMINI_API_KEY; it is mirrored into
# GOOGLE_API_KEY (presumably the name the Google embeddings client looks up).
# Only mirror it when it is actually set: assigning None to os.environ raises
# TypeError, which would make this module impossible to import.
_gemini_api_key = os.getenv("GEMINI_API_KEY")
if _gemini_api_key:
    os.environ["GOOGLE_API_KEY"] = _gemini_api_key
else:
    logger.warning("GEMINI_API_KEY is not set; GOOGLE_API_KEY was left unchanged.")

# Qdrant connection settings; the collection name falls back to a default.
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION_NAME = os.getenv("QDRANT_COLLECTION_NAME", "dermatology_docs")

class VectorDatabaseSearch:
    """Semantic search over PDF chunks stored in a Qdrant collection.

    The constructor tries to connect to Qdrant and to create the collection
    if it is missing. Connection problems are logged rather than raised;
    callers should check ``is_initialized`` (or ``is_available()``) before
    relying on results. Every public method degrades to an empty/False
    result when the database is not initialized.
    """

    # Dimensionality used when creating a new collection. It must match the
    # vectors produced by the embedding model configured in __init__.
    _VECTOR_SIZE = 768
    # Number of points fetched per scroll request in get_book_info().
    _SCROLL_PAGE_SIZE = 1000

    def __init__(self, collection_name=QDRANT_COLLECTION_NAME):
        """Create the embedding client and attempt to connect to Qdrant.

        Args:
            collection_name: Qdrant collection to use; defaults to the
                module-level QDRANT_COLLECTION_NAME.
        """
        self.collection_name = collection_name
        self.embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
        self.client = None
        self.vectorstore = None
        self.is_initialized = False

        # Initialize connection
        self._initialize_connection()

    def _initialize_connection(self):
        """Connect to Qdrant, ensure the collection exists, build the vector store.

        Sets ``is_initialized`` to True only if every step succeeded. All
        failures are logged and leave ``is_initialized`` False.
        """
        self.is_initialized = False

        # Check if credentials are available
        if not QDRANT_URL or not QDRANT_API_KEY:
            logger.warning("Qdrant credentials not found. Vector search will be disabled.")
            return

        try:
            # Initialize Qdrant client
            self.client = QdrantClient(
                url=QDRANT_URL,
                api_key=QDRANT_API_KEY,
                timeout=30,  # Add timeout
            )

            # Test connection
            self.client.get_collections()

            # Initialize collection; a failure here has already been logged
            # and must not be overwritten by a later "success".
            if not self._initialize_collection():
                return

            # Initialize vector store
            self.vectorstore = Qdrant(
                client=self.client,
                collection_name=self.collection_name,
                embeddings=self.embeddings,
            )

            self.is_initialized = True
            logger.info(f"Successfully connected to Qdrant collection: {self.collection_name}")

        except UnexpectedResponse as e:
            logger.error(f"Authentication error with Qdrant: {e}")
        except Exception as e:
            logger.error(f"Error initializing Qdrant connection: {e}")

    def _collection_exists(self):
        """Return True if ``self.collection_name`` is listed by the server."""
        listing = self.client.get_collections()
        return any(c.name == self.collection_name for c in listing.collections)

    def _initialize_collection(self):
        """Create the Qdrant collection if it doesn't exist.

        Returns:
            True if the collection exists (or was created), False if there is
            no client or the operation failed.
        """
        if not self.client:
            return False

        try:
            if not self._collection_exists():
                self.client.create_collection(
                    collection_name=self.collection_name,
                    vectors_config=models.VectorParams(
                        size=self._VECTOR_SIZE,
                        distance=models.Distance.COSINE,
                    ),
                )
                logger.info(f"Created new collection: {self.collection_name}")
            else:
                # Check if collection has data
                collection_info = self.client.get_collection(self.collection_name)
                logger.info(f"Collection {self.collection_name} exists with {collection_info.points_count} points")
            return True

        except Exception as e:
            logger.error(f"Error initializing collection: {e}")
            self.is_initialized = False
            return False

    def add_pdf(self, pdf_path):
        """Split a PDF into chunks and add them to the vector database.

        Args:
            pdf_path: Path to the PDF file. Its base name (without extension)
                is stored as the ``source`` of every chunk.

        Returns:
            True on success, False if the database is not initialized or any
            step fails (the error is logged).
        """
        if not self.is_initialized:
            logger.error("Vector database not initialized. Cannot add PDF.")
            return False

        try:
            loader = PyPDFLoader(pdf_path)
            docs = loader.load()
            splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
            split_docs = splitter.split_documents(docs)

            book_name = os.path.splitext(os.path.basename(pdf_path))[0]
            logger.info(f"Processing {book_name} with {len(split_docs)} chunks")

            # Replace the loader's metadata with the fields this class reads
            # back in search() and get_book_info().
            for doc in split_docs:
                doc.metadata = {
                    "source": book_name,
                    "page": doc.metadata.get('page', 1),
                    "id": str(uuid.uuid4()),
                }

            self.vectorstore.add_documents(split_docs)
            logger.info(f"Successfully added {len(split_docs)} chunks from {book_name}")
            return True

        except Exception as e:
            logger.error(f"Error adding PDF: {e}")
            return False

    def search(self, query, top_k=5):
        """Search documents based on query.

        Args:
            query: Text to search for.
            top_k: Maximum number of results to return.

        Returns:
            A list of dicts with ``source``, ``page``, ``content`` (first 500
            characters of the chunk) and ``confidence`` (0-100). Empty when
            the database is unavailable, the collection is empty, or the
            search fails.
        """
        if not self.is_initialized:
            logger.warning("Vector database not initialized. Returning empty results.")
            return []

        try:
            # Check if collection has any data
            collection_info = self.client.get_collection(self.collection_name)
            if collection_info.points_count == 0:
                logger.warning(f"Collection {self.collection_name} is empty. No documents to search.")
                return []

            # Perform similarity search
            results = self.vectorstore.similarity_search_with_score(query, k=top_k)

            formatted = []
            for doc, score in results:
                # The collection is created with COSINE distance, for which
                # Qdrant reports a similarity score (higher = closer match),
                # so the score maps directly to a percentage. Clamp because
                # cosine similarity can be negative.
                # NOTE(review): a pre-existing collection using another
                # metric would need a different mapping - confirm.
                confidence = max(0.0, min(1.0, score)) * 100

                formatted.append({
                    "source": doc.metadata.get('source', 'Unknown'),
                    "page": doc.metadata.get('page', 0),
                    "content": doc.page_content[:500],
                    "confidence": round(confidence, 2),
                })

            logger.info(f"Found {len(formatted)} results for query: {query[:50]}...")
            return formatted

        except Exception as e:
            logger.error(f"Search error: {e}")
            return []

    def get_book_info(self):
        """Retrieve list of unique book sources in the collection.

        Scrolls through every point (page by page) so that sources stored
        beyond the first page are not missed.

        Returns:
            A list of distinct source names, ordered by their string form;
            empty when the database is unavailable, the collection is
            missing/empty, or an error occurs.
        """
        if not self.is_initialized:
            logger.warning("Vector database not initialized.")
            return []

        try:
            # Check if collection exists
            if not self._collection_exists():
                logger.info(f"Collection {self.collection_name} does not exist yet")
                return []

            # Get collection info
            collection_info = self.client.get_collection(self.collection_name)
            if collection_info.points_count == 0:
                logger.info("Collection is empty")
                return []

            books = set()
            offset = None
            while True:
                # scroll() returns (points, next_page_offset); the offset is
                # None once the last page has been delivered.
                points, offset = self.client.scroll(
                    collection_name=self.collection_name,
                    limit=self._SCROLL_PAGE_SIZE,
                    offset=offset,
                    with_payload=True,
                    with_vectors=False,
                )

                for point in points:
                    payload = getattr(point, 'payload', None)
                    if not payload:
                        continue
                    # The source may sit under a nested "metadata" dict or
                    # directly at the top level of the payload.
                    metadata = payload.get('metadata')
                    if isinstance(metadata, dict) and 'source' in metadata:
                        books.add(metadata['source'])
                    elif 'source' in payload:
                        books.add(payload['source'])

                if offset is None or not points:
                    break

            logger.info(f"Found {len(books)} unique books in collection")
            return sorted(books, key=str)

        except Exception as e:
            logger.error(f"Error retrieving book info: {e}")
            return []

    def is_available(self):
        """Check if vector database is available and has data.

        Returns:
            True only when the database is initialized and the collection
            reports at least one point; False otherwise, including on error.
        """
        if not self.is_initialized:
            return False

        try:
            collection_info = self.client.get_collection(self.collection_name)
            # points_count may be None; treat that as "no data".
            return (collection_info.points_count or 0) > 0
        except Exception as e:
            # Not a bare except: KeyboardInterrupt/SystemExit must propagate.
            logger.warning(f"Could not check vector database availability: {e}")
            return False