File size: 10,808 Bytes
1367957
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
# processing/pinecone_manager.py
"""

Pinecone cloud vector database implementation

Scalable cloud-based vector search

"""

import pinecone
from typing import List, Dict, Any, Optional
import time
from embeddings.embedding_models import EmbeddingManager
from embeddings.text_chunking import ResearchPaperChunker


class PineconeManager:
    """Pinecone cloud vector database manager.

    Chunks papers with a ``ResearchPaperChunker``, embeds the chunk texts with
    an ``EmbeddingManager`` and stores / queries the resulting vectors in a
    Pinecone index.

    ``self.index`` is only assigned once Pinecone was initialised
    successfully; every public method checks for it first and degrades to a
    "not initialised" result instead of raising.

    NOTE(review): the module-level ``pinecone.init`` / ``pinecone.create_index``
    / ``pinecone.Index`` calls used here look like the pinecone-client 2.x API;
    confirm the pinned client version before upgrading the dependency.
    """

    # Pause between consecutive upsert batches (seconds) to avoid rate limits.
    _BATCH_PAUSE_SECONDS = 0.1

    def __init__(self,
                 api_key: Optional[str] = None,
                 environment: str = "us-west1-gcp",
                 index_name: str = "medical-research-papers",
                 embedding_model: str = "all-MiniLM-L6-v2",
                 chunk_strategy: str = "semantic"):
        """Set up the embedding/chunking helpers and connect to Pinecone.

        Args:
            api_key: Pinecone API key. When omitted (or empty) the
                ``PINECONE_API_KEY`` environment variable is used instead.
            environment: Pinecone environment passed to ``pinecone.init``.
            index_name: Index to connect to; created if it is not listed yet.
            embedding_model: Model name handed to ``EmbeddingManager``.
            chunk_strategy: Strategy name handed to ``ResearchPaperChunker``.

        Raises:
            Exception: Any error raised while initialising Pinecone or
                creating/connecting to the index is logged and re-raised.
        """
        # Function-scope import so this class does not depend on a file-level
        # ``import os``.
        import os

        # The warning below asks the user to set PINECONE_API_KEY, so honour
        # that variable when no key is passed explicitly.
        if not api_key:
            api_key = os.getenv('PINECONE_API_KEY')

        self.api_key = api_key
        self.environment = environment
        self.index_name = index_name
        self.embedding_manager = EmbeddingManager(embedding_model)
        self.chunker = ResearchPaperChunker(chunk_strategy)

        if not api_key:
            # Leave ``self.index`` unset: the manager stays usable as an
            # object, but every operation reports "not initialised".
            print("⚠️  Pinecone API key not provided. Please set PINECONE_API_KEY environment variable.")
            return

        # Initialize Pinecone
        try:
            pinecone.init(api_key=api_key, environment=environment)
            print(f"βœ… Pinecone initialized: {environment}")

            # Create or connect to index
            if index_name not in pinecone.list_indexes():
                print(f"πŸ†• Creating new Pinecone index: {index_name}")
                self._create_index()
            else:
                print(f"πŸ“‚ Connecting to existing index: {index_name}")

            self.index = pinecone.Index(index_name)
            print("βœ… Pinecone index ready")

        except Exception as e:
            print(f"❌ Pinecone initialization error: {e}")
            raise

    def _index_ready(self) -> bool:
        """Return True when ``__init__`` attached a Pinecone index."""
        return getattr(self, 'index', None) is not None

    @staticmethod
    def _match_field(container: Any, name: str, default: Any = None) -> Any:
        """Read ``container[name]``, returning ``default`` if absent or None.

        Works for plain dicts and for response objects that raise
        ``KeyError`` / ``AttributeError`` subclasses on missing fields.
        """
        try:
            value = container[name]
        except (KeyError, AttributeError):
            return default
        return default if value is None else value

    def _create_index(self):
        """Create the Pinecone index ``self.index_name`` and wait until ready.

        The index uses the embedding model's dimension, the cosine metric and
        indexes the ``domain``, ``source`` and ``publication_date`` metadata
        fields.

        Raises:
            Exception: Any error from Pinecone is logged and re-raised.
        """
        try:
            dimension = self.embedding_manager.get_embedding_dimensions()

            pinecone.create_index(
                name=self.index_name,
                dimension=dimension,
                metric="cosine",
                metadata_config={
                    "indexed": ["domain", "source", "publication_date"]
                }
            )

            # Wait for index to be ready.
            # NOTE(review): this polls once per second with no upper bound;
            # consider a timeout if index creation can stall.
            while not pinecone.describe_index(self.index_name).status['ready']:
                time.sleep(1)

            print(f"βœ… Pinecone index created: {self.index_name} (dimension: {dimension})")

        except Exception as e:
            print(f"❌ Error creating Pinecone index: {e}")
            raise

    def add_papers(self, papers: List[Dict[str, Any]], batch_size: int = 100) -> bool:
        """Chunk, embed and upsert ``papers`` into the Pinecone index.

        Vector ids have the form ``"<paper_id>_chunk_<k>"`` where ``k`` is the
        position of the chunk *within its own paper*, so re-adding a paper
        overwrites its earlier vectors regardless of which other papers share
        the batch.

        Args:
            papers: Paper dicts handed to ``self.chunker.batch_chunk_papers``.
            batch_size: Number of vectors per upsert call; must be >= 1.

        Returns:
            True when every batch was upserted; False when the index is not
            initialised, ``batch_size`` is invalid, no chunks were produced,
            or any error occurred (errors are logged, not raised).
        """
        try:
            if not self._index_ready():
                print("❌ Pinecone not initialized properly")
                return False

            # A negative step would make the upload loop a silent no-op that
            # still reported success; zero would raise inside range().
            if batch_size < 1:
                print(f"❌ Error adding papers to Pinecone: batch_size must be >= 1, got {batch_size}")
                return False

            # Chunk all papers
            all_chunks = self.chunker.batch_chunk_papers(papers)

            if not all_chunks:
                print("⚠️  No chunks generated from papers")
                return False

            # Embed every chunk text in one call.
            chunk_texts = [chunk['text'] for chunk in all_chunks]
            embeddings = self.embedding_manager.encode(chunk_texts)

            # Prepare vectors for Pinecone
            vectors = []
            next_chunk_index: Dict[Any, int] = {}  # paper_id -> next per-paper index
            for position, chunk in enumerate(all_chunks):
                paper_id = chunk['paper_id']
                chunk_index = next_chunk_index.get(paper_id, 0)
                next_chunk_index[paper_id] = chunk_index + 1

                vector_id = f"{paper_id}_chunk_{chunk_index}"

                metadata = {
                    'paper_id': paper_id,
                    'paper_title': chunk['paper_title'],
                    'text': chunk['text'],
                    'source': chunk['source'],
                    'domain': chunk['domain'],
                    'publication_date': chunk.get('publication_date', ''),
                    'chunk_strategy': chunk.get('chunk_strategy', 'semantic'),
                    'chunk_index': chunk_index,
                    'start_char': chunk.get('start_char', 0),
                    'end_char': chunk.get('end_char', 0)
                }

                # Add authors if available
                if chunk.get('authors'):
                    metadata['authors'] = ','.join(chunk['authors'][:3])  # Limit author list

                vectors.append((vector_id, embeddings[position].tolist(), metadata))

            # Upload in batches
            total_vectors = len(vectors)
            for start in range(0, total_vectors, batch_size):
                batch_end = min(start + batch_size, total_vectors)

                self.index.upsert(vectors=vectors[start:batch_end])
                print(f"πŸ“¦ Uploaded batch {start // batch_size + 1}: {start}-{batch_end - 1} vectors")

                # Small delay to avoid rate limits; pointless after the last batch.
                if batch_end < total_vectors:
                    time.sleep(self._BATCH_PAUSE_SECONDS)

            print(f"βœ… Successfully uploaded {total_vectors} vectors from {len(papers)} papers")
            return True

        except Exception as e:
            print(f"❌ Error adding papers to Pinecone: {e}")
            return False

    def search(self,
               query: str,
               domain: Optional[str] = None,
               n_results: int = 10,
               include_metadata: bool = True,
               include_values: bool = False) -> List[Dict[str, Any]]:
        """Search for paper chunks similar to ``query``.

        Args:
            query: Free-text query; embedded with ``self.embedding_manager``.
            domain: When given, restricts matches to this ``domain`` metadata
                value.
            n_results: Passed to Pinecone as ``top_k``.
            include_metadata: Ask Pinecone to return stored metadata.
            include_values: Ask Pinecone to return the stored vectors.

        Returns:
            One dict per match with keys ``text``, ``metadata``, ``distance``
            and ``id``. ``distance`` holds Pinecone's match *score* (the key
            name is kept for existing callers). When a match carries no
            metadata (e.g. ``include_metadata=False``), ``metadata`` is ``{}``
            and ``text`` is ``''``. Returns ``[]`` when the index is not
            initialised or any error occurs (errors are logged, not raised).
        """
        try:
            if not self._index_ready():
                print("❌ Pinecone not initialized properly")
                return []

            # Encode query
            query_embedding = self.embedding_manager.encode([query])[0].tolist()

            # Build filter
            filter_dict = {}
            if domain:
                filter_dict['domain'] = {'$eq': domain}

            # Perform search
            results = self.index.query(
                vector=query_embedding,
                top_k=n_results,
                filter=filter_dict if filter_dict else None,
                include_metadata=include_metadata,
                include_values=include_values
            )

            # Format results. Metadata is optional: indexing it blindly made
            # every include_metadata=False search fail and return [].
            formatted_results = []
            for match in results['matches']:
                metadata = self._match_field(match, 'metadata', {})
                formatted_results.append({
                    'text': self._match_field(metadata, 'text', ''),
                    'metadata': metadata,
                    'distance': match['score'],  # Pinecone uses score (cosine similarity)
                    'id': match['id']
                })

            return formatted_results

        except Exception as e:
            print(f"❌ Pinecone search error: {e}")
            return []

    def get_collection_stats(self) -> Dict[str, Any]:
        """Get statistics about the Pinecone index.

        Returns:
            A dict with ``total_vectors``, ``dimension``, ``index_fullness``,
            ``namespaces`` and ``embedding_model``;
            ``{"error": "Pinecone not initialized"}`` when no index is
            attached; ``{}`` when fetching the stats fails (the error is
            logged).
        """
        try:
            if not self._index_ready():
                return {"error": "Pinecone not initialized"}

            stats = self.index.describe_index_stats()

            return {
                "total_vectors": stats['total_vector_count'],
                "dimension": stats['dimension'],
                "index_fullness": stats.get('index_fullness', 0),
                "namespaces": stats.get('namespaces', {}),
                "embedding_model": self.embedding_manager.model_name
            }

        except Exception as e:
            print(f"❌ Error getting Pinecone stats: {e}")
            return {}

    def delete_paper(self, paper_id: str) -> bool:
        """Delete all vectors whose ``paper_id`` metadata equals ``paper_id``.

        The ids are discovered with a metadata-filtered query (at most 10000
        matches) and then deleted by id.

        Returns:
            True when at least one vector was deleted; False when none were
            found, the index is not initialised, or an error occurred (errors
            are logged, not raised).
        """
        try:
            if not self._index_ready():
                print("❌ Pinecone not initialized properly")
                return False

            # Find all vectors for this paper. Only the filter matters here;
            # the query vector is a placeholder.
            # NOTE(review): an all-zero vector has no defined cosine
            # similarity; confirm Pinecone accepts it for this index type.
            results = self.index.query(
                vector=[0] * self.embedding_manager.get_embedding_dimensions(),  # Dummy vector
                filter={'paper_id': {'$eq': paper_id}},
                top_k=10000,  # Large number to get all matches
                include_metadata=False
            )

            vector_ids = [match['id'] for match in results['matches']]

            if vector_ids:
                self.index.delete(ids=vector_ids)
                print(f"βœ… Deleted {len(vector_ids)} vectors for paper {paper_id}")
                return True
            else:
                print(f"⚠️  No vectors found for paper {paper_id}")
                return False

        except Exception as e:
            print(f"❌ Error deleting paper {paper_id}: {e}")
            return False

    def update_paper(self, paper_id: str, paper_data: Dict[str, Any]) -> bool:
        """Replace a paper's vectors: delete the old ones, then add ``paper_data``.

        The result of the delete step is ignored, so updating a paper that is
        not in the index yet simply adds it.

        NOTE(review): ``paper_id`` is only used for the delete; presumably it
        should match the id inside ``paper_data`` - verify against callers.

        Returns:
            The result of ``add_papers([paper_data])``; False on error.
        """
        try:
            # First delete existing vectors
            self.delete_paper(paper_id)

            # Then add updated paper
            return self.add_papers([paper_data])

        except Exception as e:
            print(f"❌ Error updating paper {paper_id}: {e}")
            return False


# Quick test (requires actual Pinecone API key)
def test_pinecone_manager():
    """Smoke-test PineconeManager against a live Pinecone index.

    Reads the API key from the ``PINECONE_API_KEY`` environment variable and
    returns early (after printing a hint) when it is not set. Otherwise it
    adds one sample paper to the ``test-medical-papers`` index, runs a search,
    and prints the index statistics. Any exception is reported, not raised.
    """
    import os

    pinecone_key = os.getenv('PINECONE_API_KEY')
    if not pinecone_key:
        print("❌ Pinecone API key not found in environment variables")
        print("   Set PINECONE_API_KEY to test Pinecone functionality")
        return

    sample_paper = {
        'id': 'test_001',
        'title': 'AI in Medical Imaging',
        'abstract': 'Deep learning transforms medical image analysis with improved accuracy.',
        'source': 'test',
        'domain': 'medical_imaging',
        'authors': ['John Doe', 'Jane Smith']
    }
    sample_papers = [sample_paper]

    print("πŸ§ͺ Testing Pinecone Manager")
    print("=" * 50)

    try:
        manager = PineconeManager(
            api_key=pinecone_key,
            index_name="test-medical-papers",
            embedding_model="all-MiniLM-L6-v2"
        )

        # Nothing else to exercise if the upload did not go through.
        if not manager.add_papers(sample_papers):
            print("❌ Failed to add papers")
            return

        print("βœ… Papers added successfully")

        # Query the index and show the top two hits.
        hits = manager.search("medical image analysis", n_results=5)
        print(f"πŸ” Search results: {len(hits)} chunks found")
        for hit in hits[:2]:
            title = hit['metadata']['paper_title']
            score = hit['distance']
            print(f"   - {title} (score: {score:.3f})")

        # Report index statistics.
        index_stats = manager.get_collection_stats()
        print(f"πŸ“Š Collection stats: {index_stats}")

    except Exception as e:
        print(f"❌ Pinecone test failed: {e}")

# Run the live smoke test only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    test_pinecone_manager()