iamfaham commited on
Commit
3603603
·
verified ·
1 Parent(s): c3980ee

Upload appwrite_service.py

Browse files
Files changed (1) hide show
  1. appwrite_service.py +919 -0
appwrite_service.py ADDED
@@ -0,0 +1,919 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from appwrite.client import Client
4
+ from appwrite.services.databases import Databases
5
+ from appwrite.services.storage import Storage
6
+ from appwrite.input_file import InputFile
7
+ import json
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+ import tempfile
11
+ import time
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class AppwriteService:
    """Thin wrapper around Appwrite Databases/Storage for documentation data."""

    def __init__(self):
        """Initialize the Appwrite client, services, and backing resources.

        Raises:
            ValueError: when required environment variables are missing.
        """
        # Fail fast if required credentials are absent.
        self._validate_environment()

        # Configure the client from the environment before handing it to
        # the service facades.
        client = Client()
        client.set_endpoint(
            os.getenv("APPWRITE_ENDPOINT", "https://cloud.appwrite.io/v1")
        )
        client.set_project(os.getenv("APPWRITE_PROJECT_ID"))
        client.set_key(os.getenv("APPWRITE_API_KEY"))
        self.client = client

        self.databases = Databases(self.client)
        self.storage = Storage(self.client)

        # Identifiers for the database, its collections, and the bucket.
        self.database_id = os.getenv("APPWRITE_DATABASE_ID", "react_docs_db")
        self.chunks_collection_id = os.getenv(
            "APPWRITE_COLLECTION_ID", "document_chunks"
        )
        self.completion_collection_id = "completion_status"
        self.bucket_id = os.getenv("APPWRITE_BUCKET_ID", "react_docs_bucket")

        # Create/verify the server-side resources up front.
        self._initialize_database()
        self._initialize_storage()
51
+
52
+ def _validate_environment(self):
53
+ """Validate that required environment variables are set"""
54
+ required_vars = ["APPWRITE_PROJECT_ID", "APPWRITE_API_KEY"]
55
+
56
+ missing_vars = []
57
+ for var in required_vars:
58
+ if not os.getenv(var):
59
+ missing_vars.append(var)
60
+
61
+ if missing_vars:
62
+ error_msg = (
63
+ f"Missing required environment variables: {', '.join(missing_vars)}"
64
+ )
65
+ logger.error(error_msg)
66
+ logger.error("Please set these variables in your .env file:")
67
+ for var in missing_vars:
68
+ logger.error(f" {var}=your_value_here")
69
+ raise ValueError(error_msg)
70
+
71
    def _initialize_database(self):
        """Ensure the database and both collections exist, creating them if needed.

        Any failure during setup is logged and re-raised to abort construction.
        """
        try:
            # Check if database exists
            try:
                self.databases.get(database_id=self.database_id)
                logger.info(f"Database {self.database_id} already exists")
            except Exception:
                # get() failed — assume the database is missing and create it.
                self.databases.create(
                    database_id=self.database_id, name="React Documentation Database"
                )
                logger.info(f"Created database {self.database_id}")

            # Initialize chunks collection
            self._initialize_chunks_collection()

            # Initialize completion status collection
            self._initialize_completion_collection()

        except Exception as e:
            logger.error(f"Error initializing database: {str(e)}")
            raise
94
+
95
    def _initialize_storage(self):
        """Check if storage bucket exists (don't create if it doesn't).

        Buckets are provisioned out-of-band; a missing bucket is treated as a
        configuration error and re-raised.
        """
        try:
            # Check if bucket exists
            try:
                self.storage.get_bucket(bucket_id=self.bucket_id)
                logger.info(f"Storage bucket {self.bucket_id} exists and is accessible")
            except Exception as e:
                logger.error(
                    f"Storage bucket {self.bucket_id} not found or not accessible: {str(e)}"
                )
                logger.error(
                    "Please make sure the bucket exists and your API key has access to it"
                )
                raise

        except Exception as e:
            logger.error(f"Error checking storage bucket: {str(e)}")
            raise
114
+
115
+ def _initialize_chunks_collection(self):
116
+ """Initialize chunks collection"""
117
+ try:
118
+ # Check if chunks collection exists
119
+ try:
120
+ self.databases.get_collection(
121
+ database_id=self.database_id,
122
+ collection_id=self.chunks_collection_id,
123
+ )
124
+ logger.info(
125
+ f"Chunks collection {self.chunks_collection_id} already exists"
126
+ )
127
+ except Exception:
128
+ # Create chunks collection
129
+ self.databases.create_collection(
130
+ database_id=self.database_id,
131
+ collection_id=self.chunks_collection_id,
132
+ name="Document Chunks",
133
+ )
134
+
135
+ # Create attributes for the chunks collection
136
+ self.databases.create_string_attribute(
137
+ database_id=self.database_id,
138
+ collection_id=self.chunks_collection_id,
139
+ key="content",
140
+ size=65536, # 64KB for content
141
+ required=True,
142
+ )
143
+
144
+ self.databases.create_string_attribute(
145
+ database_id=self.database_id,
146
+ collection_id=self.chunks_collection_id,
147
+ key="title",
148
+ size=255,
149
+ required=True,
150
+ )
151
+
152
+ self.databases.create_string_attribute(
153
+ database_id=self.database_id,
154
+ collection_id=self.chunks_collection_id,
155
+ key="url",
156
+ size=500,
157
+ required=False,
158
+ )
159
+
160
+ self.databases.create_string_attribute(
161
+ database_id=self.database_id,
162
+ collection_id=self.chunks_collection_id,
163
+ key="chunk_id",
164
+ size=100,
165
+ required=True,
166
+ )
167
+
168
+ logger.info(
169
+ f"Created chunks collection {self.chunks_collection_id} with attributes"
170
+ )
171
+
172
+ except Exception as e:
173
+ logger.error(f"Error initializing chunks collection: {str(e)}")
174
+ raise
175
+
176
+ def _initialize_completion_collection(self):
177
+ """Initialize completion status collection"""
178
+ try:
179
+ # Check if completion collection exists
180
+ try:
181
+ self.databases.get_collection(
182
+ database_id=self.database_id,
183
+ collection_id=self.completion_collection_id,
184
+ )
185
+ logger.info(
186
+ f"Completion collection {self.completion_collection_id} already exists"
187
+ )
188
+ except Exception:
189
+ # Create completion collection
190
+ self.databases.create_collection(
191
+ database_id=self.database_id,
192
+ collection_id=self.completion_collection_id,
193
+ name="Completion Status",
194
+ )
195
+
196
+ # Create attributes for the completion collection
197
+ self.databases.create_string_attribute(
198
+ database_id=self.database_id,
199
+ collection_id=self.completion_collection_id,
200
+ key="url",
201
+ size=500,
202
+ required=True,
203
+ )
204
+
205
+ self.databases.create_string_attribute(
206
+ database_id=self.database_id,
207
+ collection_id=self.completion_collection_id,
208
+ key="status",
209
+ size=50,
210
+ required=True,
211
+ )
212
+
213
+ self.databases.create_string_attribute(
214
+ database_id=self.database_id,
215
+ collection_id=self.completion_collection_id,
216
+ key="completed_at",
217
+ size=100,
218
+ required=True,
219
+ )
220
+
221
+ self.databases.create_integer_attribute(
222
+ database_id=self.database_id,
223
+ collection_id=self.completion_collection_id,
224
+ key="chunks_count",
225
+ required=True,
226
+ )
227
+
228
+ logger.info(
229
+ f"Created completion collection {self.completion_collection_id} with attributes"
230
+ )
231
+
232
+ except Exception as e:
233
+ logger.error(f"Error initializing completion collection: {str(e)}")
234
+ raise
235
+
236
+ def get_docs_file_id(self, url: str) -> str:
237
+ """Generate file ID based on the documentation URL"""
238
+ url_lower = url.lower()
239
+
240
+ # Map URLs to file IDs
241
+ if "react.dev" in url_lower or "reactjs.org" in url_lower:
242
+ return "react_docs_raw.json"
243
+ elif "docs.python.org" in url_lower or "python.org" in url_lower:
244
+ return "python_docs_raw.json"
245
+ elif "golang.org" in url_lower or "go.dev" in url_lower:
246
+ return "golang_docs_raw.json"
247
+ elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
248
+ return "mdn_docs_raw.json"
249
+ elif "vuejs.org" in url_lower:
250
+ return "vue_docs_raw.json"
251
+ elif "nodejs.org" in url_lower:
252
+ return "nodejs_docs_raw.json"
253
+ elif "angular.io" in url_lower:
254
+ return "angular_docs_raw.json"
255
+ elif "svelte.dev" in url_lower:
256
+ return "svelte_docs_raw.json"
257
+ elif "nextjs.org" in url_lower:
258
+ return "nextjs_docs_raw.json"
259
+ elif "nuxt.com" in url_lower:
260
+ return "nuxt_docs_raw.json"
261
+ elif "djangoproject.com" in url_lower or "django" in url_lower:
262
+ return "django_docs_raw.json"
263
+ elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
264
+ return "fastapi_docs_raw.json"
265
+ elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
266
+ return "docker_docs_raw.json"
267
+ elif "kubernetes.io" in url_lower:
268
+ return "kubernetes_docs_raw.json"
269
+ elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
270
+ return "mongodb_docs_raw.json"
271
+ elif "postgresql.org" in url_lower or "postgresql" in url_lower:
272
+ return "postgresql_docs_raw.json"
273
+ else:
274
+ # For unknown URLs, create a generic ID based on domain
275
+ from urllib.parse import urlparse
276
+
277
+ parsed = urlparse(url)
278
+ domain = parsed.netloc.replace(".", "_").replace("www_", "")
279
+ return f"{domain}_docs_raw.json"
280
+
281
+ def docs_already_exist(self, url: str) -> bool:
282
+ """Check if documentation for this URL already exists in storage"""
283
+ try:
284
+ file_id = self.get_docs_file_id(url)
285
+ # Try to get the file from storage
286
+ self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
287
+ logger.info(f"Documentation already exists for {url} (file: {file_id})")
288
+ return True
289
+ except Exception as e:
290
+ logger.info(f"Documentation does not exist for {url}: {str(e)}")
291
+ return False
292
+
293
+ def save_raw_docs_to_storage(
294
+ self, docs: List[Dict[str, Any]], url: str = None
295
+ ) -> bool:
296
+ """Save raw documents as JSON file to Appwrite storage bucket"""
297
+ temp_file_path = None
298
+ max_retries = 3
299
+ retry_delay = 2 # seconds
300
+
301
+ for attempt in range(max_retries):
302
+ try:
303
+ logger.info(
304
+ f"Saving {len(docs)} raw documents to Appwrite storage (attempt {attempt + 1}/{max_retries})"
305
+ )
306
+
307
+ # Generate file ID based on URL
308
+ file_id = self.get_docs_file_id(url) if url else "unknown_docs_raw.json"
309
+ logger.info(f"Using file ID: {file_id}")
310
+
311
+ # Create JSON content
312
+ json_content = json.dumps(docs, indent=2, ensure_ascii=False)
313
+
314
+ # Create temporary file with a unique name
315
+ temp_file_path = tempfile.mktemp(suffix=".json")
316
+
317
+ # Write content to temporary file
318
+ with open(temp_file_path, "w", encoding="utf-8") as temp_file:
319
+ temp_file.write(json_content)
320
+
321
+ # Upload file to storage bucket
322
+ input_file = InputFile.from_path(temp_file_path)
323
+
324
+ # Try to delete existing file first, then create new one
325
+ try:
326
+ # Try to delete existing file
327
+ self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
328
+ logger.info(f"Deleted existing file: {file_id}")
329
+ except Exception as e:
330
+ # File doesn't exist or can't be deleted, that's okay
331
+ logger.info(
332
+ f"Could not delete existing file (may not exist): {str(e)}"
333
+ )
334
+
335
+ # Upload to storage with retry logic
336
+ result = self.storage.create_file(
337
+ bucket_id=self.bucket_id,
338
+ file_id=file_id,
339
+ file=input_file,
340
+ )
341
+
342
+ logger.info(
343
+ f"Successfully saved raw documents to storage: {result['$id']}"
344
+ )
345
+ return True
346
+
347
+ except Exception as e:
348
+ logger.error(
349
+ f"Error saving raw documents to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
350
+ )
351
+
352
+ # Clean up temporary file on error
353
+ if temp_file_path and os.path.exists(temp_file_path):
354
+ try:
355
+ os.unlink(temp_file_path)
356
+ temp_file_path = None
357
+ except (OSError, PermissionError) as cleanup_error:
358
+ logger.warning(
359
+ f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
360
+ )
361
+
362
+ # If this is the last attempt, return False
363
+ if attempt == max_retries - 1:
364
+ logger.error(
365
+ f"Failed to save raw documents after {max_retries} attempts"
366
+ )
367
+ return False
368
+
369
+ # Wait before retrying
370
+ logger.info(f"Retrying in {retry_delay} seconds...")
371
+ time.sleep(retry_delay)
372
+ retry_delay *= 2 # Exponential backoff
373
+
374
+ return False
375
+
376
    def get_raw_docs_from_storage(self, url: str = None) -> List[Dict[str, Any]]:
        """Retrieve raw documents from Appwrite storage bucket.

        Retries up to three times with exponential backoff and returns []
        when every attempt fails. Defaults to the React docs file when no
        URL is given.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving raw documents from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL
                file_id = self.get_docs_file_id(url) if url else "react_docs_raw.json"
                logger.info(f"Looking for file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # The SDK's return type varies, so probe the common shapes
                # before giving up.
                docs = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    docs = result
                    logger.info("Result is already a list of documents")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict
                elif isinstance(result, dict):
                    docs = [result]
                    logger.info("Result is a single document dict")

                # Case 5: Try to convert to string and parse
                else:
                    try:
                        json_str = str(result)
                        docs = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded file content: {str(e)}"
                        )

                if docs is None:
                    raise ValueError("Could not parse the downloaded file content")

                logger.info(f"Retrieved {len(docs)} raw documents from storage")
                return docs

            except Exception as e:
                logger.error(
                    f"Error retrieving raw documents from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve raw documents after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
462
+
463
+ def get_chunks_file_id(self, url: str) -> str:
464
+ """Generate chunks file ID based on the documentation URL"""
465
+ url_lower = url.lower()
466
+
467
+ # Map URLs to chunks file IDs
468
+ if "react.dev" in url_lower or "reactjs.org" in url_lower:
469
+ return "react_docs_chunks.json"
470
+ elif "docs.python.org" in url_lower or "python.org" in url_lower:
471
+ return "python_docs_chunks.json"
472
+ elif "golang.org" in url_lower or "go.dev" in url_lower:
473
+ return "golang_docs_chunks.json"
474
+ elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
475
+ return "mdn_docs_chunks.json"
476
+ elif "vuejs.org" in url_lower:
477
+ return "vue_docs_chunks.json"
478
+ elif "nodejs.org" in url_lower:
479
+ return "nodejs_docs_chunks.json"
480
+ elif "angular.io" in url_lower:
481
+ return "angular_docs_chunks.json"
482
+ elif "svelte.dev" in url_lower:
483
+ return "svelte_docs_chunks.json"
484
+ elif "nextjs.org" in url_lower:
485
+ return "nextjs_docs_chunks.json"
486
+ elif "nuxt.com" in url_lower:
487
+ return "nuxt_docs_chunks.json"
488
+ elif "djangoproject.com" in url_lower or "django" in url_lower:
489
+ return "django_docs_chunks.json"
490
+ elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
491
+ return "fastapi_docs_chunks.json"
492
+ elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
493
+ return "docker_docs_chunks.json"
494
+ elif "kubernetes.io" in url_lower:
495
+ return "kubernetes_docs_chunks.json"
496
+ elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
497
+ return "mongodb_docs_chunks.json"
498
+ elif "postgresql.org" in url_lower or "postgresql" in url_lower:
499
+ return "postgresql_docs_chunks.json"
500
+ else:
501
+ # For unknown URLs, create a generic ID based on domain
502
+ from urllib.parse import urlparse
503
+
504
+ parsed = urlparse(url)
505
+ domain = parsed.netloc.replace(".", "_").replace("www_", "")
506
+ return f"{domain}_docs_chunks.json"
507
+
508
+ def chunks_already_exist(self, url: str) -> bool:
509
+ """Check if chunks for this URL already exist in storage"""
510
+ try:
511
+ file_id = self.get_chunks_file_id(url)
512
+ # Try to get the file from storage
513
+ self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
514
+ logger.info(f"Chunks already exist for {url} (file: {file_id})")
515
+ return True
516
+ except Exception as e:
517
+ logger.info(f"Chunks do not exist for {url}: {str(e)}")
518
+ return False
519
+
520
+ def save_chunks_to_storage(
521
+ self, chunks: List[Dict[str, Any]], url: str = None
522
+ ) -> bool:
523
+ """Save document chunks as JSON file to Appwrite storage bucket (FAST)"""
524
+ temp_file_path = None
525
+ max_retries = 3
526
+ retry_delay = 2 # seconds
527
+
528
+ for attempt in range(max_retries):
529
+ try:
530
+ logger.info(
531
+ f"Saving {len(chunks)} chunks to Appwrite storage (attempt {attempt + 1}/{max_retries})"
532
+ )
533
+
534
+ # Generate file ID based on URL
535
+ file_id = (
536
+ self.get_chunks_file_id(url) if url else "unknown_docs_chunks.json"
537
+ )
538
+ logger.info(f"Using chunks file ID: {file_id}")
539
+
540
+ # Create JSON content
541
+ json_content = json.dumps(chunks, indent=2, ensure_ascii=False)
542
+
543
+ # Create temporary file with a unique name
544
+ temp_file_path = tempfile.mktemp(suffix=".json")
545
+
546
+ # Write content to temporary file
547
+ with open(temp_file_path, "w", encoding="utf-8") as temp_file:
548
+ temp_file.write(json_content)
549
+
550
+ # Upload file to storage bucket
551
+ input_file = InputFile.from_path(temp_file_path)
552
+
553
+ # Try to delete existing file first, then create new one
554
+ try:
555
+ # Try to delete existing file
556
+ self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
557
+ logger.info(f"Deleted existing chunks file: {file_id}")
558
+ except Exception as e:
559
+ # File doesn't exist or can't be deleted, that's okay
560
+ logger.info(
561
+ f"Could not delete existing chunks file (may not exist): {str(e)}"
562
+ )
563
+
564
+ # Upload to storage with retry logic
565
+ result = self.storage.create_file(
566
+ bucket_id=self.bucket_id,
567
+ file_id=file_id,
568
+ file=input_file,
569
+ )
570
+
571
+ logger.info(f"Successfully saved chunks to storage: {result['$id']}")
572
+ return True
573
+
574
+ except Exception as e:
575
+ logger.error(
576
+ f"Error saving chunks to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
577
+ )
578
+
579
+ # Clean up temporary file on error
580
+ if temp_file_path and os.path.exists(temp_file_path):
581
+ try:
582
+ os.unlink(temp_file_path)
583
+ temp_file_path = None
584
+ except (OSError, PermissionError) as cleanup_error:
585
+ logger.warning(
586
+ f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
587
+ )
588
+
589
+ # If this is the last attempt, return False
590
+ if attempt == max_retries - 1:
591
+ logger.error(f"Failed to save chunks after {max_retries} attempts")
592
+ return False
593
+
594
+ # Wait before retrying
595
+ logger.info(f"Retrying in {retry_delay} seconds...")
596
+ time.sleep(retry_delay)
597
+ retry_delay *= 2 # Exponential backoff
598
+
599
+ return False
600
+
601
    def get_chunks_from_storage(self, url: str = None) -> List[Dict[str, Any]]:
        """Retrieve document chunks from Appwrite storage bucket (FAST).

        Retries up to three times with exponential backoff and returns []
        when every attempt fails. Defaults to the React chunks file when no
        URL is given.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving chunks from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL
                file_id = (
                    self.get_chunks_file_id(url) if url else "react_docs_chunks.json"
                )
                logger.info(f"Looking for chunks file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # The SDK's return type varies, so probe the common shapes
                # before giving up.
                chunks = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    chunks = result
                    logger.info("Result is already a list of chunks")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict
                elif isinstance(result, dict):
                    chunks = [result]
                    logger.info("Result is a single chunk dict")

                # Case 5: Try to convert to string and parse
                else:
                    try:
                        json_str = str(result)
                        chunks = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded chunks file content: {str(e)}"
                        )

                if chunks is None:
                    raise ValueError(
                        "Could not parse the downloaded chunks file content"
                    )

                logger.info(f"Retrieved {len(chunks)} chunks from storage")
                return chunks

            except Exception as e:
                logger.error(
                    f"Error retrieving chunks from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve chunks after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
691
+
692
+ def save_chunks(self, chunks: List[Dict[str, Any]], url: str = None) -> bool:
693
+ """Save document chunks - optimized version using storage bucket"""
694
+ try:
695
+ logger.info(f"Saving {len(chunks)} chunks using optimized method")
696
+
697
+ # Use the fast storage method instead of database
698
+ return self.save_chunks_to_storage(chunks, url)
699
+
700
+ except Exception as e:
701
+ logger.error(f"Error saving chunks: {str(e)}")
702
+ return False
703
+
704
+ def get_all_chunks(self, url: str = None) -> List[Dict[str, Any]]:
705
+ """Retrieve all document chunks - optimized version using storage bucket"""
706
+ try:
707
+ logger.info("Retrieving all chunks using optimized method")
708
+
709
+ # Use the fast storage method instead of database
710
+ return self.get_chunks_from_storage(url)
711
+
712
+ except Exception as e:
713
+ logger.error(f"Error retrieving chunks: {str(e)}")
714
+ return []
715
+
716
+ def search_chunks(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
717
+ """Search for chunks containing specific text"""
718
+ try:
719
+ logger.info(f"Searching for chunks with query: {query}")
720
+
721
+ # Search documents in the collection
722
+ response = self.databases.list_documents(
723
+ database_id=self.database_id,
724
+ collection_id=self.chunks_collection_id,
725
+ queries=[],
726
+ )
727
+
728
+ chunks = []
729
+ for doc in response["documents"]:
730
+ # Simple client-side search for now
731
+ if (
732
+ query.lower() in doc["content"].lower()
733
+ or query.lower() in doc["title"].lower()
734
+ ):
735
+ chunks.append(
736
+ {
737
+ "content": doc["content"],
738
+ "title": doc["title"],
739
+ "url": doc.get("url", ""),
740
+ "chunk_id": doc["chunk_id"],
741
+ }
742
+ )
743
+
744
+ logger.info(f"Found {len(chunks)} matching chunks")
745
+ return chunks[:limit]
746
+
747
+ except Exception as e:
748
+ logger.error(f"Error searching chunks in Appwrite: {str(e)}")
749
+ return []
750
+
751
+ def delete_raw_docs_from_storage(self) -> bool:
752
+ """Delete raw documents file from storage bucket"""
753
+ try:
754
+ logger.info("Deleting raw documents from storage")
755
+
756
+ # Delete file from storage
757
+ self.storage.delete_file(
758
+ bucket_id=self.bucket_id, file_id="react_docs_raw.json"
759
+ )
760
+
761
+ logger.info("Successfully deleted raw documents from storage")
762
+ return True
763
+
764
+ except Exception as e:
765
+ logger.error(f"Error deleting raw documents from storage: {str(e)}")
766
+ return False
767
+
768
+ def delete_all_chunks(self) -> bool:
769
+ """Delete all chunks from the database (use with caution)"""
770
+ try:
771
+ logger.info("Deleting all chunks from Appwrite")
772
+
773
+ # Get all documents
774
+ response = self.databases.list_documents(
775
+ database_id=self.database_id,
776
+ collection_id=self.chunks_collection_id,
777
+ )
778
+
779
+ # Delete each document
780
+ for doc in response["documents"]:
781
+ self.databases.delete_document(
782
+ database_id=self.database_id,
783
+ collection_id=self.chunks_collection_id,
784
+ document_id=doc["$id"],
785
+ )
786
+
787
+ logger.info("Successfully deleted all chunks")
788
+ return True
789
+
790
+ except Exception as e:
791
+ logger.error(f"Error deleting chunks from Appwrite: {str(e)}")
792
+ return False
793
+
794
+ def get_raw_docs_count(self) -> int:
795
+ """Get the total number of raw documents in storage"""
796
+ try:
797
+ # Check if raw docs file exists
798
+ try:
799
+ self.storage.get_file(
800
+ bucket_id=self.bucket_id, file_id="react_docs_raw.json"
801
+ )
802
+ # If file exists, get the count from the content
803
+ docs = self.get_raw_docs_from_storage()
804
+ return len(docs)
805
+ except Exception:
806
+ return 0
807
+ except Exception as e:
808
+ logger.error(f"Error getting raw docs count: {str(e)}")
809
+ return 0
810
+
811
+ def get_chunks_count(self) -> int:
812
+ """Get the total number of chunks in the database"""
813
+ try:
814
+ response = self.databases.list_documents(
815
+ database_id=self.database_id,
816
+ collection_id=self.chunks_collection_id,
817
+ )
818
+ return response["total"]
819
+ except Exception as e:
820
+ logger.error(f"Error getting chunks count: {str(e)}")
821
+ return 0
822
+
823
+ def clear_all_data(self) -> bool:
824
+ """Clear all data from both storage and database"""
825
+ try:
826
+ logger.info("Clearing all data from storage and database")
827
+ success1 = self.delete_raw_docs_from_storage()
828
+ success2 = self.delete_all_chunks()
829
+ return success1 and success2
830
+ except Exception as e:
831
+ logger.error(f"Error clearing all data: {str(e)}")
832
+ return False
833
+
834
+ def list_storage_files(self) -> List[str]:
835
+ """List all files in the storage bucket"""
836
+ try:
837
+ response = self.storage.list_files(bucket_id=self.bucket_id)
838
+ files = [file["$id"] for file in response["files"]]
839
+ logger.info(f"Found {len(files)} files in storage")
840
+ return files
841
+ except Exception as e:
842
+ logger.error(f"Error listing storage files: {str(e)}")
843
+ return []
844
+
845
+ def save_completion_status(self, url: str, chunks_count: int) -> bool:
846
+ """Save completion status for a documentation URL"""
847
+ try:
848
+ import datetime
849
+
850
+ # Check if completion record already exists
851
+ existing_record = self.get_completion_status(url)
852
+
853
+ if existing_record:
854
+ # Update existing record
855
+ self.databases.update_document(
856
+ database_id=self.database_id,
857
+ collection_id=self.completion_collection_id,
858
+ document_id=existing_record["$id"],
859
+ data={
860
+ "url": url,
861
+ "status": "completed",
862
+ "completed_at": datetime.datetime.now().isoformat(),
863
+ "chunks_count": chunks_count,
864
+ },
865
+ )
866
+ logger.info(f"Updated completion status for {url}")
867
+ else:
868
+ # Create new record
869
+ self.databases.create_document(
870
+ database_id=self.database_id,
871
+ collection_id=self.completion_collection_id,
872
+ document_id="unique()",
873
+ data={
874
+ "url": url,
875
+ "status": "completed",
876
+ "completed_at": datetime.datetime.now().isoformat(),
877
+ "chunks_count": chunks_count,
878
+ },
879
+ )
880
+ logger.info(f"Saved completion status for {url}")
881
+
882
+ return True
883
+ except Exception as e:
884
+ logger.error(f"Error saving completion status: {str(e)}")
885
+ return False
886
+
887
+ def get_completion_status(self, url: str) -> Optional[Dict[str, Any]]:
888
+ """Get completion status for a documentation URL"""
889
+ try:
890
+ from appwrite.query import Query
891
+
892
+ response = self.databases.list_documents(
893
+ database_id=self.database_id,
894
+ collection_id=self.completion_collection_id,
895
+ queries=[Query.equal("url", url)],
896
+ )
897
+
898
+ if response["documents"]:
899
+ return response["documents"][0]
900
+ return None
901
+ except Exception as e:
902
+ logger.error(f"Error getting completion status: {str(e)}")
903
+ return None
904
+
905
+ def is_fully_processed(self, url: str) -> bool:
906
+ """Check if documentation is fully processed (has completion status)"""
907
+ try:
908
+ completion_status = self.get_completion_status(url)
909
+ return (
910
+ completion_status is not None
911
+ and completion_status.get("status") == "completed"
912
+ )
913
+ except Exception as e:
914
+ logger.error(f"Error checking if fully processed: {str(e)}")
915
+ return False
916
+
917
+
918
+ # Global instance
919
+ appwrite_service = AppwriteService()