Spaces:
Sleeping
Sleeping
Upload appwrite_service.py
Browse files- appwrite_service.py +919 -0
appwrite_service.py
ADDED
|
@@ -0,0 +1,919 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from appwrite.client import Client
|
| 4 |
+
from appwrite.services.databases import Databases
|
| 5 |
+
from appwrite.services.storage import Storage
|
| 6 |
+
from appwrite.input_file import InputFile
|
| 7 |
+
import json
|
| 8 |
+
import logging
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
import tempfile
|
| 11 |
+
import time
|
| 12 |
+
|
| 13 |
+
# Load environment variables
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
# Configure logging
|
| 17 |
+
logging.basicConfig(level=logging.INFO)
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class AppwriteService:
|
| 22 |
+
    def __init__(self):
        """Initialize the Appwrite client, service handles, and resource IDs.

        Reads configuration from environment variables (loaded from .env by
        the module-level ``load_dotenv()``), then verifies/creates the
        database collections and checks the storage bucket.

        Raises:
            ValueError: if required environment variables are missing
                (via ``_validate_environment``).
            Exception: propagated from the Appwrite SDK if database or
                bucket initialization fails.
        """
        # Validate required environment variables before touching the SDK,
        # so a misconfigured environment fails fast with a clear message.
        self._validate_environment()

        self.client = Client()

        # Set up client with environment variables; only the endpoint has a
        # usable default (Appwrite Cloud).
        self.client.set_endpoint(
            os.getenv("APPWRITE_ENDPOINT", "https://cloud.appwrite.io/v1")
        )
        self.client.set_project(os.getenv("APPWRITE_PROJECT_ID"))
        self.client.set_key(os.getenv("APPWRITE_API_KEY"))

        # Initialize service handles that share the configured client.
        self.databases = Databases(self.client)
        self.storage = Storage(self.client)

        # Database / collection / bucket IDs, overridable via environment.
        self.database_id = os.getenv("APPWRITE_DATABASE_ID", "react_docs_db")
        self.chunks_collection_id = os.getenv(
            "APPWRITE_COLLECTION_ID", "document_chunks"
        )
        # NOTE: the completion-status collection ID is intentionally fixed,
        # not configurable.
        self.completion_collection_id = "completion_status"
        self.bucket_id = os.getenv("APPWRITE_BUCKET_ID", "react_docs_bucket")

        # Create database/collections if missing; the bucket is only checked
        # (see _initialize_storage), never created.
        self._initialize_database()
        self._initialize_storage()
def _validate_environment(self):
|
| 53 |
+
"""Validate that required environment variables are set"""
|
| 54 |
+
required_vars = ["APPWRITE_PROJECT_ID", "APPWRITE_API_KEY"]
|
| 55 |
+
|
| 56 |
+
missing_vars = []
|
| 57 |
+
for var in required_vars:
|
| 58 |
+
if not os.getenv(var):
|
| 59 |
+
missing_vars.append(var)
|
| 60 |
+
|
| 61 |
+
if missing_vars:
|
| 62 |
+
error_msg = (
|
| 63 |
+
f"Missing required environment variables: {', '.join(missing_vars)}"
|
| 64 |
+
)
|
| 65 |
+
logger.error(error_msg)
|
| 66 |
+
logger.error("Please set these variables in your .env file:")
|
| 67 |
+
for var in missing_vars:
|
| 68 |
+
logger.error(f" {var}=your_value_here")
|
| 69 |
+
raise ValueError(error_msg)
|
| 70 |
+
|
| 71 |
+
    def _initialize_database(self):
        """Initialize database and chunks collection if they don't exist.

        Uses EAFP against the Appwrite API: a failed ``get`` is treated as
        "does not exist" and triggers creation. Any other failure is logged
        and re-raised.
        """
        try:
            # Check if database exists (any SDK error here is assumed to
            # mean "missing" and falls through to creation).
            try:
                self.databases.get(database_id=self.database_id)
                logger.info(f"Database {self.database_id} already exists")
            except Exception:
                # Create database
                self.databases.create(
                    database_id=self.database_id, name="React Documentation Database"
                )
                logger.info(f"Created database {self.database_id}")

            # Initialize chunks collection
            self._initialize_chunks_collection()

            # Initialize completion status collection
            self._initialize_completion_collection()

        except Exception as e:
            logger.error(f"Error initializing database: {str(e)}")
            raise
def _initialize_storage(self):
|
| 96 |
+
"""Check if storage bucket exists (don't create if it doesn't)"""
|
| 97 |
+
try:
|
| 98 |
+
# Check if bucket exists
|
| 99 |
+
try:
|
| 100 |
+
self.storage.get_bucket(bucket_id=self.bucket_id)
|
| 101 |
+
logger.info(f"Storage bucket {self.bucket_id} exists and is accessible")
|
| 102 |
+
except Exception as e:
|
| 103 |
+
logger.error(
|
| 104 |
+
f"Storage bucket {self.bucket_id} not found or not accessible: {str(e)}"
|
| 105 |
+
)
|
| 106 |
+
logger.error(
|
| 107 |
+
"Please make sure the bucket exists and your API key has access to it"
|
| 108 |
+
)
|
| 109 |
+
raise
|
| 110 |
+
|
| 111 |
+
except Exception as e:
|
| 112 |
+
logger.error(f"Error checking storage bucket: {str(e)}")
|
| 113 |
+
raise
|
| 114 |
+
|
| 115 |
+
    def _initialize_chunks_collection(self):
        """Initialize the document-chunks collection and its attributes.

        Schema (created only when the collection is missing):
          - content  (string, 64KB, required): chunk text
          - title    (string, 255, required): source page title
          - url      (string, 500, optional): source page URL
          - chunk_id (string, 100, required): stable chunk identifier

        NOTE(review): Appwrite creates attributes asynchronously — they may
        not be immediately available for writes after this returns; confirm
        if callers insert documents right away.
        """
        try:
            # Check if chunks collection exists (EAFP: failure -> create).
            try:
                self.databases.get_collection(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                )
                logger.info(
                    f"Chunks collection {self.chunks_collection_id} already exists"
                )
            except Exception:
                # Create chunks collection
                self.databases.create_collection(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    name="Document Chunks",
                )

                # Create attributes for the chunks collection
                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    key="content",
                    size=65536,  # 64KB for content
                    required=True,
                )

                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    key="title",
                    size=255,
                    required=True,
                )

                # url is optional: some chunks may lack a source URL.
                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    key="url",
                    size=500,
                    required=False,
                )

                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    key="chunk_id",
                    size=100,
                    required=True,
                )

                logger.info(
                    f"Created chunks collection {self.chunks_collection_id} with attributes"
                )

        except Exception as e:
            logger.error(f"Error initializing chunks collection: {str(e)}")
            raise
    def _initialize_completion_collection(self):
        """Initialize the crawl completion-status collection and attributes.

        Schema (created only when the collection is missing):
          - url          (string, 500, required): documentation source URL
          - status       (string, 50, required): completion state
          - completed_at (string, 100, required): completion timestamp
          - chunks_count (integer, required): number of chunks produced
        """
        try:
            # Check if completion collection exists (EAFP: failure -> create).
            try:
                self.databases.get_collection(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                )
                logger.info(
                    f"Completion collection {self.completion_collection_id} already exists"
                )
            except Exception:
                # Create completion collection
                self.databases.create_collection(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                    name="Completion Status",
                )

                # Create attributes for the completion collection
                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                    key="url",
                    size=500,
                    required=True,
                )

                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                    key="status",
                    size=50,
                    required=True,
                )

                # completed_at is stored as a string, not a datetime attribute.
                self.databases.create_string_attribute(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                    key="completed_at",
                    size=100,
                    required=True,
                )

                self.databases.create_integer_attribute(
                    database_id=self.database_id,
                    collection_id=self.completion_collection_id,
                    key="chunks_count",
                    required=True,
                )

                logger.info(
                    f"Created completion collection {self.completion_collection_id} with attributes"
                )

        except Exception as e:
            logger.error(f"Error initializing completion collection: {str(e)}")
            raise
def get_docs_file_id(self, url: str) -> str:
|
| 237 |
+
"""Generate file ID based on the documentation URL"""
|
| 238 |
+
url_lower = url.lower()
|
| 239 |
+
|
| 240 |
+
# Map URLs to file IDs
|
| 241 |
+
if "react.dev" in url_lower or "reactjs.org" in url_lower:
|
| 242 |
+
return "react_docs_raw.json"
|
| 243 |
+
elif "docs.python.org" in url_lower or "python.org" in url_lower:
|
| 244 |
+
return "python_docs_raw.json"
|
| 245 |
+
elif "golang.org" in url_lower or "go.dev" in url_lower:
|
| 246 |
+
return "golang_docs_raw.json"
|
| 247 |
+
elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
|
| 248 |
+
return "mdn_docs_raw.json"
|
| 249 |
+
elif "vuejs.org" in url_lower:
|
| 250 |
+
return "vue_docs_raw.json"
|
| 251 |
+
elif "nodejs.org" in url_lower:
|
| 252 |
+
return "nodejs_docs_raw.json"
|
| 253 |
+
elif "angular.io" in url_lower:
|
| 254 |
+
return "angular_docs_raw.json"
|
| 255 |
+
elif "svelte.dev" in url_lower:
|
| 256 |
+
return "svelte_docs_raw.json"
|
| 257 |
+
elif "nextjs.org" in url_lower:
|
| 258 |
+
return "nextjs_docs_raw.json"
|
| 259 |
+
elif "nuxt.com" in url_lower:
|
| 260 |
+
return "nuxt_docs_raw.json"
|
| 261 |
+
elif "djangoproject.com" in url_lower or "django" in url_lower:
|
| 262 |
+
return "django_docs_raw.json"
|
| 263 |
+
elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
|
| 264 |
+
return "fastapi_docs_raw.json"
|
| 265 |
+
elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
|
| 266 |
+
return "docker_docs_raw.json"
|
| 267 |
+
elif "kubernetes.io" in url_lower:
|
| 268 |
+
return "kubernetes_docs_raw.json"
|
| 269 |
+
elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
|
| 270 |
+
return "mongodb_docs_raw.json"
|
| 271 |
+
elif "postgresql.org" in url_lower or "postgresql" in url_lower:
|
| 272 |
+
return "postgresql_docs_raw.json"
|
| 273 |
+
else:
|
| 274 |
+
# For unknown URLs, create a generic ID based on domain
|
| 275 |
+
from urllib.parse import urlparse
|
| 276 |
+
|
| 277 |
+
parsed = urlparse(url)
|
| 278 |
+
domain = parsed.netloc.replace(".", "_").replace("www_", "")
|
| 279 |
+
return f"{domain}_docs_raw.json"
|
| 280 |
+
|
| 281 |
+
def docs_already_exist(self, url: str) -> bool:
|
| 282 |
+
"""Check if documentation for this URL already exists in storage"""
|
| 283 |
+
try:
|
| 284 |
+
file_id = self.get_docs_file_id(url)
|
| 285 |
+
# Try to get the file from storage
|
| 286 |
+
self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
|
| 287 |
+
logger.info(f"Documentation already exists for {url} (file: {file_id})")
|
| 288 |
+
return True
|
| 289 |
+
except Exception as e:
|
| 290 |
+
logger.info(f"Documentation does not exist for {url}: {str(e)}")
|
| 291 |
+
return False
|
| 292 |
+
|
| 293 |
+
def save_raw_docs_to_storage(
|
| 294 |
+
self, docs: List[Dict[str, Any]], url: str = None
|
| 295 |
+
) -> bool:
|
| 296 |
+
"""Save raw documents as JSON file to Appwrite storage bucket"""
|
| 297 |
+
temp_file_path = None
|
| 298 |
+
max_retries = 3
|
| 299 |
+
retry_delay = 2 # seconds
|
| 300 |
+
|
| 301 |
+
for attempt in range(max_retries):
|
| 302 |
+
try:
|
| 303 |
+
logger.info(
|
| 304 |
+
f"Saving {len(docs)} raw documents to Appwrite storage (attempt {attempt + 1}/{max_retries})"
|
| 305 |
+
)
|
| 306 |
+
|
| 307 |
+
# Generate file ID based on URL
|
| 308 |
+
file_id = self.get_docs_file_id(url) if url else "unknown_docs_raw.json"
|
| 309 |
+
logger.info(f"Using file ID: {file_id}")
|
| 310 |
+
|
| 311 |
+
# Create JSON content
|
| 312 |
+
json_content = json.dumps(docs, indent=2, ensure_ascii=False)
|
| 313 |
+
|
| 314 |
+
# Create temporary file with a unique name
|
| 315 |
+
temp_file_path = tempfile.mktemp(suffix=".json")
|
| 316 |
+
|
| 317 |
+
# Write content to temporary file
|
| 318 |
+
with open(temp_file_path, "w", encoding="utf-8") as temp_file:
|
| 319 |
+
temp_file.write(json_content)
|
| 320 |
+
|
| 321 |
+
# Upload file to storage bucket
|
| 322 |
+
input_file = InputFile.from_path(temp_file_path)
|
| 323 |
+
|
| 324 |
+
# Try to delete existing file first, then create new one
|
| 325 |
+
try:
|
| 326 |
+
# Try to delete existing file
|
| 327 |
+
self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
|
| 328 |
+
logger.info(f"Deleted existing file: {file_id}")
|
| 329 |
+
except Exception as e:
|
| 330 |
+
# File doesn't exist or can't be deleted, that's okay
|
| 331 |
+
logger.info(
|
| 332 |
+
f"Could not delete existing file (may not exist): {str(e)}"
|
| 333 |
+
)
|
| 334 |
+
|
| 335 |
+
# Upload to storage with retry logic
|
| 336 |
+
result = self.storage.create_file(
|
| 337 |
+
bucket_id=self.bucket_id,
|
| 338 |
+
file_id=file_id,
|
| 339 |
+
file=input_file,
|
| 340 |
+
)
|
| 341 |
+
|
| 342 |
+
logger.info(
|
| 343 |
+
f"Successfully saved raw documents to storage: {result['$id']}"
|
| 344 |
+
)
|
| 345 |
+
return True
|
| 346 |
+
|
| 347 |
+
except Exception as e:
|
| 348 |
+
logger.error(
|
| 349 |
+
f"Error saving raw documents to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
|
| 350 |
+
)
|
| 351 |
+
|
| 352 |
+
# Clean up temporary file on error
|
| 353 |
+
if temp_file_path and os.path.exists(temp_file_path):
|
| 354 |
+
try:
|
| 355 |
+
os.unlink(temp_file_path)
|
| 356 |
+
temp_file_path = None
|
| 357 |
+
except (OSError, PermissionError) as cleanup_error:
|
| 358 |
+
logger.warning(
|
| 359 |
+
f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
|
| 360 |
+
)
|
| 361 |
+
|
| 362 |
+
# If this is the last attempt, return False
|
| 363 |
+
if attempt == max_retries - 1:
|
| 364 |
+
logger.error(
|
| 365 |
+
f"Failed to save raw documents after {max_retries} attempts"
|
| 366 |
+
)
|
| 367 |
+
return False
|
| 368 |
+
|
| 369 |
+
# Wait before retrying
|
| 370 |
+
logger.info(f"Retrying in {retry_delay} seconds...")
|
| 371 |
+
time.sleep(retry_delay)
|
| 372 |
+
retry_delay *= 2 # Exponential backoff
|
| 373 |
+
|
| 374 |
+
return False
|
| 375 |
+
|
| 376 |
+
    def get_raw_docs_from_storage(self, url: Optional[str] = None) -> List[Dict[str, Any]]:
        """Retrieve raw documents from the Appwrite storage bucket.

        Downloads the JSON file whose ID is derived from *url* (defaulting
        to the React docs file) and normalizes the SDK's return value —
        which may be a parsed list, raw bytes, a list of byte chunks, or a
        single dict — into a list of document dicts. Retries transient
        failures with exponential backoff.

        Returns:
            List of document dicts, or [] after all retries fail.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving raw documents from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL (default: React docs file)
                file_id = self.get_docs_file_id(url) if url else "react_docs_raw.json"
                logger.info(f"Looking for file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # Handle different possible return types; the SDK's return
                # shape has varied across versions, hence the fallbacks.
                docs = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    docs = result
                    logger.info("Result is already a list of documents")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes (streamed chunks)
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict — wrap it in a list
                elif isinstance(result, dict):
                    docs = [result]
                    logger.info("Result is a single document dict")

                # Case 5: Last resort — stringify and parse
                else:
                    try:
                        json_str = str(result)
                        docs = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded file content: {str(e)}"
                        )

                if docs is None:
                    raise ValueError("Could not parse the downloaded file content")

                logger.info(f"Retrieved {len(docs)} raw documents from storage")
                return docs

            except Exception as e:
                logger.error(
                    f"Error retrieving raw documents from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve raw documents after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
def get_chunks_file_id(self, url: str) -> str:
|
| 464 |
+
"""Generate chunks file ID based on the documentation URL"""
|
| 465 |
+
url_lower = url.lower()
|
| 466 |
+
|
| 467 |
+
# Map URLs to chunks file IDs
|
| 468 |
+
if "react.dev" in url_lower or "reactjs.org" in url_lower:
|
| 469 |
+
return "react_docs_chunks.json"
|
| 470 |
+
elif "docs.python.org" in url_lower or "python.org" in url_lower:
|
| 471 |
+
return "python_docs_chunks.json"
|
| 472 |
+
elif "golang.org" in url_lower or "go.dev" in url_lower:
|
| 473 |
+
return "golang_docs_chunks.json"
|
| 474 |
+
elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
|
| 475 |
+
return "mdn_docs_chunks.json"
|
| 476 |
+
elif "vuejs.org" in url_lower:
|
| 477 |
+
return "vue_docs_chunks.json"
|
| 478 |
+
elif "nodejs.org" in url_lower:
|
| 479 |
+
return "nodejs_docs_chunks.json"
|
| 480 |
+
elif "angular.io" in url_lower:
|
| 481 |
+
return "angular_docs_chunks.json"
|
| 482 |
+
elif "svelte.dev" in url_lower:
|
| 483 |
+
return "svelte_docs_chunks.json"
|
| 484 |
+
elif "nextjs.org" in url_lower:
|
| 485 |
+
return "nextjs_docs_chunks.json"
|
| 486 |
+
elif "nuxt.com" in url_lower:
|
| 487 |
+
return "nuxt_docs_chunks.json"
|
| 488 |
+
elif "djangoproject.com" in url_lower or "django" in url_lower:
|
| 489 |
+
return "django_docs_chunks.json"
|
| 490 |
+
elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
|
| 491 |
+
return "fastapi_docs_chunks.json"
|
| 492 |
+
elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
|
| 493 |
+
return "docker_docs_chunks.json"
|
| 494 |
+
elif "kubernetes.io" in url_lower:
|
| 495 |
+
return "kubernetes_docs_chunks.json"
|
| 496 |
+
elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
|
| 497 |
+
return "mongodb_docs_chunks.json"
|
| 498 |
+
elif "postgresql.org" in url_lower or "postgresql" in url_lower:
|
| 499 |
+
return "postgresql_docs_chunks.json"
|
| 500 |
+
else:
|
| 501 |
+
# For unknown URLs, create a generic ID based on domain
|
| 502 |
+
from urllib.parse import urlparse
|
| 503 |
+
|
| 504 |
+
parsed = urlparse(url)
|
| 505 |
+
domain = parsed.netloc.replace(".", "_").replace("www_", "")
|
| 506 |
+
return f"{domain}_docs_chunks.json"
|
| 507 |
+
|
| 508 |
+
def chunks_already_exist(self, url: str) -> bool:
|
| 509 |
+
"""Check if chunks for this URL already exist in storage"""
|
| 510 |
+
try:
|
| 511 |
+
file_id = self.get_chunks_file_id(url)
|
| 512 |
+
# Try to get the file from storage
|
| 513 |
+
self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
|
| 514 |
+
logger.info(f"Chunks already exist for {url} (file: {file_id})")
|
| 515 |
+
return True
|
| 516 |
+
except Exception as e:
|
| 517 |
+
logger.info(f"Chunks do not exist for {url}: {str(e)}")
|
| 518 |
+
return False
|
| 519 |
+
|
| 520 |
+
def save_chunks_to_storage(
|
| 521 |
+
self, chunks: List[Dict[str, Any]], url: str = None
|
| 522 |
+
) -> bool:
|
| 523 |
+
"""Save document chunks as JSON file to Appwrite storage bucket (FAST)"""
|
| 524 |
+
temp_file_path = None
|
| 525 |
+
max_retries = 3
|
| 526 |
+
retry_delay = 2 # seconds
|
| 527 |
+
|
| 528 |
+
for attempt in range(max_retries):
|
| 529 |
+
try:
|
| 530 |
+
logger.info(
|
| 531 |
+
f"Saving {len(chunks)} chunks to Appwrite storage (attempt {attempt + 1}/{max_retries})"
|
| 532 |
+
)
|
| 533 |
+
|
| 534 |
+
# Generate file ID based on URL
|
| 535 |
+
file_id = (
|
| 536 |
+
self.get_chunks_file_id(url) if url else "unknown_docs_chunks.json"
|
| 537 |
+
)
|
| 538 |
+
logger.info(f"Using chunks file ID: {file_id}")
|
| 539 |
+
|
| 540 |
+
# Create JSON content
|
| 541 |
+
json_content = json.dumps(chunks, indent=2, ensure_ascii=False)
|
| 542 |
+
|
| 543 |
+
# Create temporary file with a unique name
|
| 544 |
+
temp_file_path = tempfile.mktemp(suffix=".json")
|
| 545 |
+
|
| 546 |
+
# Write content to temporary file
|
| 547 |
+
with open(temp_file_path, "w", encoding="utf-8") as temp_file:
|
| 548 |
+
temp_file.write(json_content)
|
| 549 |
+
|
| 550 |
+
# Upload file to storage bucket
|
| 551 |
+
input_file = InputFile.from_path(temp_file_path)
|
| 552 |
+
|
| 553 |
+
# Try to delete existing file first, then create new one
|
| 554 |
+
try:
|
| 555 |
+
# Try to delete existing file
|
| 556 |
+
self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
|
| 557 |
+
logger.info(f"Deleted existing chunks file: {file_id}")
|
| 558 |
+
except Exception as e:
|
| 559 |
+
# File doesn't exist or can't be deleted, that's okay
|
| 560 |
+
logger.info(
|
| 561 |
+
f"Could not delete existing chunks file (may not exist): {str(e)}"
|
| 562 |
+
)
|
| 563 |
+
|
| 564 |
+
# Upload to storage with retry logic
|
| 565 |
+
result = self.storage.create_file(
|
| 566 |
+
bucket_id=self.bucket_id,
|
| 567 |
+
file_id=file_id,
|
| 568 |
+
file=input_file,
|
| 569 |
+
)
|
| 570 |
+
|
| 571 |
+
logger.info(f"Successfully saved chunks to storage: {result['$id']}")
|
| 572 |
+
return True
|
| 573 |
+
|
| 574 |
+
except Exception as e:
|
| 575 |
+
logger.error(
|
| 576 |
+
f"Error saving chunks to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
# Clean up temporary file on error
|
| 580 |
+
if temp_file_path and os.path.exists(temp_file_path):
|
| 581 |
+
try:
|
| 582 |
+
os.unlink(temp_file_path)
|
| 583 |
+
temp_file_path = None
|
| 584 |
+
except (OSError, PermissionError) as cleanup_error:
|
| 585 |
+
logger.warning(
|
| 586 |
+
f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
|
| 587 |
+
)
|
| 588 |
+
|
| 589 |
+
# If this is the last attempt, return False
|
| 590 |
+
if attempt == max_retries - 1:
|
| 591 |
+
logger.error(f"Failed to save chunks after {max_retries} attempts")
|
| 592 |
+
return False
|
| 593 |
+
|
| 594 |
+
# Wait before retrying
|
| 595 |
+
logger.info(f"Retrying in {retry_delay} seconds...")
|
| 596 |
+
time.sleep(retry_delay)
|
| 597 |
+
retry_delay *= 2 # Exponential backoff
|
| 598 |
+
|
| 599 |
+
return False
|
| 600 |
+
|
| 601 |
+
    def get_chunks_from_storage(self, url: Optional[str] = None) -> List[Dict[str, Any]]:
        """Retrieve document chunks from the Appwrite storage bucket (FAST).

        Downloads the chunks JSON file whose ID is derived from *url*
        (defaulting to the React chunks file) and normalizes the SDK's
        return value — a parsed list, raw bytes, a list of byte chunks, or
        a single dict — into a list of chunk dicts. Retries transient
        failures with exponential backoff.

        Returns:
            List of chunk dicts, or [] after all retries fail.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving chunks from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL (default: React chunks file)
                file_id = (
                    self.get_chunks_file_id(url) if url else "react_docs_chunks.json"
                )
                logger.info(f"Looking for chunks file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # Handle different possible return types; the SDK's return
                # shape has varied across versions, hence the fallbacks.
                chunks = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    chunks = result
                    logger.info("Result is already a list of chunks")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes (streamed chunks)
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict — wrap it in a list
                elif isinstance(result, dict):
                    chunks = [result]
                    logger.info("Result is a single chunk dict")

                # Case 5: Last resort — stringify and parse
                else:
                    try:
                        json_str = str(result)
                        chunks = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded chunks file content: {str(e)}"
                        )

                if chunks is None:
                    raise ValueError(
                        "Could not parse the downloaded chunks file content"
                    )

                logger.info(f"Retrieved {len(chunks)} chunks from storage")
                return chunks

            except Exception as e:
                logger.error(
                    f"Error retrieving chunks from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve chunks after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
def save_chunks(self, chunks: List[Dict[str, Any]], url: Optional[str] = None) -> bool:
    """Persist document chunks.

    Delegates to the storage-bucket backed implementation, which uploads a
    single JSON file instead of writing one database document per chunk.

    Args:
        chunks: Chunk dicts to persist.
        url: Source documentation URL the chunks came from; used by the
            storage layer to key the file (optional).

    Returns:
        True on success, False on any failure (errors are logged, not raised).
    """
    try:
        logger.info(f"Saving {len(chunks)} chunks using optimized method")

        # Use the fast storage method instead of the database.
        return self.save_chunks_to_storage(chunks, url)

    except Exception as e:
        logger.error(f"Error saving chunks: {str(e)}")
        return False
|
| 703 |
+
|
| 704 |
+
def get_all_chunks(self, url: Optional[str] = None) -> List[Dict[str, Any]]:
    """Retrieve all document chunks.

    Delegates to the storage-bucket backed implementation, which downloads
    one JSON file instead of paging through database documents.

    Args:
        url: Source documentation URL whose chunks should be loaded
            (optional; storage layer decides the default).

    Returns:
        The list of chunk dicts, or [] on any failure (errors are logged).
    """
    try:
        logger.info("Retrieving all chunks using optimized method")

        # Use the fast storage method instead of the database.
        return self.get_chunks_from_storage(url)

    except Exception as e:
        logger.error(f"Error retrieving chunks: {str(e)}")
        return []
|
| 715 |
+
|
| 716 |
+
def search_chunks(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
    """Search for chunks whose content or title contains *query* (case-insensitive).

    Lists documents from the chunks collection and filters client-side;
    Appwrite server-side full-text search is deliberately not used here.

    Args:
        query: Substring to look for.
        limit: Maximum number of matches to return.

    Returns:
        Up to *limit* matching chunk dicts, or [] on error.
    """
    try:
        logger.info(f"Searching for chunks with query: {query}")

        # Fetch documents from the collection (no server-side filter).
        response = self.databases.list_documents(
            database_id=self.database_id,
            collection_id=self.chunks_collection_id,
            queries=[],
        )

        # Hoist the lowercase conversion out of the loop (loop-invariant).
        needle = query.lower()
        chunks = []
        for doc in response["documents"]:
            # Simple client-side substring match for now.
            if needle in doc["content"].lower() or needle in doc["title"].lower():
                chunks.append(
                    {
                        "content": doc["content"],
                        "title": doc["title"],
                        "url": doc.get("url", ""),
                        "chunk_id": doc["chunk_id"],
                    }
                )
                # Stop scanning once we already have enough matches.
                if len(chunks) >= limit:
                    break

        logger.info(f"Found {len(chunks)} matching chunks")
        return chunks

    except Exception as e:
        logger.error(f"Error searching chunks in Appwrite: {str(e)}")
        return []
|
| 750 |
+
|
| 751 |
+
def delete_raw_docs_from_storage(self) -> bool:
    """Remove the raw-documents JSON file from the storage bucket.

    Returns:
        True if the file was deleted, False if anything went wrong
        (the error is logged, never raised to the caller).
    """
    raw_file_id = "react_docs_raw.json"
    try:
        logger.info("Deleting raw documents from storage")
        # One file holds all raw docs, so a single delete clears them.
        self.storage.delete_file(bucket_id=self.bucket_id, file_id=raw_file_id)
        logger.info("Successfully deleted raw documents from storage")
        return True
    except Exception as exc:
        logger.error(f"Error deleting raw documents from storage: {str(exc)}")
        return False
|
| 767 |
+
|
| 768 |
+
def delete_all_chunks(self) -> bool:
    """Delete every chunk document from the chunks collection (use with caution).

    Appwrite's list_documents is paginated (25 documents per page by
    default), so a single listing pass would delete only the first page.
    We therefore keep listing and deleting until the collection reports
    no documents left.

    Returns:
        True once the collection is empty, False on any failure
        (a failed delete raises inside the SDK and is caught here).
    """
    try:
        logger.info("Deleting all chunks from Appwrite")

        while True:
            # Fetch the next page of remaining documents.
            response = self.databases.list_documents(
                database_id=self.database_id,
                collection_id=self.chunks_collection_id,
            )
            documents = response["documents"]
            if not documents:
                break  # collection is empty — done

            # Delete each document in this page.
            for doc in documents:
                self.databases.delete_document(
                    database_id=self.database_id,
                    collection_id=self.chunks_collection_id,
                    document_id=doc["$id"],
                )

        logger.info("Successfully deleted all chunks")
        return True

    except Exception as e:
        logger.error(f"Error deleting chunks from Appwrite: {str(e)}")
        return False
|
| 793 |
+
|
| 794 |
+
def get_raw_docs_count(self) -> int:
    """Return the number of raw documents stored in the bucket.

    Returns:
        The entry count of the raw-docs file, or 0 when the file is
        missing or its contents cannot be loaded.
    """
    try:
        try:
            # Probe the bucket; get_file raises when the file is absent.
            self.storage.get_file(
                bucket_id=self.bucket_id, file_id="react_docs_raw.json"
            )
            # File exists — load its contents and count the entries.
            return len(self.get_raw_docs_from_storage())
        except Exception:
            # Missing or unreadable file counts as zero documents.
            return 0
    except Exception as e:
        logger.error(f"Error getting raw docs count: {str(e)}")
        return 0
|
| 810 |
+
|
| 811 |
+
def get_chunks_count(self) -> int:
    """Return the total number of chunk documents in the database (0 on error)."""
    try:
        # The listing response carries the collection-wide total.
        listing = self.databases.list_documents(
            database_id=self.database_id,
            collection_id=self.chunks_collection_id,
        )
        return listing["total"]
    except Exception as exc:
        logger.error(f"Error getting chunks count: {str(exc)}")
        return 0
|
| 822 |
+
|
| 823 |
+
def clear_all_data(self) -> bool:
    """Clear all stored data: the raw-docs file and every chunk document.

    Both deletions are attempted even if the first one fails, so a
    partial cleanup still happens; the combined result is reported.

    Returns:
        True only when both deletions succeeded, False otherwise.
    """
    try:
        logger.info("Clearing all data from storage and database")
        raw_ok = self.delete_raw_docs_from_storage()
        chunks_ok = self.delete_all_chunks()
        return raw_ok and chunks_ok
    except Exception as exc:
        logger.error(f"Error clearing all data: {str(exc)}")
        return False
|
| 833 |
+
|
| 834 |
+
def list_storage_files(self) -> List[str]:
    """Return the IDs of all files in the storage bucket ([] on error)."""
    try:
        listing = self.storage.list_files(bucket_id=self.bucket_id)
        # Collect just the Appwrite file IDs from the listing.
        file_ids = [entry["$id"] for entry in listing["files"]]
        logger.info(f"Found {len(file_ids)} files in storage")
        return file_ids
    except Exception as exc:
        logger.error(f"Error listing storage files: {str(exc)}")
        return []
|
| 844 |
+
|
| 845 |
+
def save_completion_status(self, url: str, chunks_count: int) -> bool:
    """Record that a documentation URL has been fully processed.

    Creates a completion document for *url*, or updates the existing one,
    stamping the current local time and the number of chunks produced.
    The payload is built once and shared by both paths (the original
    duplicated it verbatim).

    Args:
        url: Documentation URL that finished processing.
        chunks_count: Number of chunks generated for that URL.

    Returns:
        True on success, False on any failure (errors are logged).
    """
    try:
        import datetime

        # Identical payload for both the update and create paths.
        # NOTE(review): timestamp is naive local time — consider UTC,
        # but changing it would alter stored data, so it is kept as-is.
        payload = {
            "url": url,
            "status": "completed",
            "completed_at": datetime.datetime.now().isoformat(),
            "chunks_count": chunks_count,
        }

        # Check whether a completion record already exists for this URL.
        existing_record = self.get_completion_status(url)

        if existing_record:
            # Update the existing record in place.
            self.databases.update_document(
                database_id=self.database_id,
                collection_id=self.completion_collection_id,
                document_id=existing_record["$id"],
                data=payload,
            )
            logger.info(f"Updated completion status for {url}")
        else:
            # First completion for this URL — create a fresh record.
            self.databases.create_document(
                database_id=self.database_id,
                collection_id=self.completion_collection_id,
                document_id="unique()",
                data=payload,
            )
            logger.info(f"Saved completion status for {url}")

        return True
    except Exception as e:
        logger.error(f"Error saving completion status: {str(e)}")
        return False
|
| 886 |
+
|
| 887 |
+
def get_completion_status(self, url: str) -> Optional[Dict[str, Any]]:
    """Fetch the completion record for *url*.

    Returns:
        The first matching completion document, or None when no record
        exists or any error occurs (errors are logged, never raised).
    """
    try:
        from appwrite.query import Query

        matches = self.databases.list_documents(
            database_id=self.database_id,
            collection_id=self.completion_collection_id,
            queries=[Query.equal("url", url)],
        )["documents"]
        # At most one record is expected per URL; take the first match.
        return matches[0] if matches else None
    except Exception as exc:
        logger.error(f"Error getting completion status: {str(exc)}")
        return None
|
| 904 |
+
|
| 905 |
+
def is_fully_processed(self, url: str) -> bool:
    """Return True when *url* has a completion record marked 'completed'."""
    try:
        record = self.get_completion_status(url)
        if record is None:
            return False
        return record.get("status") == "completed"
    except Exception as exc:
        logger.error(f"Error checking if fully processed: {str(exc)}")
        return False
|
| 916 |
+
|
| 917 |
+
|
| 918 |
+
# Global instance
# Module-level singleton: importers are expected to use this shared
# `appwrite_service` object rather than constructing AppwriteService
# themselves, so all callers operate on the same instance.
appwrite_service = AppwriteService()
|