iamfaham commited on
Commit
3603603
·
verified ·
1 Parent(s): c3980ee

Upload appwrite_service.py

Browse files
Files changed (1) hide show
  1. appwrite_service.py +919 -0
appwrite_service.py ADDED
@@ -0,0 +1,919 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from appwrite.client import Client
4
+ from appwrite.services.databases import Databases
5
+ from appwrite.services.storage import Storage
6
+ from appwrite.input_file import InputFile
7
+ import json
8
+ import logging
9
+ from typing import List, Dict, Any, Optional
10
+ import tempfile
11
+ import time
12
+
13
+ # Load environment variables
14
+ load_dotenv()
15
+
16
+ # Configure logging
17
+ logging.basicConfig(level=logging.INFO)
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class AppwriteService:
    """Thin wrapper around Appwrite Databases/Storage for documentation data."""

    def __init__(self):
        """Initialize the Appwrite client, services, and backing resources.

        Raises:
            ValueError: when required environment variables are missing.
        """
        # Fail fast if required credentials are absent.
        self._validate_environment()

        # Configure the client from the environment before handing it to
        # the service facades.
        client = Client()
        client.set_endpoint(
            os.getenv("APPWRITE_ENDPOINT", "https://cloud.appwrite.io/v1")
        )
        client.set_project(os.getenv("APPWRITE_PROJECT_ID"))
        client.set_key(os.getenv("APPWRITE_API_KEY"))
        self.client = client

        self.databases = Databases(self.client)
        self.storage = Storage(self.client)

        # Identifiers for the database, its collections, and the bucket.
        self.database_id = os.getenv("APPWRITE_DATABASE_ID", "react_docs_db")
        self.chunks_collection_id = os.getenv(
            "APPWRITE_COLLECTION_ID", "document_chunks"
        )
        self.completion_collection_id = "completion_status"
        self.bucket_id = os.getenv("APPWRITE_BUCKET_ID", "react_docs_bucket")

        # Create/verify the server-side resources up front.
        self._initialize_database()
        self._initialize_storage()
51
+
52
+ def _validate_environment(self):
53
+ """Validate that required environment variables are set"""
54
+ required_vars = ["APPWRITE_PROJECT_ID", "APPWRITE_API_KEY"]
55
+
56
+ missing_vars = []
57
+ for var in required_vars:
58
+ if not os.getenv(var):
59
+ missing_vars.append(var)
60
+
61
+ if missing_vars:
62
+ error_msg = (
63
+ f"Missing required environment variables: {', '.join(missing_vars)}"
64
+ )
65
+ logger.error(error_msg)
66
+ logger.error("Please set these variables in your .env file:")
67
+ for var in missing_vars:
68
+ logger.error(f" {var}=your_value_here")
69
+ raise ValueError(error_msg)
70
+
71
    def _initialize_database(self):
        """Ensure the database and both collections exist, creating them if needed.

        Any failure during setup is logged and re-raised to abort construction.
        """
        try:
            # Check if database exists
            try:
                self.databases.get(database_id=self.database_id)
                logger.info(f"Database {self.database_id} already exists")
            except Exception:
                # get() failed — assume the database is missing and create it.
                self.databases.create(
                    database_id=self.database_id, name="React Documentation Database"
                )
                logger.info(f"Created database {self.database_id}")

            # Initialize chunks collection
            self._initialize_chunks_collection()

            # Initialize completion status collection
            self._initialize_completion_collection()

        except Exception as e:
            logger.error(f"Error initializing database: {str(e)}")
            raise
94
+
95
    def _initialize_storage(self):
        """Check if storage bucket exists (don't create if it doesn't).

        Buckets are provisioned out-of-band; a missing bucket is treated as a
        configuration error and re-raised.
        """
        try:
            # Check if bucket exists
            try:
                self.storage.get_bucket(bucket_id=self.bucket_id)
                logger.info(f"Storage bucket {self.bucket_id} exists and is accessible")
            except Exception as e:
                logger.error(
                    f"Storage bucket {self.bucket_id} not found or not accessible: {str(e)}"
                )
                logger.error(
                    "Please make sure the bucket exists and your API key has access to it"
                )
                raise

        except Exception as e:
            logger.error(f"Error checking storage bucket: {str(e)}")
            raise
114
+
115
+ def _initialize_chunks_collection(self):
116
+ """Initialize chunks collection"""
117
+ try:
118
+ # Check if chunks collection exists
119
+ try:
120
+ self.databases.get_collection(
121
+ database_id=self.database_id,
122
+ collection_id=self.chunks_collection_id,
123
+ )
124
+ logger.info(
125
+ f"Chunks collection {self.chunks_collection_id} already exists"
126
+ )
127
+ except Exception:
128
+ # Create chunks collection
129
+ self.databases.create_collection(
130
+ database_id=self.database_id,
131
+ collection_id=self.chunks_collection_id,
132
+ name="Document Chunks",
133
+ )
134
+
135
+ # Create attributes for the chunks collection
136
+ self.databases.create_string_attribute(
137
+ database_id=self.database_id,
138
+ collection_id=self.chunks_collection_id,
139
+ key="content",
140
+ size=65536, # 64KB for content
141
+ required=True,
142
+ )
143
+
144
+ self.databases.create_string_attribute(
145
+ database_id=self.database_id,
146
+ collection_id=self.chunks_collection_id,
147
+ key="title",
148
+ size=255,
149
+ required=True,
150
+ )
151
+
152
+ self.databases.create_string_attribute(
153
+ database_id=self.database_id,
154
+ collection_id=self.chunks_collection_id,
155
+ key="url",
156
+ size=500,
157
+ required=False,
158
+ )
159
+
160
+ self.databases.create_string_attribute(
161
+ database_id=self.database_id,
162
+ collection_id=self.chunks_collection_id,
163
+ key="chunk_id",
164
+ size=100,
165
+ required=True,
166
+ )
167
+
168
+ logger.info(
169
+ f"Created chunks collection {self.chunks_collection_id} with attributes"
170
+ )
171
+
172
+ except Exception as e:
173
+ logger.error(f"Error initializing chunks collection: {str(e)}")
174
+ raise
175
+
176
+ def _initialize_completion_collection(self):
177
+ """Initialize completion status collection"""
178
+ try:
179
+ # Check if completion collection exists
180
+ try:
181
+ self.databases.get_collection(
182
+ database_id=self.database_id,
183
+ collection_id=self.completion_collection_id,
184
+ )
185
+ logger.info(
186
+ f"Completion collection {self.completion_collection_id} already exists"
187
+ )
188
+ except Exception:
189
+ # Create completion collection
190
+ self.databases.create_collection(
191
+ database_id=self.database_id,
192
+ collection_id=self.completion_collection_id,
193
+ name="Completion Status",
194
+ )
195
+
196
+ # Create attributes for the completion collection
197
+ self.databases.create_string_attribute(
198
+ database_id=self.database_id,
199
+ collection_id=self.completion_collection_id,
200
+ key="url",
201
+ size=500,
202
+ required=True,
203
+ )
204
+
205
+ self.databases.create_string_attribute(
206
+ database_id=self.database_id,
207
+ collection_id=self.completion_collection_id,
208
+ key="status",
209
+ size=50,
210
+ required=True,
211
+ )
212
+
213
+ self.databases.create_string_attribute(
214
+ database_id=self.database_id,
215
+ collection_id=self.completion_collection_id,
216
+ key="completed_at",
217
+ size=100,
218
+ required=True,
219
+ )
220
+
221
+ self.databases.create_integer_attribute(
222
+ database_id=self.database_id,
223
+ collection_id=self.completion_collection_id,
224
+ key="chunks_count",
225
+ required=True,
226
+ )
227
+
228
+ logger.info(
229
+ f"Created completion collection {self.completion_collection_id} with attributes"
230
+ )
231
+
232
+ except Exception as e:
233
+ logger.error(f"Error initializing completion collection: {str(e)}")
234
+ raise
235
+
236
+ def get_docs_file_id(self, url: str) -> str:
237
+ """Generate file ID based on the documentation URL"""
238
+ url_lower = url.lower()
239
+
240
+ # Map URLs to file IDs
241
+ if "react.dev" in url_lower or "reactjs.org" in url_lower:
242
+ return "react_docs_raw.json"
243
+ elif "docs.python.org" in url_lower or "python.org" in url_lower:
244
+ return "python_docs_raw.json"
245
+ elif "golang.org" in url_lower or "go.dev" in url_lower:
246
+ return "golang_docs_raw.json"
247
+ elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
248
+ return "mdn_docs_raw.json"
249
+ elif "vuejs.org" in url_lower:
250
+ return "vue_docs_raw.json"
251
+ elif "nodejs.org" in url_lower:
252
+ return "nodejs_docs_raw.json"
253
+ elif "angular.io" in url_lower:
254
+ return "angular_docs_raw.json"
255
+ elif "svelte.dev" in url_lower:
256
+ return "svelte_docs_raw.json"
257
+ elif "nextjs.org" in url_lower:
258
+ return "nextjs_docs_raw.json"
259
+ elif "nuxt.com" in url_lower:
260
+ return "nuxt_docs_raw.json"
261
+ elif "djangoproject.com" in url_lower or "django" in url_lower:
262
+ return "django_docs_raw.json"
263
+ elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
264
+ return "fastapi_docs_raw.json"
265
+ elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
266
+ return "docker_docs_raw.json"
267
+ elif "kubernetes.io" in url_lower:
268
+ return "kubernetes_docs_raw.json"
269
+ elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
270
+ return "mongodb_docs_raw.json"
271
+ elif "postgresql.org" in url_lower or "postgresql" in url_lower:
272
+ return "postgresql_docs_raw.json"
273
+ else:
274
+ # For unknown URLs, create a generic ID based on domain
275
+ from urllib.parse import urlparse
276
+
277
+ parsed = urlparse(url)
278
+ domain = parsed.netloc.replace(".", "_").replace("www_", "")
279
+ return f"{domain}_docs_raw.json"
280
+
281
+ def docs_already_exist(self, url: str) -> bool:
282
+ """Check if documentation for this URL already exists in storage"""
283
+ try:
284
+ file_id = self.get_docs_file_id(url)
285
+ # Try to get the file from storage
286
+ self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
287
+ logger.info(f"Documentation already exists for {url} (file: {file_id})")
288
+ return True
289
+ except Exception as e:
290
+ logger.info(f"Documentation does not exist for {url}: {str(e)}")
291
+ return False
292
+
293
+ def save_raw_docs_to_storage(
294
+ self, docs: List[Dict[str, Any]], url: str = None
295
+ ) -> bool:
296
+ """Save raw documents as JSON file to Appwrite storage bucket"""
297
+ temp_file_path = None
298
+ max_retries = 3
299
+ retry_delay = 2 # seconds
300
+
301
+ for attempt in range(max_retries):
302
+ try:
303
+ logger.info(
304
+ f"Saving {len(docs)} raw documents to Appwrite storage (attempt {attempt + 1}/{max_retries})"
305
+ )
306
+
307
+ # Generate file ID based on URL
308
+ file_id = self.get_docs_file_id(url) if url else "unknown_docs_raw.json"
309
+ logger.info(f"Using file ID: {file_id}")
310
+
311
+ # Create JSON content
312
+ json_content = json.dumps(docs, indent=2, ensure_ascii=False)
313
+
314
+ # Create temporary file with a unique name
315
+ temp_file_path = tempfile.mktemp(suffix=".json")
316
+
317
+ # Write content to temporary file
318
+ with open(temp_file_path, "w", encoding="utf-8") as temp_file:
319
+ temp_file.write(json_content)
320
+
321
+ # Upload file to storage bucket
322
+ input_file = InputFile.from_path(temp_file_path)
323
+
324
+ # Try to delete existing file first, then create new one
325
+ try:
326
+ # Try to delete existing file
327
+ self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
328
+ logger.info(f"Deleted existing file: {file_id}")
329
+ except Exception as e:
330
+ # File doesn't exist or can't be deleted, that's okay
331
+ logger.info(
332
+ f"Could not delete existing file (may not exist): {str(e)}"
333
+ )
334
+
335
+ # Upload to storage with retry logic
336
+ result = self.storage.create_file(
337
+ bucket_id=self.bucket_id,
338
+ file_id=file_id,
339
+ file=input_file,
340
+ )
341
+
342
+ logger.info(
343
+ f"Successfully saved raw documents to storage: {result['$id']}"
344
+ )
345
+ return True
346
+
347
+ except Exception as e:
348
+ logger.error(
349
+ f"Error saving raw documents to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
350
+ )
351
+
352
+ # Clean up temporary file on error
353
+ if temp_file_path and os.path.exists(temp_file_path):
354
+ try:
355
+ os.unlink(temp_file_path)
356
+ temp_file_path = None
357
+ except (OSError, PermissionError) as cleanup_error:
358
+ logger.warning(
359
+ f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
360
+ )
361
+
362
+ # If this is the last attempt, return False
363
+ if attempt == max_retries - 1:
364
+ logger.error(
365
+ f"Failed to save raw documents after {max_retries} attempts"
366
+ )
367
+ return False
368
+
369
+ # Wait before retrying
370
+ logger.info(f"Retrying in {retry_delay} seconds...")
371
+ time.sleep(retry_delay)
372
+ retry_delay *= 2 # Exponential backoff
373
+
374
+ return False
375
+
376
    def get_raw_docs_from_storage(self, url: str = None) -> List[Dict[str, Any]]:
        """Retrieve raw documents from Appwrite storage bucket.

        Retries up to three times with exponential backoff and returns []
        when every attempt fails. Defaults to the React docs file when no
        URL is given.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving raw documents from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL
                file_id = self.get_docs_file_id(url) if url else "react_docs_raw.json"
                logger.info(f"Looking for file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # The SDK's return type varies, so probe the common shapes
                # before giving up.
                docs = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    docs = result
                    logger.info("Result is already a list of documents")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    docs = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict
                elif isinstance(result, dict):
                    docs = [result]
                    logger.info("Result is a single document dict")

                # Case 5: Try to convert to string and parse
                else:
                    try:
                        json_str = str(result)
                        docs = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded file content: {str(e)}"
                        )

                if docs is None:
                    raise ValueError("Could not parse the downloaded file content")

                logger.info(f"Retrieved {len(docs)} raw documents from storage")
                return docs

            except Exception as e:
                logger.error(
                    f"Error retrieving raw documents from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve raw documents after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
462
+
463
+ def get_chunks_file_id(self, url: str) -> str:
464
+ """Generate chunks file ID based on the documentation URL"""
465
+ url_lower = url.lower()
466
+
467
+ # Map URLs to chunks file IDs
468
+ if "react.dev" in url_lower or "reactjs.org" in url_lower:
469
+ return "react_docs_chunks.json"
470
+ elif "docs.python.org" in url_lower or "python.org" in url_lower:
471
+ return "python_docs_chunks.json"
472
+ elif "golang.org" in url_lower or "go.dev" in url_lower:
473
+ return "golang_docs_chunks.json"
474
+ elif "developer.mozilla.org" in url_lower or "mdn" in url_lower:
475
+ return "mdn_docs_chunks.json"
476
+ elif "vuejs.org" in url_lower:
477
+ return "vue_docs_chunks.json"
478
+ elif "nodejs.org" in url_lower:
479
+ return "nodejs_docs_chunks.json"
480
+ elif "angular.io" in url_lower:
481
+ return "angular_docs_chunks.json"
482
+ elif "svelte.dev" in url_lower:
483
+ return "svelte_docs_chunks.json"
484
+ elif "nextjs.org" in url_lower:
485
+ return "nextjs_docs_chunks.json"
486
+ elif "nuxt.com" in url_lower:
487
+ return "nuxt_docs_chunks.json"
488
+ elif "djangoproject.com" in url_lower or "django" in url_lower:
489
+ return "django_docs_chunks.json"
490
+ elif "fastapi.tiangolo.com" in url_lower or "fastapi" in url_lower:
491
+ return "fastapi_docs_chunks.json"
492
+ elif "docs.docker.com" in url_lower or "docker.com" in url_lower:
493
+ return "docker_docs_chunks.json"
494
+ elif "kubernetes.io" in url_lower:
495
+ return "kubernetes_docs_chunks.json"
496
+ elif "docs.mongodb.com" in url_lower or "mongodb.com" in url_lower:
497
+ return "mongodb_docs_chunks.json"
498
+ elif "postgresql.org" in url_lower or "postgresql" in url_lower:
499
+ return "postgresql_docs_chunks.json"
500
+ else:
501
+ # For unknown URLs, create a generic ID based on domain
502
+ from urllib.parse import urlparse
503
+
504
+ parsed = urlparse(url)
505
+ domain = parsed.netloc.replace(".", "_").replace("www_", "")
506
+ return f"{domain}_docs_chunks.json"
507
+
508
+ def chunks_already_exist(self, url: str) -> bool:
509
+ """Check if chunks for this URL already exist in storage"""
510
+ try:
511
+ file_id = self.get_chunks_file_id(url)
512
+ # Try to get the file from storage
513
+ self.storage.get_file(bucket_id=self.bucket_id, file_id=file_id)
514
+ logger.info(f"Chunks already exist for {url} (file: {file_id})")
515
+ return True
516
+ except Exception as e:
517
+ logger.info(f"Chunks do not exist for {url}: {str(e)}")
518
+ return False
519
+
520
+ def save_chunks_to_storage(
521
+ self, chunks: List[Dict[str, Any]], url: str = None
522
+ ) -> bool:
523
+ """Save document chunks as JSON file to Appwrite storage bucket (FAST)"""
524
+ temp_file_path = None
525
+ max_retries = 3
526
+ retry_delay = 2 # seconds
527
+
528
+ for attempt in range(max_retries):
529
+ try:
530
+ logger.info(
531
+ f"Saving {len(chunks)} chunks to Appwrite storage (attempt {attempt + 1}/{max_retries})"
532
+ )
533
+
534
+ # Generate file ID based on URL
535
+ file_id = (
536
+ self.get_chunks_file_id(url) if url else "unknown_docs_chunks.json"
537
+ )
538
+ logger.info(f"Using chunks file ID: {file_id}")
539
+
540
+ # Create JSON content
541
+ json_content = json.dumps(chunks, indent=2, ensure_ascii=False)
542
+
543
+ # Create temporary file with a unique name
544
+ temp_file_path = tempfile.mktemp(suffix=".json")
545
+
546
+ # Write content to temporary file
547
+ with open(temp_file_path, "w", encoding="utf-8") as temp_file:
548
+ temp_file.write(json_content)
549
+
550
+ # Upload file to storage bucket
551
+ input_file = InputFile.from_path(temp_file_path)
552
+
553
+ # Try to delete existing file first, then create new one
554
+ try:
555
+ # Try to delete existing file
556
+ self.storage.delete_file(bucket_id=self.bucket_id, file_id=file_id)
557
+ logger.info(f"Deleted existing chunks file: {file_id}")
558
+ except Exception as e:
559
+ # File doesn't exist or can't be deleted, that's okay
560
+ logger.info(
561
+ f"Could not delete existing chunks file (may not exist): {str(e)}"
562
+ )
563
+
564
+ # Upload to storage with retry logic
565
+ result = self.storage.create_file(
566
+ bucket_id=self.bucket_id,
567
+ file_id=file_id,
568
+ file=input_file,
569
+ )
570
+
571
+ logger.info(f"Successfully saved chunks to storage: {result['$id']}")
572
+ return True
573
+
574
+ except Exception as e:
575
+ logger.error(
576
+ f"Error saving chunks to storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
577
+ )
578
+
579
+ # Clean up temporary file on error
580
+ if temp_file_path and os.path.exists(temp_file_path):
581
+ try:
582
+ os.unlink(temp_file_path)
583
+ temp_file_path = None
584
+ except (OSError, PermissionError) as cleanup_error:
585
+ logger.warning(
586
+ f"Could not delete temporary file {temp_file_path}: {str(cleanup_error)}"
587
+ )
588
+
589
+ # If this is the last attempt, return False
590
+ if attempt == max_retries - 1:
591
+ logger.error(f"Failed to save chunks after {max_retries} attempts")
592
+ return False
593
+
594
+ # Wait before retrying
595
+ logger.info(f"Retrying in {retry_delay} seconds...")
596
+ time.sleep(retry_delay)
597
+ retry_delay *= 2 # Exponential backoff
598
+
599
+ return False
600
+
601
    def get_chunks_from_storage(self, url: str = None) -> List[Dict[str, Any]]:
        """Retrieve document chunks from Appwrite storage bucket (FAST).

        Retries up to three times with exponential backoff and returns []
        when every attempt fails. Defaults to the React chunks file when no
        URL is given.
        """
        max_retries = 3
        retry_delay = 2  # seconds

        for attempt in range(max_retries):
            try:
                logger.info(
                    f"Retrieving chunks from Appwrite storage (attempt {attempt + 1}/{max_retries})"
                )

                # Generate file ID based on URL
                file_id = (
                    self.get_chunks_file_id(url) if url else "react_docs_chunks.json"
                )
                logger.info(f"Looking for chunks file: {file_id}")

                # Download file from storage
                result = self.storage.get_file_download(
                    bucket_id=self.bucket_id, file_id=file_id
                )

                logger.info(f"Download result type: {type(result)}")

                # The SDK's return type varies, so probe the common shapes
                # before giving up.
                chunks = None

                # Case 1: Result is already a list of dicts (JSON content)
                if isinstance(result, list) and result and isinstance(result[0], dict):
                    chunks = result
                    logger.info("Result is already a list of chunks")

                # Case 2: Result is bytes
                elif isinstance(result, bytes):
                    json_content = result.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is bytes, decoded successfully")

                # Case 3: Result is a list of bytes
                elif (
                    isinstance(result, list) and result and isinstance(result[0], bytes)
                ):
                    json_bytes = b"".join(result)
                    json_content = json_bytes.decode("utf-8")
                    chunks = json.loads(json_content)
                    logger.info("Result is list of bytes, joined and decoded")

                # Case 4: Result is a single dict
                elif isinstance(result, dict):
                    chunks = [result]
                    logger.info("Result is a single chunk dict")

                # Case 5: Try to convert to string and parse
                else:
                    try:
                        json_str = str(result)
                        chunks = json.loads(json_str)
                        logger.info("Result converted to string and parsed")
                    except Exception as e:
                        logger.error(f"Failed to parse result: {str(e)}")
                        raise ValueError(
                            f"Could not parse downloaded chunks file content: {str(e)}"
                        )

                if chunks is None:
                    raise ValueError(
                        "Could not parse the downloaded chunks file content"
                    )

                logger.info(f"Retrieved {len(chunks)} chunks from storage")
                return chunks

            except Exception as e:
                logger.error(
                    f"Error retrieving chunks from storage (attempt {attempt + 1}/{max_retries}): {str(e)}"
                )

                # If this is the last attempt, return empty list
                if attempt == max_retries - 1:
                    logger.error(
                        f"Failed to retrieve chunks after {max_retries} attempts"
                    )
                    return []

                # Wait before retrying
                logger.info(f"Retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
                retry_delay *= 2  # Exponential backoff

        return []
691
+
692
+ def save_chunks(self, chunks: List[Dict[str, Any]], url: str = None) -> bool:
693
+ """Save document chunks - optimized version using storage bucket"""
694
+ try:
695
+ logger.info(f"Saving {len(chunks)} chunks using optimized method")
696
+
697
+ # Use the fast storage method instead of database
698
+ return self.save_chunks_to_storage(chunks, url)
699
+
700
+ except Exception as e:
701
+ logger.error(f"Error saving chunks: {str(e)}")
702
+ return False
703
+
704
+ def get_all_chunks(self, url: str = None) -> List[Dict[str, Any]]:
705
+ """Retrieve all document chunks - optimized version using storage bucket"""
706
+ try:
707
+ logger.info("Retrieving all chunks using optimized method")
708
+
709
+ # Use the fast storage method instead of database
710
+ return self.get_chunks_from_storage(url)
711
+
712
+ except Exception as e:
713
+ logger.error(f"Error retrieving chunks: {str(e)}")
714
+ return []
715
+
716
+ def search_chunks(self, query: str, limit: int = 10) -> List[Dict[str, Any]]:
717
+ """Search for chunks containing specific text"""
718
+ try:
719
+ logger.info(f"Searching for chunks with query: {query}")
720
+
721
+ # Search documents in the collection
722
+ response = self.databases.list_documents(
723
+ database_id=self.database_id,
724
+ collection_id=self.chunks_collection_id,
725
+ queries=[],
726
+ )
727
+
728
+ chunks = []
729
+ for doc in response["documents"]:
730
+ # Simple client-side search for now
731
+ if (
732
+ query.lower() in doc["content"].lower()
733
+ or query.lower() in doc["title"].lower()
734
+ ):
735
+ chunks.append(
736
+ {
737
+ "content": doc["content"],
738
+ "title": doc["title"],
739
+ "url": doc.get("url", ""),
740
+ "chunk_id": doc["chunk_id"],
741
+ }
742
+ )
743
+
744
+ logger.info(f"Found {len(chunks)} matching chunks")
745
+ return chunks[:limit]
746
+
747
+ except Exception as e:
748
+ logger.error(f"Error searching chunks in Appwrite: {str(e)}")
749
+ return []
750
+
751
+ def delete_raw_docs_from_storage(self) -> bool:
752
+ """Delete raw documents file from storage bucket"""
753
+ try:
754
+ logger.info("Deleting raw documents from storage")
755
+
756
+ # Delete file from storage
757
+ self.storage.delete_file(
758
+ bucket_id=self.bucket_id, file_id="react_docs_raw.json"
759
+ )
760
+
761
+ logger.info("Successfully deleted raw documents from storage")
762
+ return True
763
+
764
+ except Exception as e:
765
+ logger.error(f"Error deleting raw documents from storage: {str(e)}")
766
+ return False
767
+
768
+ def delete_all_chunks(self) -> bool:
769
+ """Delete all chunks from the database (use with caution)"""
770
+ try:
771
+ logger.info("Deleting all chunks from Appwrite")
772
+
773
+ # Get all documents
774
+ response = self.databases.list_documents(
775
+ database_id=self.database_id,
776
+ collection_id=self.chunks_collection_id,
777
+ )
778
+
779
+ # Delete each document
780
+ for doc in response["documents"]:
781
+ self.databases.delete_document(
782
+ database_id=self.database_id,
783
+ collection_id=self.chunks_collection_id,
784
+ document_id=doc["$id"],
785
+ )
786
+
787
+ logger.info("Successfully deleted all chunks")
788
+ return True
789
+
790
+ except Exception as e:
791
+ logger.error(f"Error deleting chunks from Appwrite: {str(e)}")
792
+ return False
793
+
794
+ def get_raw_docs_count(self) -> int:
795
+ """Get the total number of raw documents in storage"""
796
+ try:
797
+ # Check if raw docs file exists
798
+ try:
799
+ self.storage.get_file(
800
+ bucket_id=self.bucket_id, file_id="react_docs_raw.json"
801
+ )
802
+ # If file exists, get the count from the content
803
+ docs = self.get_raw_docs_from_storage()
804
+ return len(docs)
805
+ except Exception:
806
+ return 0
807
+ except Exception as e:
808
+ logger.error(f"Error getting raw docs count: {str(e)}")
809
+ return 0
810
+
811
+ def get_chunks_count(self) -> int:
812
+ """Get the total number of chunks in the database"""
813
+ try:
814
+ response = self.databases.list_documents(
815
+ database_id=self.database_id,
816
+ collection_id=self.chunks_collection_id,
817
+ )
818
+ return response["total"]
819
+ except Exception as e:
820
+ logger.error(f"Error getting chunks count: {str(e)}")
821
+ return 0
822
+
823
+ def clear_all_data(self) -> bool:
824
+ """Clear all data from both storage and database"""
825
+ try:
826
+ logger.info("Clearing all data from storage and database")
827
+ success1 = self.delete_raw_docs_from_storage()
828
+ success2 = self.delete_all_chunks()
829
+ return success1 and success2
830
+ except Exception as e:
831
+ logger.error(f"Error clearing all data: {str(e)}")
832
+ return False
833
+
834
+ def list_storage_files(self) -> List[str]:
835
+ """List all files in the storage bucket"""
836
+ try:
837
+ response = self.storage.list_files(bucket_id=self.bucket_id)
838
+ files = [file["$id"] for file in response["files"]]
839
+ logger.info(f"Found {len(files)} files in storage")
840
+ return files
841
+ except Exception as e:
842
+ logger.error(f"Error listing storage files: {str(e)}")
843
+ return []
844
+
845
+ def save_completion_status(self, url: str, chunks_count: int) -> bool:
846
+ """Save completion status for a documentation URL"""
847
+ try:
848
+ import datetime
849
+
850
+ # Check if completion record already exists
851
+ existing_record = self.get_completion_status(url)
852
+
853
+ if existing_record:
854
+ # Update existing record
855
+ self.databases.update_document(
856
+ database_id=self.database_id,
857
+ collection_id=self.completion_collection_id,
858
+ document_id=existing_record["$id"],
859
+ data={
860
+ "url": url,
861
+ "status": "completed",
862
+ "completed_at": datetime.datetime.now().isoformat(),
863
+ "chunks_count": chunks_count,
864
+ },
865
+ )
866
+ logger.info(f"Updated completion status for {url}")
867
+ else:
868
+ # Create new record
869
+ self.databases.create_document(
870
+ database_id=self.database_id,
871
+ collection_id=self.completion_collection_id,
872
+ document_id="unique()",
873
+ data={
874
+ "url": url,
875
+ "status": "completed",
876
+ "completed_at": datetime.datetime.now().isoformat(),
877
+ "chunks_count": chunks_count,
878
+ },
879
+ )
880
+ logger.info(f"Saved completion status for {url}")
881
+
882
+ return True
883
+ except Exception as e:
884
+ logger.error(f"Error saving completion status: {str(e)}")
885
+ return False
886
+
887
+ def get_completion_status(self, url: str) -> Optional[Dict[str, Any]]:
888
+ """Get completion status for a documentation URL"""
889
+ try:
890
+ from appwrite.query import Query
891
+
892
+ response = self.databases.list_documents(
893
+ database_id=self.database_id,
894
+ collection_id=self.completion_collection_id,
895
+ queries=[Query.equal("url", url)],
896
+ )
897
+
898
+ if response["documents"]:
899
+ return response["documents"][0]
900
+ return None
901
+ except Exception as e:
902
+ logger.error(f"Error getting completion status: {str(e)}")
903
+ return None
904
+
905
+ def is_fully_processed(self, url: str) -> bool:
906
+ """Check if documentation is fully processed (has completion status)"""
907
+ try:
908
+ completion_status = self.get_completion_status(url)
909
+ return (
910
+ completion_status is not None
911
+ and completion_status.get("status") == "completed"
912
+ )
913
+ except Exception as e:
914
+ logger.error(f"Error checking if fully processed: {str(e)}")
915
+ return False
916
+
917
+
918
+ # Global instance
919
+ appwrite_service = AppwriteService()