Bima Ardhia commited on
Commit
d2d5a16
·
1 Parent(s): e7d5bbb
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ .env
2
+ __pycache__/
3
+ .venv/
4
+ rag_agent/
DockerFile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Step 1: Use an official Python runtime as a parent image
2
+ # Using 'slim' is a good practice for smaller image sizes
3
+ FROM python:3.11-slim
4
+
5
+ # Step 2: Set the working directory inside the container
6
+ WORKDIR /app
7
+
8
+ # Step 3: Copy the requirements file into the container
9
+ # This is done first to leverage Docker's layer caching.
10
+ # Dependencies won't be re-installed unless requirements.txt changes.
11
+ COPY requirements.txt .
12
+
13
+ # Step 4: Install any needed packages specified in requirements.txt
14
+ RUN pip install --no-cache-dir -r requirements.txt
15
+
16
+ # Step 5: Copy the rest of your application's code into the container
17
+ COPY . .
18
+
19
+ # Step 6: Expose the port the app runs on
20
+ # The ADK web server defaults to port 8080
21
+ EXPOSE 8080
22
+
23
+ # Step 7: Define the command to run your application
24
+ # Use --host=0.0.0.0 to make the server accessible from outside the container
25
+ CMD ["adk", "web", "--host=0.0.0.0"]
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ google-cloud-aiplatform==1.92.0
2
+ google-cloud-storage==2.19.0
3
+ google-genai==1.14.0
4
+ gitpython==3.1.40
5
+ google-adk==0.5.0
user_agent/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vertex AI RAG Agent
3
+
4
+ A package for interacting with Google Cloud Vertex AI RAG capabilities.
5
+ """
6
+
7
+ import os
8
+
9
+ import vertexai
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ # Get Vertex AI configuration from environment
16
+ PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
17
+ LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION")
18
+
19
+ # Initialize Vertex AI at package load time
20
+ try:
21
+ if PROJECT_ID and LOCATION:
22
+ print(f"Initializing Vertex AI with project={PROJECT_ID}, location={LOCATION}")
23
+ vertexai.init(project=PROJECT_ID, location=LOCATION)
24
+ print("Vertex AI initialization successful")
25
+ else:
26
+ print(
27
+ f"Missing Vertex AI configuration. PROJECT_ID={PROJECT_ID}, LOCATION={LOCATION}. "
28
+ f"Tools requiring Vertex AI may not work properly."
29
+ )
30
+ except Exception as e:
31
+ print(f"Failed to initialize Vertex AI: {str(e)}")
32
+ print("Please check your Google Cloud credentials and project settings.")
33
+
34
+ # Import agent after initialization is complete
35
+ from . import agent
user_agent/agent.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from google.adk.agents import Agent
2
+
3
+ # Import the necessary tools for the agent.
4
+ from .tools.get_corpus_info import get_corpus_info
5
+ from .tools.list_corpora import list_corpora
6
+ from .tools.rag_query import rag_query
7
+
8
+ # The agent's instructions have been updated to enforce a specific workflow.
9
+ root_agent = Agent(
10
+ name="RagAgent",
11
+ # Using Gemini 2.5 Flash for best performance with RAG operations
12
+ model="gemini-2.5-flash",
13
+ description="An assistant for finding information, listing available document collections, and viewing their details.",
14
+ tools=[
15
+ rag_query,
16
+ list_corpora,
17
+ get_corpus_info,
18
+ ],
19
+ # The instructions have been significantly updated to enforce a mandatory workflow.
20
+ instruction="""
21
+ # 🧠 Document Query Assistant
22
+
23
+ You are a helpful assistant designed to answer questions based on a collection of documents.
24
+ You must follow a strict workflow to ensure you always have the right context before acting.
25
+
26
+ ## Mandatory Workflow for Every User Request
27
+
28
+ **For every single request from the user, you MUST follow these steps in order:**
29
+
30
+ 1. **Always Run `list_corpora` First**: Before doing anything else, you must call the `list_corpora` tool. This step is mandatory to get the current context of available document collections.
31
+ 2. **Analyze the Context**: Review the output from `list_corpora` and analyze the user's request.
32
+ 3. **Decide the Next Action**: Based on the list of corpora and the user's prompt, choose one of the following paths:
33
+ * **If the user is asking a knowledge-based question**: Determine the most relevant corpus from the list for the user's query. Then, call the `rag_query` tool using that `corpus_name`. If no corpus seems relevant, inform the user.
34
+ * **If the user is asking for details about a specific corpus**: Find the corpus in the list and then call the `get_corpus_info` tool with the correct `corpus_name`.
35
+ * **If the user is asking to see the list of corpora**: Simply present the results from the `list_corpora` call you already made in step 1.
36
+
37
+ ## Your Capabilities (Derived from your workflow)
38
+
39
+ * **Answer Questions**: By first checking available corpora and then querying the most relevant one.
40
+ * **List Document Collections**: By running your mandatory first step and presenting the result.
41
+ * **Get Corpus Details**: By first checking that the corpus exists and then fetching its details.
42
+
43
+ ## Available Tools (To be used only after step 1)
44
+
45
+ 1. `rag_query`: Searches a document collection to answer a specific question.
46
+ - Parameters:
47
+ - `corpus_name`: The name of the document collection to search.
48
+ - `query`: The user's question.
49
+
50
+ 2. `list_corpora`: Lists all available document collections. (Your mandatory first step).
51
+ - This tool takes no parameters.
52
+
53
+ 3. `get_corpus_info`: Gets detailed information and metadata about a specific corpus.
54
+ - Parameters:
55
+ - `corpus_name`: The name of the corpus to get information about.
56
+
57
+ ## Communication Guidelines
58
+
59
+ - Be clear and direct in your responses.
60
+ - When you answer a question, state which document collection you used.
61
+ - If you cannot find a relevant document collection for a query, inform the user clearly. Do not try to answer without a source.
62
+ """,
63
+ )
user_agent/config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration settings for the RAG Agent.
3
+
4
+ These settings are used by the various RAG tools.
5
+ Vertex AI initialization is performed in the package's __init__.py
6
+ """
7
+
8
+ import os
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ # Load environment variables (this is redundant if __init__.py is imported first,
13
+ # but included for safety when importing config directly)
14
+ load_dotenv()
15
+
16
+ # Vertex AI settings
17
+ PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT")
18
+ LOCATION = os.environ.get("GOOGLE_CLOUD_LOCATION")
19
+
20
+ # RAG settings
21
+ DEFAULT_CHUNK_SIZE = 512
22
+ DEFAULT_CHUNK_OVERLAP = 100
23
+ DEFAULT_TOP_K = 3
24
+ DEFAULT_DISTANCE_THRESHOLD = 0.5
25
+ DEFAULT_EMBEDDING_MODEL = "publishers/google/models/text-embedding-005"
26
+ DEFAULT_EMBEDDING_REQUESTS_PER_MIN = 1000
user_agent/tools/__init__.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Tools package for interacting with Vertex AI RAG corpora.
3
+ """
4
+
5
+ from .add_data import add_data
6
+ from .create_corpus import create_corpus
7
+ from .delete_corpus import delete_corpus
8
+ from .delete_document import delete_document
9
+ from .get_corpus_info import get_corpus_info
10
+ from .list_corpora import list_corpora
11
+ from .rag_query import rag_query
12
+ from .utils import (
13
+ check_corpus_exists,
14
+ get_corpus_resource_name,
15
+ set_current_corpus,
16
+ )
17
+
18
+ __all__ = [
19
+ "add_data",
20
+ "create_corpus",
21
+ "list_corpora",
22
+ "rag_query",
23
+ "get_corpus_info",
24
+ "delete_corpus",
25
+ "delete_document",
26
+ "check_corpus_exists",
27
+ "get_corpus_resource_name",
28
+ "set_current_corpus",
29
+ ]
user_agent/tools/add_data.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for adding new data sources to a Vertex AI RAG corpus.
3
+ """
4
+
5
+ import re
6
+ from typing import List
7
+
8
+ from google.adk.tools.tool_context import ToolContext
9
+ from vertexai import rag
10
+
11
+ from ..config import (
12
+ DEFAULT_CHUNK_OVERLAP,
13
+ DEFAULT_CHUNK_SIZE,
14
+ DEFAULT_EMBEDDING_REQUESTS_PER_MIN,
15
+ )
16
+ from .utils import check_corpus_exists, get_corpus_resource_name
17
+
18
+
19
+ def add_data(
20
+ corpus_name: str,
21
+ paths: List[str],
22
+ tool_context: ToolContext,
23
+ ) -> dict:
24
+ """
25
+ Add new data sources to a Vertex AI RAG corpus.
26
+
27
+ Args:
28
+ corpus_name (str): The name of the corpus to add data to. If empty, the current corpus will be used.
29
+ paths (List[str]): List of URLs or GCS paths to add to the corpus.
30
+ Supported formats:
31
+ - Google Drive: "https://drive.google.com/file/d/{FILE_ID}/view"
32
+ - Google Docs/Sheets/Slides: "https://docs.google.com/{type}/d/{FILE_ID}/..."
33
+ - Google Cloud Storage: "gs://{BUCKET}/{PATH}"
34
+ Example: ["https://drive.google.com/file/d/123", "gs://my_bucket/my_files_dir"]
35
+ tool_context (ToolContext): The tool context
36
+
37
+ Returns:
38
+ dict: Information about the added data and status
39
+ """
40
+ # Check if the corpus exists
41
+ if not check_corpus_exists(corpus_name, tool_context):
42
+ return {
43
+ "status": "error",
44
+ "message": f"Corpus '{corpus_name}' does not exist. Please create it first using the create_corpus tool.",
45
+ "corpus_name": corpus_name,
46
+ "paths": paths,
47
+ }
48
+
49
+ # Validate inputs
50
+ if not paths or not all(isinstance(path, str) for path in paths):
51
+ return {
52
+ "status": "error",
53
+ "message": "Invalid paths: Please provide a list of URLs or GCS paths",
54
+ "corpus_name": corpus_name,
55
+ "paths": paths,
56
+ }
57
+
58
+ # Pre-process paths to validate and convert Google Docs URLs to Drive format if needed
59
+ validated_paths = []
60
+ invalid_paths = []
61
+ conversions = []
62
+
63
+ for path in paths:
64
+ if not path or not isinstance(path, str):
65
+ invalid_paths.append(f"{path} (Not a valid string)")
66
+ continue
67
+
68
+ # Check for Google Docs/Sheets/Slides URLs and convert them to Drive format
69
+ docs_match = re.match(
70
+ r"https:\/\/docs\.google\.com\/(?:document|spreadsheets|presentation)\/d\/([a-zA-Z0-9_-]+)(?:\/|$)",
71
+ path,
72
+ )
73
+ if docs_match:
74
+ file_id = docs_match.group(1)
75
+ drive_url = f"https://drive.google.com/file/d/{file_id}/view"
76
+ validated_paths.append(drive_url)
77
+ conversions.append(f"{path} → {drive_url}")
78
+ continue
79
+
80
+ # Check for valid Drive URL format
81
+ drive_match = re.match(
82
+ r"https:\/\/drive\.google\.com\/(?:file\/d\/|open\?id=)([a-zA-Z0-9_-]+)(?:\/|$)",
83
+ path,
84
+ )
85
+ if drive_match:
86
+ # Normalize to the standard Drive URL format
87
+ file_id = drive_match.group(1)
88
+ drive_url = f"https://drive.google.com/file/d/{file_id}/view"
89
+ validated_paths.append(drive_url)
90
+ if drive_url != path:
91
+ conversions.append(f"{path} → {drive_url}")
92
+ continue
93
+
94
+ # Check for GCS paths
95
+ if path.startswith("gs://"):
96
+ validated_paths.append(path)
97
+ continue
98
+
99
+ # If we're here, the path wasn't in a recognized format
100
+ invalid_paths.append(f"{path} (Invalid format)")
101
+
102
+ # Check if we have any valid paths after validation
103
+ if not validated_paths:
104
+ return {
105
+ "status": "error",
106
+ "message": "No valid paths provided. Please provide Google Drive URLs or GCS paths.",
107
+ "corpus_name": corpus_name,
108
+ "invalid_paths": invalid_paths,
109
+ }
110
+
111
+ try:
112
+ # Get the corpus resource name
113
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
114
+
115
+ # Set up chunking configuration
116
+ transformation_config = rag.TransformationConfig(
117
+ chunking_config=rag.ChunkingConfig(
118
+ chunk_size=DEFAULT_CHUNK_SIZE,
119
+ chunk_overlap=DEFAULT_CHUNK_OVERLAP,
120
+ ),
121
+ )
122
+
123
+ # Import files to the corpus
124
+ import_result = rag.import_files(
125
+ corpus_resource_name,
126
+ validated_paths,
127
+ transformation_config=transformation_config,
128
+ max_embedding_requests_per_min=DEFAULT_EMBEDDING_REQUESTS_PER_MIN,
129
+ )
130
+
131
+ # Set this as the current corpus if not already set
132
+ if not tool_context.state.get("current_corpus"):
133
+ tool_context.state["current_corpus"] = corpus_name
134
+
135
+ # Build the success message
136
+ conversion_msg = ""
137
+ if conversions:
138
+ conversion_msg = " (Converted Google Docs URLs to Drive format)"
139
+
140
+ return {
141
+ "status": "success",
142
+ "message": f"Successfully added {import_result.imported_rag_files_count} file(s) to corpus '{corpus_name}'{conversion_msg}",
143
+ "corpus_name": corpus_name,
144
+ "files_added": import_result.imported_rag_files_count,
145
+ "paths": validated_paths,
146
+ "invalid_paths": invalid_paths,
147
+ "conversions": conversions,
148
+ }
149
+
150
+ except Exception as e:
151
+ return {
152
+ "status": "error",
153
+ "message": f"Error adding data to corpus: {str(e)}",
154
+ "corpus_name": corpus_name,
155
+ "paths": paths,
156
+ }
user_agent/tools/create_corpus.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for creating a new Vertex AI RAG corpus.
3
+ """
4
+
5
+ import re
6
+
7
+ from google.adk.tools.tool_context import ToolContext
8
+ from vertexai import rag
9
+
10
+ from ..config import (
11
+ DEFAULT_EMBEDDING_MODEL,
12
+ )
13
+ from .utils import check_corpus_exists
14
+
15
+
16
+ def create_corpus(
17
+ corpus_name: str,
18
+ tool_context: ToolContext,
19
+ ) -> dict:
20
+ """
21
+ Create a new Vertex AI RAG corpus with the specified name.
22
+
23
+ Args:
24
+ corpus_name (str): The name for the new corpus
25
+ tool_context (ToolContext): The tool context for state management
26
+
27
+ Returns:
28
+ dict: Status information about the operation
29
+ """
30
+ # Check if corpus already exists
31
+ if check_corpus_exists(corpus_name, tool_context):
32
+ return {
33
+ "status": "info",
34
+ "message": f"Corpus '{corpus_name}' already exists",
35
+ "corpus_name": corpus_name,
36
+ "corpus_created": False,
37
+ }
38
+
39
+ try:
40
+ # Clean corpus name for use as display name
41
+ display_name = re.sub(r"[^a-zA-Z0-9_-]", "_", corpus_name)
42
+
43
+ # Configure embedding model
44
+ embedding_model_config = rag.RagEmbeddingModelConfig(
45
+ vertex_prediction_endpoint=rag.VertexPredictionEndpoint(
46
+ publisher_model=DEFAULT_EMBEDDING_MODEL
47
+ )
48
+ )
49
+
50
+ # Create the corpus
51
+ rag_corpus = rag.create_corpus(
52
+ display_name=display_name,
53
+ backend_config=rag.RagVectorDbConfig(
54
+ rag_embedding_model_config=embedding_model_config
55
+ ),
56
+ )
57
+
58
+ # Update state to track corpus existence
59
+ tool_context.state[f"corpus_exists_{corpus_name}"] = True
60
+
61
+ # Set this as the current corpus
62
+ tool_context.state["current_corpus"] = corpus_name
63
+
64
+ return {
65
+ "status": "success",
66
+ "message": f"Successfully created corpus '{corpus_name}'",
67
+ "corpus_name": rag_corpus.name,
68
+ "display_name": rag_corpus.display_name,
69
+ "corpus_created": True,
70
+ }
71
+
72
+ except Exception as e:
73
+ return {
74
+ "status": "error",
75
+ "message": f"Error creating corpus: {str(e)}",
76
+ "corpus_name": corpus_name,
77
+ "corpus_created": False,
78
+ }
user_agent/tools/delete_corpus.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for deleting a Vertex AI RAG corpus when it's no longer needed.
3
+ """
4
+
5
+ from google.adk.tools.tool_context import ToolContext
6
+ from vertexai import rag
7
+
8
+ from .utils import check_corpus_exists, get_corpus_resource_name
9
+
10
+
11
+ def delete_corpus(
12
+ corpus_name: str,
13
+ confirm: bool,
14
+ tool_context: ToolContext,
15
+ ) -> dict:
16
+ """
17
+ Delete a Vertex AI RAG corpus when it's no longer needed.
18
+ Requires confirmation to prevent accidental deletion.
19
+
20
+ Args:
21
+ corpus_name (str): The full resource name of the corpus to delete.
22
+ Preferably use the resource_name from list_corpora results.
23
+ confirm (bool): Must be set to True to confirm deletion
24
+ tool_context (ToolContext): The tool context
25
+
26
+ Returns:
27
+ dict: Status information about the deletion operation
28
+ """
29
+ # Check if corpus exists
30
+ if not check_corpus_exists(corpus_name, tool_context):
31
+ return {
32
+ "status": "error",
33
+ "message": f"Corpus '{corpus_name}' does not exist",
34
+ "corpus_name": corpus_name,
35
+ }
36
+
37
+ # Check if deletion is confirmed
38
+ if not confirm:
39
+ return {
40
+ "status": "error",
41
+ "message": "Deletion requires explicit confirmation. Set confirm=True to delete this corpus.",
42
+ "corpus_name": corpus_name,
43
+ }
44
+
45
+ try:
46
+ # Get the corpus resource name
47
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
48
+
49
+ # Delete the corpus
50
+ rag.delete_corpus(corpus_resource_name)
51
+
52
+ # Remove from state by setting to False
53
+ state_key = f"corpus_exists_{corpus_name}"
54
+ if state_key in tool_context.state:
55
+ tool_context.state[state_key] = False
56
+
57
+ return {
58
+ "status": "success",
59
+ "message": f"Successfully deleted corpus '{corpus_name}'",
60
+ "corpus_name": corpus_name,
61
+ }
62
+ except Exception as e:
63
+ return {
64
+ "status": "error",
65
+ "message": f"Error deleting corpus: {str(e)}",
66
+ "corpus_name": corpus_name,
67
+ }
user_agent/tools/delete_document.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for deleting a specific document from a Vertex AI RAG corpus.
3
+ """
4
+
5
+ from google.adk.tools.tool_context import ToolContext
6
+ from vertexai import rag
7
+
8
+ from .utils import check_corpus_exists, get_corpus_resource_name
9
+
10
+
11
+ def delete_document(
12
+ corpus_name: str,
13
+ document_id: str,
14
+ tool_context: ToolContext,
15
+ ) -> dict:
16
+ """
17
+ Delete a specific document from a Vertex AI RAG corpus.
18
+
19
+ Args:
20
+ corpus_name (str): The full resource name of the corpus containing the document.
21
+ Preferably use the resource_name from list_corpora results.
22
+ document_id (str): The ID of the specific document/file to delete. This can be
23
+ obtained from get_corpus_info results.
24
+ tool_context (ToolContext): The tool context
25
+
26
+ Returns:
27
+ dict: Status information about the deletion operation
28
+ """
29
+ # Check if corpus exists
30
+ if not check_corpus_exists(corpus_name, tool_context):
31
+ return {
32
+ "status": "error",
33
+ "message": f"Corpus '{corpus_name}' does not exist",
34
+ "corpus_name": corpus_name,
35
+ "document_id": document_id,
36
+ }
37
+
38
+ try:
39
+ # Get the corpus resource name
40
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
41
+
42
+ # Delete the document
43
+ rag_file_path = f"{corpus_resource_name}/ragFiles/{document_id}"
44
+ rag.delete_file(rag_file_path)
45
+
46
+ return {
47
+ "status": "success",
48
+ "message": f"Successfully deleted document '{document_id}' from corpus '{corpus_name}'",
49
+ "corpus_name": corpus_name,
50
+ "document_id": document_id,
51
+ }
52
+ except Exception as e:
53
+ return {
54
+ "status": "error",
55
+ "message": f"Error deleting document: {str(e)}",
56
+ "corpus_name": corpus_name,
57
+ "document_id": document_id,
58
+ }
user_agent/tools/get_corpus_info.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for retrieving detailed information about a specific RAG corpus.
3
+ """
4
+
5
+ from google.adk.tools.tool_context import ToolContext
6
+ from vertexai import rag
7
+
8
+ from .utils import check_corpus_exists, get_corpus_resource_name
9
+
10
+
11
+ def get_corpus_info(
12
+ corpus_name: str,
13
+ tool_context: ToolContext,
14
+ ) -> dict:
15
+ """
16
+ Get detailed information about a specific RAG corpus, including its files.
17
+
18
+ Args:
19
+ corpus_name (str): The full resource name of the corpus to get information about.
20
+ Preferably use the resource_name from list_corpora results.
21
+ tool_context (ToolContext): The tool context
22
+
23
+ Returns:
24
+ dict: Information about the corpus and its files
25
+ """
26
+ try:
27
+ # Check if corpus exists
28
+ if not check_corpus_exists(corpus_name, tool_context):
29
+ return {
30
+ "status": "error",
31
+ "message": f"Corpus '{corpus_name}' does not exist",
32
+ "corpus_name": corpus_name,
33
+ }
34
+
35
+ # Get the corpus resource name
36
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
37
+
38
+ # Try to get corpus details first
39
+ corpus_display_name = corpus_name # Default if we can't get actual display name
40
+
41
+ # Process file information
42
+ file_details = []
43
+ try:
44
+ # Get the list of files
45
+ files = rag.list_files(corpus_resource_name)
46
+ for rag_file in files:
47
+ # Get document specific details
48
+ try:
49
+ # Extract the file ID from the name
50
+ file_id = rag_file.name.split("/")[-1]
51
+
52
+ file_info = {
53
+ "file_id": file_id,
54
+ "display_name": (
55
+ rag_file.display_name
56
+ if hasattr(rag_file, "display_name")
57
+ else ""
58
+ ),
59
+ "source_uri": (
60
+ rag_file.source_uri
61
+ if hasattr(rag_file, "source_uri")
62
+ else ""
63
+ ),
64
+ "create_time": (
65
+ str(rag_file.create_time)
66
+ if hasattr(rag_file, "create_time")
67
+ else ""
68
+ ),
69
+ "update_time": (
70
+ str(rag_file.update_time)
71
+ if hasattr(rag_file, "update_time")
72
+ else ""
73
+ ),
74
+ }
75
+
76
+ file_details.append(file_info)
77
+ except Exception:
78
+ # Continue to the next file
79
+ continue
80
+ except Exception:
81
+ # Continue without file details
82
+ pass
83
+
84
+ # Basic corpus info
85
+ return {
86
+ "status": "success",
87
+ "message": f"Successfully retrieved information for corpus '{corpus_display_name}'",
88
+ "corpus_name": corpus_name,
89
+ "corpus_display_name": corpus_display_name,
90
+ "file_count": len(file_details),
91
+ "files": file_details,
92
+ }
93
+
94
+ except Exception as e:
95
+ return {
96
+ "status": "error",
97
+ "message": f"Error getting corpus information: {str(e)}",
98
+ "corpus_name": corpus_name,
99
+ }
user_agent/tools/list_corpora.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for listing all available Vertex AI RAG corpora.
3
+ """
4
+
5
+ from typing import Dict, List, Union
6
+
7
+ from vertexai import rag
8
+
9
+
10
+ def list_corpora() -> dict:
11
+ """
12
+ List all available Vertex AI RAG corpora.
13
+
14
+ Returns:
15
+ dict: A list of available corpora and status, with each corpus containing:
16
+ - resource_name: The full resource name to use with other tools
17
+ - display_name: The human-readable name of the corpus
18
+ - create_time: When the corpus was created
19
+ - update_time: When the corpus was last updated
20
+ """
21
+ try:
22
+ # Get the list of corpora
23
+ corpora = rag.list_corpora()
24
+
25
+ # Process corpus information into a more usable format
26
+ corpus_info: List[Dict[str, Union[str, int]]] = []
27
+ for corpus in corpora:
28
+ corpus_data: Dict[str, Union[str, int]] = {
29
+ "resource_name": corpus.name, # Full resource name for use with other tools
30
+ "display_name": corpus.display_name,
31
+ "create_time": (
32
+ str(corpus.create_time) if hasattr(corpus, "create_time") else ""
33
+ ),
34
+ "update_time": (
35
+ str(corpus.update_time) if hasattr(corpus, "update_time") else ""
36
+ ),
37
+ }
38
+
39
+ corpus_info.append(corpus_data)
40
+
41
+ return {
42
+ "status": "success",
43
+ "message": f"Found {len(corpus_info)} available corpora",
44
+ "corpora": corpus_info,
45
+ }
46
+ except Exception as e:
47
+ return {
48
+ "status": "error",
49
+ "message": f"Error listing corpora: {str(e)}",
50
+ "corpora": [],
51
+ }
user_agent/tools/rag_query.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Tool for querying Vertex AI RAG corpora and retrieving relevant information.
3
+ """
4
+
5
+ import logging
6
+
7
+ from google.adk.tools.tool_context import ToolContext
8
+ from vertexai import rag
9
+
10
+ from ..config import (
11
+ DEFAULT_DISTANCE_THRESHOLD,
12
+ DEFAULT_TOP_K,
13
+ )
14
+ from .utils import check_corpus_exists, get_corpus_resource_name
15
+
16
+
17
+ def rag_query(
18
+ corpus_name: str,
19
+ query: str,
20
+ tool_context: ToolContext,
21
+ ) -> dict:
22
+ """
23
+ Query a Vertex AI RAG corpus with a user question and return relevant information.
24
+
25
+ Args:
26
+ corpus_name (str): The name of the corpus to query. If empty, the current corpus will be used.
27
+ Preferably use the resource_name from list_corpora results.
28
+ query (str): The text query to search for in the corpus
29
+ tool_context (ToolContext): The tool context
30
+
31
+ Returns:
32
+ dict: The query results and status
33
+ """
34
+ try:
35
+
36
+ # Check if the corpus exists
37
+ if not check_corpus_exists(corpus_name, tool_context):
38
+ return {
39
+ "status": "error",
40
+ "message": f"Corpus '{corpus_name}' does not exist. Please create it first using the create_corpus tool.",
41
+ "query": query,
42
+ "corpus_name": corpus_name,
43
+ }
44
+
45
+ # Get the corpus resource name
46
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
47
+
48
+ # Configure retrieval parameters
49
+ rag_retrieval_config = rag.RagRetrievalConfig(
50
+ top_k=DEFAULT_TOP_K,
51
+ filter=rag.Filter(vector_distance_threshold=DEFAULT_DISTANCE_THRESHOLD),
52
+ )
53
+
54
+ # Perform the query
55
+ print("Performing retrieval query...")
56
+ response = rag.retrieval_query(
57
+ rag_resources=[
58
+ rag.RagResource(
59
+ rag_corpus=corpus_resource_name,
60
+ )
61
+ ],
62
+ text=query,
63
+ rag_retrieval_config=rag_retrieval_config,
64
+ )
65
+
66
+ # Process the response into a more usable format
67
+ results = []
68
+ if hasattr(response, "contexts") and response.contexts:
69
+ for ctx_group in response.contexts.contexts:
70
+ result = {
71
+ "source_uri": (
72
+ ctx_group.source_uri if hasattr(ctx_group, "source_uri") else ""
73
+ ),
74
+ "source_name": (
75
+ ctx_group.source_display_name
76
+ if hasattr(ctx_group, "source_display_name")
77
+ else ""
78
+ ),
79
+ "text": ctx_group.text if hasattr(ctx_group, "text") else "",
80
+ "score": ctx_group.score if hasattr(ctx_group, "score") else 0.0,
81
+ }
82
+ results.append(result)
83
+
84
+ # If we didn't find any results
85
+ if not results:
86
+ return {
87
+ "status": "warning",
88
+ "message": f"No results found in corpus '{corpus_name}' for query: '{query}'",
89
+ "query": query,
90
+ "corpus_name": corpus_name,
91
+ "results": [],
92
+ "results_count": 0,
93
+ }
94
+
95
+ return {
96
+ "status": "success",
97
+ "message": f"Successfully queried corpus '{corpus_name}'",
98
+ "query": query,
99
+ "corpus_name": corpus_name,
100
+ "results": results,
101
+ "results_count": len(results),
102
+ }
103
+
104
+ except Exception as e:
105
+ error_msg = f"Error querying corpus: {str(e)}"
106
+ logging.error(error_msg)
107
+ return {
108
+ "status": "error",
109
+ "message": error_msg,
110
+ "query": query,
111
+ "corpus_name": corpus_name,
112
+ }
user_agent/tools/utils.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility functions for the RAG tools.
3
+ """
4
+
5
+ import logging
6
+ import re
7
+
8
+ from google.adk.tools.tool_context import ToolContext
9
+ from vertexai import rag
10
+
11
+ from ..config import (
12
+ LOCATION,
13
+ PROJECT_ID,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ def get_corpus_resource_name(corpus_name: str) -> str:
20
+ """
21
+ Convert a corpus name to its full resource name if needed.
22
+ Handles various input formats and ensures the returned name follows Vertex AI's requirements.
23
+
24
+ Args:
25
+ corpus_name (str): The corpus name or display name
26
+
27
+ Returns:
28
+ str: The full resource name of the corpus
29
+ """
30
+ logger.info(f"Getting resource name for corpus: {corpus_name}")
31
+
32
+ # If it's already a full resource name with the projects/locations/ragCorpora format
33
+ if re.match(r"^projects/[^/]+/locations/[^/]+/ragCorpora/[^/]+$", corpus_name):
34
+ return corpus_name
35
+
36
+ # Check if this is a display name of an existing corpus
37
+ try:
38
+ # List all corpora and check if there's a match with the display name
39
+ corpora = rag.list_corpora()
40
+ for corpus in corpora:
41
+ if hasattr(corpus, "display_name") and corpus.display_name == corpus_name:
42
+ return corpus.name
43
+ except Exception as e:
44
+ logger.warning(f"Error when checking for corpus display name: {str(e)}")
45
+ # If we can't check, continue with the default behavior
46
+ pass
47
+
48
+ # If it contains partial path elements, extract just the corpus ID
49
+ if "/" in corpus_name:
50
+ # Extract the last part of the path as the corpus ID
51
+ corpus_id = corpus_name.split("/")[-1]
52
+ else:
53
+ corpus_id = corpus_name
54
+
55
+ # Remove any special characters that might cause issues
56
+ corpus_id = re.sub(r"[^a-zA-Z0-9_-]", "_", corpus_id)
57
+
58
+ # Construct the standardized resource name
59
+ return f"projects/{PROJECT_ID}/locations/{LOCATION}/ragCorpora/{corpus_id}"
60
+
61
+
62
+ def check_corpus_exists(corpus_name: str, tool_context: ToolContext) -> bool:
63
+ """
64
+ Check if a corpus with the given name exists.
65
+
66
+ Args:
67
+ corpus_name (str): The name of the corpus to check
68
+ tool_context (ToolContext): The tool context for state management
69
+
70
+ Returns:
71
+ bool: True if the corpus exists, False otherwise
72
+ """
73
+ # Check state first if tool_context is provided
74
+ if tool_context.state.get(f"corpus_exists_{corpus_name}"):
75
+ return True
76
+
77
+ try:
78
+ # Get full resource name
79
+ corpus_resource_name = get_corpus_resource_name(corpus_name)
80
+
81
+ # List all corpora and check if this one exists
82
+ corpora = rag.list_corpora()
83
+ for corpus in corpora:
84
+ if (
85
+ corpus.name == corpus_resource_name
86
+ or corpus.display_name == corpus_name
87
+ ):
88
+ # Update state
89
+ tool_context.state[f"corpus_exists_{corpus_name}"] = True
90
+ # Also set this as the current corpus if no current corpus is set
91
+ if not tool_context.state.get("current_corpus"):
92
+ tool_context.state["current_corpus"] = corpus_name
93
+ return True
94
+
95
+ return False
96
+ except Exception as e:
97
+ logger.error(f"Error checking if corpus exists: {str(e)}")
98
+ # If we can't check, assume it doesn't exist
99
+ return False
100
+
101
+
102
+ def set_current_corpus(corpus_name: str, tool_context: ToolContext) -> bool:
103
+ """
104
+ Set the current corpus in the tool context state.
105
+
106
+ Args:
107
+ corpus_name (str): The name of the corpus to set as current
108
+ tool_context (ToolContext): The tool context for state management
109
+
110
+ Returns:
111
+ bool: True if the corpus exists and was set as current, False otherwise
112
+ """
113
+ # Check if corpus exists first
114
+ if check_corpus_exists(corpus_name, tool_context):
115
+ tool_context.state["current_corpus"] = corpus_name
116
+ return True
117
+ return False