# carraraig's picture
# finish (#8)
# 5dd4236 verified
"""
ReAct Generate Node - Simplified version with 3 clear paths
Node that generates final response using:
1. DirectAnswerChain for direct answers (no tools)
2. ResearcherChain for researcher tool results
3. ToolResultChain for other tool results
All chains provide consistent formatting and professional presentation with memory context support.
Independent implementation for ReAct workflow - no dependency on AgenticRAG.
"""
from typing import Dict, Any
from ComputeAgent.chains.tool_result_chain import ToolResultChain
from ComputeAgent.models.model_manager import ModelManager
from constant import Constants
import asyncio
import logging
import json
from langgraph.config import get_stream_writer
from langchain_core.messages import HumanMessage, SystemMessage
# Shared model manager used by generate_node to load the synthesis LLM
model_manager = ModelManager()
# Module-level logger for this node (name contains spaces, so it is not a
# dotted-path logger; presumably intentional — TODO confirm)
logger = logging.getLogger("ReAct Generate Node")
def _create_error_response(state: Dict[str, Any], query: str, error_msg: str) -> Dict[str, Any]:
    """Build the standardized error payload, record it on a copy of the
    state, and push it over the custom stream.

    Args:
        state: Current workflow state (not mutated; a copy is returned).
        query: The user query that triggered the error.
        error_msg: Human-readable description of what went wrong.

    Returns:
        A copy of ``state`` carrying the apology response, the payload
        under ``final_response_dict``, and the ``generate_complete`` marker.
    """
    payload = {
        "query": query,
        "final_response": f"I apologize, but I encountered an error: {error_msg}",
        "sources": []
    }

    new_state = state.copy()
    new_state.update(
        response=payload["final_response"],
        final_response_dict=payload,
        current_step="generate_complete",
    )

    # Emit the payload on the custom stream channel as well.
    get_stream_writer()({"final_response_dict": payload})
    return new_state
async def _generate_deployment_instructions(state: Dict[str, Any]) -> Dict[str, Any]:
"""
Generate deployment instructions when instance has been created.
Args:
state: Current state with instance_id and deployment info
Returns:
Updated state with deployment instructions
"""
logger.info("πŸ“ Generating deployment instructions")
# Extract deployment information
instance_id = state.get("instance_id", "")
instance_status = state.get("instance_status", "")
model_name = state.get("model_name", "Unknown Model")
model_info = state.get("model_info", {})
gpu_requirements = state.get("gpu_requirements", {})
estimated_gpu_memory = state.get("estimated_gpu_memory", 0)
# Get deployment configuration
location = model_info.get("location", "UAE-1")
gpu_type = model_info.get("GPU_type", "RTX 4090")
num_gpus = gpu_requirements.get(gpu_type, 1)
config = f"{num_gpus}x {gpu_type}"
# Determine capacity source
custom_capacity = state.get("custom_capacity", {})
capacity_source = "custom" if custom_capacity else "estimated"
# Build SSH command
ssh_command = f'ssh -i ~/.ssh/id_rsa -o "ProxyCommand=ssh bastion@ssh.hivecompute.ai %h" ubuntu@{instance_id}.ssh.hivecompute.ai'
# Get capacity estimation parameters
max_model_len = model_info.get("max_model_len", 2048)
max_num_seqs = model_info.get("max_num_seqs", 256)
max_batched_tokens = model_info.get("max_num_batched_tokens", 2048)
dtype = model_info.get("dtype", "BF16")
kv_cache_dtype = model_info.get("kv_cache_dtype", "auto")
gpu_memory_utilization = model_info.get("gpu_memory_utilization", 0.9)
# Use LLM to generate optimal vLLM command based on documentation and specs
logger.info("πŸ€– Using LLM to determine optimal vLLM parameters")
# Import vLLM documentation
try:
from vllm_engine_args import get_vllm_docs
vllm_docs = get_vllm_docs()
except ImportError:
logger.warning("⚠️ Could not import vllm_engine_args, using basic documentation")
vllm_docs = "Basic vLLM parameters: --model, --dtype, --max-model-len, --gpu-memory-utilization, --tensor-parallel-size, --enable-prefix-caching, --enable-chunked-prefill"
vllm_params_prompt = f"""You are an expert in vLLM deployment. Based on the model specifications and capacity estimation, generate an optimal vLLM serve command.
**Model Information:**
- Model: {model_name}
- GPU Type: {gpu_type}
- Number of GPUs: {num_gpus}
- GPU Memory: {estimated_gpu_memory:.2f} GB
- Location: {location}
**Capacity Estimation Parameters:**
- Max Model Length: {max_model_len}
- Max Sequences: {max_num_seqs}
- Max Batched Tokens: {max_batched_tokens}
- Data Type: {dtype}
- KV Cache dtype: {kv_cache_dtype}
- GPU Memory Utilization: {gpu_memory_utilization}
**vLLM Engine Arguments Documentation:**
{vllm_docs}
**Task:**
Generate the optimal vLLM serve command for this deployment. Consider:
1. Use the capacity estimation parameters provided
2. For multi-GPU setups ({num_gpus} GPUs), add --tensor-parallel-size {num_gpus} if num_gpus > 1
3. Add --enable-chunked-prefill if max_model_len > 8192 for better long context handling
4. Use --quantization fp8 only if dtype contains 'fp8' or 'FP8'
5. Always include --enable-prefix-caching for better performance
6. Set --host 0.0.0.0 and --port 8888
7. Use --download-dir /home/ubuntu/workspace/models
8. Consider other relevant parameters from the documentation based on the model and hardware specs
Return ONLY the complete vLLM command without any explanation, starting with 'vllm serve'."""
try:
from langchain_openai import ChatOpenAI
from constant import Constants
llm = ChatOpenAI(
base_url=Constants.LLM_BASE_URL,
api_key=Constants.LLM_API_KEY,
model=Constants.DEFAULT_LLM_NAME,
temperature=0.0
)
vllm_response = await llm.ainvoke(vllm_params_prompt)
vllm_command = vllm_response.content.strip()
logger.info(f"βœ… Generated vLLM command: {vllm_command}")
except Exception as e:
logger.error(f"❌ Failed to generate vLLM command with LLM: {e}")
# Fallback to basic command
quantization = "fp8" if "fp8" in dtype.lower() else None
vllm_command = f'vllm serve {model_name} --download-dir /home/ubuntu/workspace/models --gpu-memory-utilization {gpu_memory_utilization} --max-model-len {max_model_len} --max-num-seqs {max_num_seqs} --max-num-batched-tokens {max_batched_tokens} --dtype {dtype}'
if quantization:
vllm_command += f' --quantization {quantization}'
if num_gpus > 1:
vllm_command += f' --tensor-parallel-size {num_gpus}'
vllm_command += f' --kv-cache-dtype {kv_cache_dtype} --enable-prefix-caching --host 0.0.0.0 --port 8888'
# Build curl test command
curl_command = f'''curl -k https://{instance_id}-8888.tenants.hivecompute.ai/v1/chat/completions \\
-H "Content-Type: application/json" \\
-d '{{
"model": "{model_name}",
"messages": [
{{"role": "user", "content": "What is the capital of France?"}}
],
"max_tokens": 512
}}' '''
# Build complete deployment instructions response
final_response = f"""
# πŸš€ Deployment Instructions for {model_name}
## βœ… Instance Created Successfully
**Instance ID:** `{instance_id}`
**Status:** `{instance_status}`
**Location:** `{location}`
**Configuration:** `{config}`
---
## πŸ“Š Capacity Configuration
- **GPU Memory Required:** {estimated_gpu_memory:.2f} GB
- **GPU Type:** {gpu_type}
- **Number of GPUs:** {num_gpus}
- **Capacity Source:** {capacity_source}
---
## πŸ” Step 1: SSH to the Instance
```bash
{ssh_command}
```
---
## πŸ“ Step 2: Create Models Directory
Once connected via SSH, create the models directory inside the workspace:
```bash
mkdir -p /home/ubuntu/workspace/models
mkdir -p /home/ubuntu/workspace/tmpdir
```
**Note:** Cannot use docker file in HiveCompute since there is no VM support. Use an instance from HiveCompute with Template with Pytorch.
---
## πŸ“¦ Step 3: Install Dependencies (Using UV)
Install UV package manager:
```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
```
Create and activate environment:
```bash
uv venv --python 3.12
source .venv/bin/activate
```
Install vLLM and dependencies:
```bash
uv pip install vllm==0.11.0 ray[default]
```
---
## πŸ€– Step 4: Start vLLM Server
Run the vLLM server with the following configuration:
```bash
{vllm_command}
```
**Configuration Details:**
The vLLM command above was intelligently generated based on:
- **Model Specifications:** {model_name} with {num_gpus}x {gpu_type}
- **Capacity Estimation:** {estimated_gpu_memory:.2f} GB GPU memory, {int(gpu_memory_utilization * 100)}% utilization
- **Context Length:** {max_model_len} tokens
- **Batch Configuration:** {max_num_seqs} max sequences, {max_batched_tokens} max batched tokens
- **Data Type:** {dtype} with {kv_cache_dtype} KV cache
- **vLLM Documentation:** Optimized parameters from https://docs.vllm.ai/en/v0.7.2/serving/engine_args.html
The LLM analyzed your deployment requirements and selected optimal parameters including tensor parallelism, chunked prefill, and caching strategies.
---
## πŸ§ͺ Step 5: Test the Deployment
Test your deployed model with a curl command:
```bash
{curl_command}
```
This will send a test request to your model and verify it's responding correctly.
---
## πŸ“ Additional Notes
- The vLLM server will download the model to `/home/ubuntu/workspace/models` on first run
- Make sure to monitor GPU memory usage during model loading
- The instance is accessible via the HiveCompute tenant URL: `https://{instance_id}-8888.tenants.hivecompute.ai`
- For production use, consider setting up monitoring and health checks
---
**Deployment Complete! πŸŽ‰**
"""
final_response_dict = {
"query": f"Deploy model {model_name}",
"final_response": final_response,
"instance_id": instance_id,
"instance_status": instance_status,
"sources": []
}
# Update state
updated_state = state.copy()
updated_state["response"] = final_response
updated_state["final_response_dict"] = final_response_dict
updated_state["current_step"] = "generate_complete"
# Remove tools to avoid serialization issues
if "tools" in updated_state:
del updated_state["tools"]
# Send via custom stream
writer = get_stream_writer()
writer({"final_response_dict": final_response_dict})
logger.info("βœ… Deployment instructions generated successfully")
return updated_state
async def _handle_tool_results(state: Dict[str, Any], query: str, user_id: str, session_id: str,
                               tool_results: list, memory_context: str, llm) -> Dict[str, Any]:
    """Handle general tool results using ToolResultChain.

    Args:
        state: Current workflow state (not mutated; a copy is returned).
        query: The original user query.
        user_id: Kept for interface compatibility; unused here.
        session_id: Kept for interface compatibility; unused here.
        tool_results: Tool message objects (or strings) to synthesize.
        memory_context: Pre-built memory context string (may be empty).
        llm: The language model instance passed to ToolResultChain.

    Returns:
        Updated state copy with the formatted (or raw-fallback) response
        and the ``final_response_dict`` payload, also emitted on the
        custom stream.
    """

    def _finalize(response_text: str) -> Dict[str, Any]:
        # Shared tail for both paths: package the response, update the
        # state copy, and emit the payload on the custom stream.
        final_response_dict = {
            "query": query,
            "final_response": response_text,
            "sources": []
        }
        updated_state = state.copy()
        updated_state["response"] = response_text
        updated_state["final_response_dict"] = final_response_dict
        updated_state["current_step"] = "generate_complete"
        writer = get_stream_writer()
        writer({"final_response_dict": final_response_dict})
        return updated_state

    try:
        logger.info("πŸ€– Synthesizing tool results using ToolResultChain...")
        tool_result_chain = ToolResultChain(llm=llm)
        formatted_response = await tool_result_chain.ainvoke(query, tool_results, memory_context)
        updated_state = _finalize(formatted_response)
        logger.info("βœ… Tool results synthesized successfully")
        return updated_state
    except Exception as e:
        logger.error(f"❌ ToolResultChain Error: {str(e)}")
        # Final fallback: surface the raw tool output instead of failing.
        fallback_response = "I executed the requested tools but encountered formatting issues. Here are the raw results:\n\n"
        for i, result in enumerate(tool_results, 1):
            content = result.content if hasattr(result, 'content') else str(result)
            fallback_response += f"Tool {i}: {content}\n"
        updated_state = _finalize(fallback_response)
        logger.info("βœ… Tool results formatted using raw content fallback")
        return updated_state
def _finish_generation(state: Dict[str, Any], query: str, response_text: str) -> Dict[str, Any]:
    """Package a successful response consistently with the other paths:
    build ``final_response_dict``, update a state copy, drop the
    non-serializable tools, and emit the payload on the custom stream.

    Previously the direct-answer and synthesis success paths skipped the
    payload/stream step that every other path performs; this helper makes
    all paths behave the same.
    """
    final_response_dict = {
        "query": query,
        "final_response": response_text,
        "sources": []
    }
    updated_state = state.copy()
    updated_state["response"] = response_text
    updated_state["final_response_dict"] = final_response_dict
    updated_state["current_step"] = "generate_complete"
    # Tools are not serializable; strip them before returning state.
    if "tools" in updated_state:
        del updated_state["tools"]
    writer = get_stream_writer()
    writer({"final_response_dict": final_response_dict})
    return updated_state


async def generate_node(state: Dict[str, Any]) -> Dict[str, Any]:
    """
    Simple response generation with 4 clear paths:
    1. Deployment Instructions (when instance_created == True)
    2. Direct LLM Answer (when no tool_results exist)
    3. LLM synthesis of tool results (when tool_results exist)
    4. ToolResultChain fallback (when LLM synthesis fails)

    Args:
        state: Current ReAct state.

    Returns:
        Updated state with generated response, ``final_response_dict``,
        and ``current_step`` set to ``generate_complete``.
    """
    logger.info("πŸ€– Starting response generation")

    # Extract common variables
    query = state.get("query", "")
    user_id = state.get("user_id", "")
    session_id = state.get("session_id", "")
    current_step = state.get("current_step", "")
    tool_results = state.get("tool_results", [])
    existing_response = state.get("response", "")
    researcher_used = state.get("researcher_used", False)
    instance_created = state.get("instance_created", False)

    # Debug logging to help diagnose path selection
    logger.info(f"πŸ” DEBUG - instance_created: {instance_created}, researcher_used: {researcher_used}, tool_results count: {len(tool_results)}, current_step: {current_step}, existing_response: {bool(existing_response)}")

    # Path 1: deployment workflow gets its own dedicated generator
    if instance_created:
        logger.info("πŸš€ Deployment mode detected - generating deployment instructions")
        return await _generate_deployment_instructions(state)

    # Build memory context once (best-effort; failures are non-fatal)
    memory_context = ""
    if user_id and session_id:
        try:
            from helpers.memory import get_memory_manager
            memory_manager = get_memory_manager()
            memory_context = await memory_manager.build_context_for_node(user_id, session_id, "general")
            if memory_context:
                logger.info("🧠 Using memory context for response generation")
        except Exception as e:
            logger.warning(f"⚠️ Could not load memory context: {e}")

    # Resolve the model name once (prefer the refining LLM's model if set)
    model_name = Constants.DEFAULT_LLM_NAME
    refining_llm = state.get("refining_llm")
    if hasattr(refining_llm, 'model_name'):
        model_name = refining_llm.model_name

    try:
        llm = await model_manager.load_llm_model(model_name)
    except Exception as e:
        logger.error(f"❌ Failed to load model {model_name}: {e}")
        return _create_error_response(state, query, "Failed to load language model")

    # Path 2: no tool results -> generate a direct LLM response
    if not tool_results:
        logger.info("ℹ️ No tool results found - generating LLM response")
        system_prompt = """You are a helpful AI assistant. The user has made a request and you need to provide a comprehensive and helpful response.
If there's an existing response or context, acknowledge it and build upon it.
Be professional, clear, and concise in your response.
If you don't have specific information to provide, politely explain what you can help with instead."""
        context_info = f"Query: {query}"
        if existing_response:
            context_info += f"\nExisting context: {existing_response}"
        if memory_context:
            context_info += f"\nMemory context: {memory_context}"
        messages = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=context_info)
        ]
        try:
            response = await llm.ainvoke(messages)
            updated_state = _finish_generation(state, query, response.content)
            logger.info("βœ… Generated LLM response successfully")
            return updated_state
        except Exception as e:
            logger.error(f"❌ Error generating LLM response: {str(e)}")
            return _create_error_response(state, query, f"Failed to generate response: {str(e)}")

    # Path 3: tool results exist -> synthesize them with the LLM
    logger.info("πŸ”§ Processing tool results using LLM synthesis")
    tool_results_summary = ""
    for i, result in enumerate(tool_results, 1):
        content = result.content if hasattr(result, 'content') else str(result)
        tool_name = getattr(result, 'name', f'Tool {i}')
        tool_results_summary += f"\n{tool_name}: {content}\n"
    system_prompt = """You are a helpful AI assistant that synthesizes tool execution results into a comprehensive response.
Your task is to:
1. Analyze the tool results provided
2. Generate a clear, professional response that summarizes what was accomplished
3. Present the information in a well-structured format
4. If there are any errors or issues, explain them clearly
5. Be concise but thorough in your explanation
Always maintain a helpful and professional tone."""
    context_info = f"Query: {query}\n\nTool Results:{tool_results_summary}"
    if memory_context:
        context_info += f"\nMemory context: {memory_context}"
    messages = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=context_info)
    ]
    try:
        response = await llm.ainvoke(messages)
        updated_state = _finish_generation(state, query, response.content)
        logger.info("βœ… Synthesized tool results successfully using LLM")
        return updated_state
    except Exception as e:
        logger.error(f"❌ Error synthesizing tool results with LLM: {str(e)}")
        # Path 4: fallback to ToolResultChain if LLM synthesis fails
        logger.info("πŸ”„ Falling back to ToolResultChain")
        return await _handle_tool_results(state, query, user_id, session_id, tool_results, memory_context, llm)