Spaces:
Sleeping
Sleeping
File size: 17,787 Bytes
daba587 41cb4a2 daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 a7035da daba587 f3172b2 daba587 3839c42 daba587 3839c42 daba587 3839c42 daba587 3839c42 daba587 a7035da daba587 41cb4a2 daba587 a7035da daba587 a7035da daba587 41cb4a2 daba587 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 |
"""
Modified agent.py - Fixed with Hugging Face models instead of OpenAI
Fixes LangSmith authentication and missing PostgreSQL function issues
"""
import os
import logging
import warnings
from typing import List, Dict, Any, Optional, Union
import pandas as pd
from supabase import create_client, Client
# Suppress LangSmith warnings to avoid authentication errors
warnings.filterwarnings("ignore", category=UserWarning, module="langsmith")
logging.getLogger("langsmith").setLevel(logging.ERROR)
# Disable LangSmith tracing to avoid 401 errors
os.environ["LANGCHAIN_TRACING_V2"] = "false"
try:
from langchain.agents import AgentType, AgentExecutor, create_react_agent
from langchain.tools import BaseTool, tool
from langchain.memory import ConversationBufferMemory
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import SupabaseVectorStore
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate
# Hugging Face specific imports
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import torch
except ImportError as e:
print(f"Import error: {e}")
print("Please install required packages: pip install transformers sentence-transformers torch")
class RobotPaiAgent:
    """
    RobotPai Agent using Hugging Face models instead of OpenAI.

    Fixes authentication and database function issues: LangSmith tracing is
    disabled, embeddings/LLM come from local Hugging Face models, and document
    retrieval uses a Supabase vector store.
    """

    def __init__(self, model_name: str = "microsoft/DialoGPT-medium"):
        """Initialize all subsystems in dependency order.

        Args:
            model_name: Hugging Face model id kept as the default LLM name.
        """
        # Fixed: original print contained a mojibake'd robot emoji ("π€").
        print("🤖 Initializing RobotPai Agent with Hugging Face models...")
        self.model_name = model_name
        # Order matters: env vars → database client → models → vector store
        # (needs client + embeddings) → tools (need vector store) → agent.
        self.setup_environment()
        self.setup_supabase()
        self.setup_models()
        self.setup_vectorstore()
        self.setup_tools()
        self.setup_agent()
def setup_environment(self):
"""Setup environment variables with error handling"""
# Disable LangSmith to avoid authentication errors
os.environ["LANGCHAIN_TRACING_V2"] = "false"
# Required environment variables
self.supabase_url = os.getenv("SUPABASE_URL")
self.supabase_key = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
if not all([self.supabase_url, self.supabase_key]):
raise ValueError("Missing required environment variables: SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY")
print("β
Environment configured")
def setup_supabase(self):
"""Setup Supabase client and ensure database setup"""
try:
self.supabase_client: Client = create_client(self.supabase_url, self.supabase_key)
self.ensure_database_setup()
print("β
Supabase client initialized")
except Exception as e:
print(f"β οΈ Supabase setup failed: {e}")
self.supabase_client = None
def ensure_database_setup(self):
"""Ensure the database has required tables and functions"""
try:
# Check if documents table exists
result = self.supabase_client.table('documents').select('id').limit(1).execute()
print("β
Documents table exists")
except Exception as e:
print(f"β οΈ Database setup needed: {e}")
print("Please run the SQL setup in your Supabase dashboard:")
print("""
-- Enable pgvector extension
CREATE EXTENSION IF NOT EXISTS vector;
-- Create documents table
CREATE TABLE IF NOT EXISTS documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
content TEXT NOT NULL,
metadata JSONB DEFAULT '{}',
embedding VECTOR(384) -- Dimension for sentence-transformers
);
-- Create match_documents_langchain function
CREATE OR REPLACE FUNCTION match_documents_langchain(
query_embedding VECTOR(384),
match_count INT DEFAULT 10,
filter JSONB DEFAULT '{}'
)
RETURNS TABLE (
id UUID,
content TEXT,
metadata JSONB,
similarity FLOAT
)
LANGUAGE plpgsql
AS $$
BEGIN
RETURN QUERY
SELECT
documents.id,
documents.content,
documents.metadata,
1 - (documents.embedding <=> query_embedding) AS similarity
FROM documents
WHERE documents.metadata @> filter
ORDER BY documents.embedding <=> query_embedding
LIMIT match_count;
END;
$$;
""")
def setup_models(self):
"""Setup Hugging Face models for LLM and embeddings"""
try:
# Setup embeddings using sentence-transformers (faster and smaller)
print("π Loading embedding model...")
self.embeddings = HuggingFaceEmbeddings(
model_name="all-MiniLM-L6-v2", # 384 dimensions, fast and good quality
model_kwargs={'device': 'cpu'}, # Use CPU for compatibility
encode_kwargs={'normalize_embeddings': True}
)
print("β
Embeddings model loaded")
# Setup LLM using a lightweight model suitable for HF Spaces
print("π Loading language model...")
# Use a smaller, faster model for Hugging Face Spaces
model_id = "microsoft/DialoGPT-small" # Smaller model for faster inference
try:
# Create a text generation pipeline
self.llm_pipeline = pipeline(
"text-generation",
model=model_id,
tokenizer=model_id,
max_length=512,
temperature=0.7,
do_sample=True,
device_map="auto" if torch.cuda.is_available() else None,
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)
# Wrap in LangChain HuggingFacePipeline
self.llm = HuggingFacePipeline(
pipeline=self.llm_pipeline,
model_kwargs={"temperature": 0.7, "max_length": 512}
)
print(f"β
Language model loaded: {model_id}")
except Exception as e:
print(f"β οΈ Failed to load {model_id}: {e}")
# Fallback to a simple text completion
print("π Using fallback model...")
self.llm = self._create_fallback_llm()
except Exception as e:
print(f"β Model setup failed: {e}")
# Create minimal fallback
self.embeddings = None
self.llm = self._create_fallback_llm()
def _create_fallback_llm(self):
"""Create a simple fallback LLM for when models fail to load"""
class SimpleLLM:
def __call__(self, prompt: str) -> str:
return f"I'm a simple AI assistant. You asked: {prompt[:100]}... I would help you search documents and analyze data, but I need proper model setup."
def invoke(self, prompt: str) -> str:
return self.__call__(prompt)
return SimpleLLM()
def setup_vectorstore(self):
"""Setup vector store with proper error handling"""
if not self.supabase_client or not self.embeddings:
print("β οΈ Skipping vector store setup - missing dependencies")
self.vectorstore = None
return
try:
# Initialize vector store with correct function name
self.vectorstore = SupabaseVectorStore(
client=self.supabase_client,
embedding=self.embeddings,
table_name="documents",
query_name="match_documents_langchain" # Use the function we created
)
print("β
Vector store initialized")
except Exception as e:
print(f"β οΈ Vector store setup failed: {e}")
self.vectorstore = None
def setup_tools(self):
"""Setup tools for the agent"""
self.tools = []
# Document Search Tool
@tool
def search_documents(query: str) -> str:
"""Search for relevant documents in the knowledge base."""
if not self.vectorstore:
return "Vector store not available. Please check database setup."
try:
docs = self.vectorstore.similarity_search(query, k=3)
if docs:
results = []
for i, doc in enumerate(docs, 1):
content = doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content
results.append(f"Document {i}: {content}")
return "\n\n".join(results)
else:
return "No relevant documents found."
except Exception as e:
return f"Error searching documents: {str(e)}"
# CSV Analysis Tool
@tool
def analyze_csv_data(query: str) -> str:
"""Analyze CSV data and answer questions about it."""
try:
# Load the CSV file if it exists
if os.path.exists("supabase_docs.csv"):
df = pd.read_csv("supabase_docs.csv")
# Basic analysis based on query
if "rows" in query.lower() or "count" in query.lower():
return f"The CSV has {len(df)} rows and {len(df.columns)} columns."
elif "columns" in query.lower():
return f"Columns: {', '.join(df.columns.tolist())}"
elif "head" in query.lower() or "first" in query.lower():
return f"First 5 rows:\n{df.head().to_string()}"
else:
return f"CSV loaded with {len(df)} rows. Available columns: {', '.join(df.columns.tolist())}"
else:
return "CSV file not found. Please upload supabase_docs.csv"
except Exception as e:
return f"Error analyzing CSV: {str(e)}"
# General Q&A Tool
@tool
def answer_question(question: str) -> str:
"""Answer general questions using the language model."""
try:
# Simple prompt for the question
prompt = f"Question: {question}\nAnswer:"
response = self.llm.invoke(prompt)
return response if isinstance(response, str) else str(response)
except Exception as e:
return f"I'm unable to process that question right now. Error: {str(e)}"
self.tools = [search_documents, analyze_csv_data, answer_question]
print(f"β
{len(self.tools)} tools initialized")
def setup_agent(self):
"""Setup the agent with React framework"""
try:
# Create a simple prompt template
template = """Answer the following questions as best you can. You have access to the following tools:
{tools}
Use the following format:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question
Begin!
Question: {input}
Thought: {agent_scratchpad}"""
prompt = PromptTemplate.from_template(template)
# Create a simple agent using React pattern
if hasattr(self.llm, 'invoke'):
agent = create_react_agent(self.llm, self.tools, prompt)
self.agent_executor = AgentExecutor(
agent=agent,
tools=self.tools,
verbose=True,
max_iterations=3,
handle_parsing_errors=True,
return_intermediate_steps=True
)
else:
# Fallback for simple LLM
self.agent_executor = self._create_simple_executor()
print("β
Agent initialized successfully")
except Exception as e:
print(f"β οΈ Agent setup failed: {e}")
self.agent_executor = self._create_simple_executor()
def _create_simple_executor(self):
"""Create a simple executor when full agent setup fails"""
class SimpleExecutor:
def __init__(self, tools, llm):
self.tools = {tool.name: tool for tool in tools}
self.llm = llm
def invoke(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
query = inputs.get("input", "")
# Simple routing logic
if "document" in query.lower() or "search" in query.lower():
if "search_documents" in self.tools:
result = self.tools["search_documents"].invoke(query)
return {"output": result}
elif "csv" in query.lower() or "data" in query.lower():
if "analyze_csv_data" in self.tools:
result = self.tools["analyze_csv_data"].invoke(query)
return {"output": result}
else:
if "answer_question" in self.tools:
result = self.tools["answer_question"].invoke(query)
return {"output": result}
return {"output": f"I can help you with document search, CSV analysis, or general questions. You asked: {query}"}
return SimpleExecutor(self.tools, self.llm)
def add_documents(self, texts: List[str], metadatas: List[Dict] = None):
"""Add documents to the vector store"""
if not self.vectorstore:
print("β οΈ Vector store not available")
return False
try:
# Split long texts into chunks
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=500, # Smaller chunks for better performance
chunk_overlap=100
)
all_texts = []
all_metadatas = []
for i, text in enumerate(texts):
chunks = text_splitter.split_text(text)
all_texts.extend(chunks)
# Add metadata for each chunk
base_metadata = metadatas[i] if metadatas and i < len(metadatas) else {}
for j, chunk in enumerate(chunks):
chunk_metadata = base_metadata.copy()
chunk_metadata.update({"chunk_id": j, "source_doc": i})
all_metadatas.append(chunk_metadata)
# Add to vector store
ids = self.vectorstore.add_texts(all_texts, all_metadatas)
print(f"β
Added {len(ids)} document chunks to vector store")
return True
except Exception as e:
print(f"β Error adding documents: {e}")
return False
def process_query(self, query: str) -> str:
"""Process a user query through the agent"""
try:
if self.agent_executor:
response = self.agent_executor.invoke({"input": query})
return response.get("output", "Sorry, I couldn't process your query.")
else:
return "Agent not properly initialized. Please check your setup."
except Exception as e:
return f"Error processing query: {str(e)}"
def load_csv_for_analysis(self, file_path: str = "supabase_docs.csv") -> bool:
"""Load CSV data for analysis"""
try:
if not os.path.exists(file_path):
print(f"β οΈ CSV file not found: {file_path}")
return False
df = pd.read_csv(file_path)
print(f"β
Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
# Optionally add CSV content to vector store for searching
if self.vectorstore:
documents = []
for _, row in df.head(100).iterrows(): # Limit to first 100 rows
content = " | ".join([f"{col}: {val}" for col, val in row.items() if pd.notna(val)])
documents.append(content)
metadatas = [{"source": "csv_data", "row_id": i} for i in range(len(documents))]
self.add_documents(documents, metadatas)
print("β
CSV data added to vector store for searching")
return True
except Exception as e:
print(f"β Error loading CSV: {e}")
return False
# Utility function for direct usage
def create_agent():
    """Create and return a RobotPai agent instance, or None on failure."""
    try:
        return RobotPaiAgent()
    except Exception as e:
        # Swallow construction errors so callers get a clean None sentinel.
        print(f"Failed to create agent: {e}")
        return None
# For backward compatibility
def get_agent():
    """Backward-compatible alias that delegates to create_agent()."""
    return create_agent()