# NOTE: the three lines below are residue from the HuggingFace Spaces page
# this file was copied from; kept as comments so the module stays valid Python.
# Isateles's picture
# Updated agent
# e01c471
"""
My Persona Database - RAG Implementation
This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together
The goal is to have a database I can query like "find me creative people"
and get back actual persona descriptions.
Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""
# Standard library
import logging
import os
from pathlib import Path
from typing import List, Optional

# Core LlamaIndex stuff
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# For embeddings and vector storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
# External stuff
# Optional third-party imports: probe availability at import time so the rest
# of the module can degrade gracefully (fallback personas, clear error logs)
# instead of crashing in a minimal environment.
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True  # HuggingFace `datasets` is importable
except ImportError:
    CAN_LOAD_DATASETS = False

try:
    import chromadb
    CHROMADB_WORKS = True  # ChromaDB is importable; vector store can be built
except ImportError:
    CHROMADB_WORKS = False

# Module-level logger; configured by the host app (or the __main__ block below).
logger = logging.getLogger(__name__)

# My settings
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"  # HF dataset id to stream personas from
MAX_PERSONAS = 300 # Keep it small for HF Spaces
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" # This one works well
CHUNK_SIZE = 400 # Smaller chunks work better
# Cache so I don't rebuild this every time (populated lazily by get_persona_index)
_my_persona_index = None
def make_sample_personas():
    """Return a small, hand-written list of persona description strings.

    Serves as an offline fallback whenever the HuggingFace dataset cannot
    be downloaded, so the agent always has something to index.
    """
    fallback_personas = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",
        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",
        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",
        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",
        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",
        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",
        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",
        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",
        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",
        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby.",
    ]
    logger.info(f"Created {len(fallback_personas)} backup personas")
    return fallback_personas
def download_personas():
    """Fetch persona descriptions, preferring the HuggingFace dataset.

    Streams up to MAX_PERSONAS records from PERSONA_DATASET. Any failure
    (missing `datasets` library, network error) — or an empty result —
    falls back to make_sample_personas(), so callers always get a
    non-empty list of strings.

    Returns:
        list[str]: persona description strings.
    """
    logger.info("Trying to download persona dataset...")
    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()
    try:
        # Streaming avoids materializing the whole dataset in memory.
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)
        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # Don't go over my limit
                break
            # BUG FIX: a missing or non-string "persona" field used to raise
            # inside this loop and throw away everything downloaded so far;
            # malformed records are now just skipped.
            persona_text = item.get("persona", "")
            if isinstance(persona_text, str) and persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")
            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")
        if not personas:
            # Robustness: an all-empty dataset would otherwise propagate an
            # empty list and make index building fail later.
            logger.warning("Dataset yielded no usable personas, using backups")
            return make_sample_personas()
        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas
    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()
def make_documents(personas):
    """Wrap each persona string in a LlamaIndex Document.

    Each document carries metadata ("source", "persona_id", "type") so
    retrieved chunks can be traced back to their originating persona.
    """
    logger.info(f"Making documents from {len(personas)} personas...")
    docs = [
        Document(
            text=persona_text,
            metadata={
                "source": f"persona_{idx}",
                "persona_id": idx,
                "type": "persona_description",
            },
        )
        for idx, persona_text in enumerate(personas)
    ]
    logger.info(f"Created {len(docs)} documents")
    return docs
def setup_vector_store():
    """Create an in-memory ChromaDB collection wrapped for LlamaIndex.

    In-memory storage keeps the module working inside HuggingFace Spaces,
    where persistent files are awkward.

    Returns:
        ChromaVectorStore on success, or None when chromadb is missing or
        setup fails (callers treat None as "no vector storage").
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None
    try:
        logger.info("Setting up in-memory vector store...")
        # In-memory client (no files to worry about)
        chroma_client = chromadb.Client()
        persona_collection = chroma_client.get_or_create_collection("my_personas")
        # Wrap it for LlamaIndex
        store = ChromaVectorStore(chroma_collection=persona_collection)
        logger.info("Vector store ready!")
        return store
    except Exception as exc:
        logger.error(f"Vector store setup failed: {exc}")
        return None
def build_persona_index():
    """Build the persona vector index from scratch.

    Pipeline: download personas -> wrap them as Documents -> create the
    Chroma vector store -> load the embedding model -> index everything.

    Returns:
        VectorStoreIndex on success, or None on any failure (all errors are
        logged rather than raised so the agent can keep running).
    """
    logger.info("Building persona index...")
    try:
        # Step 1: Get the persona data
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None
        # Step 2: Make documents
        documents = make_documents(personas)
        # Step 3: Set up vector storage
        vector_store = setup_vector_store()
        if not vector_store:
            logger.error("Can't create vector store")
            return None
        # Step 4: Set up embeddings
        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None
        # Step 5: Build the index.
        # BUG FIX: from_documents() silently ignores a bare `vector_store=`
        # kwarg, so the Chroma store was never actually used and LlamaIndex
        # fell back to its default in-memory store. The store must be handed
        # over through a StorageContext.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        logger.info("Creating vector index... this might take a moment")
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True,
        )
        logger.info("Persona index built successfully!")
        return index
    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None
def get_persona_index():
    """Return the module-wide persona index, building it on first use.

    The result is memoized in _my_persona_index; a failed build leaves the
    cache as None, so later calls retry the build.
    """
    global _my_persona_index
    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index
    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index
def get_persona_query_engine(llm=None):
    """Build a query engine over the persona index for use by agent tools.

    Args:
        llm: optional LLM passed through to LlamaIndex; None means its default.

    Returns:
        A query engine, or None when the index is unavailable or setup fails.
    """
    try:
        persona_index = get_persona_index()
        if persona_index is None:
            logger.warning("No persona index available")
            return None
        # Make the query engine
        engine = persona_index.as_query_engine(
            llm=llm,  # Use the LLM from my agent
            response_mode="tree_summarize",  # Good for combining multiple results
            similarity_top_k=3,  # Get top 3 matches
            streaming=False,
        )
        logger.info("Persona query engine ready")
        return engine
    except Exception as err:
        logger.error(f"Query engine creation failed: {err}")
        return None
def test_my_personas():
    """Smoke-test the persona pipeline end to end.

    Checks optional dependencies, persona loading, vector-store creation,
    and a tiny index build plus one query. Progress is printed; returns
    True only when every stage succeeds.
    """
    print("\n=== Testing My Persona Database ===")
    # Check dependencies
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")
    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False
    # Test data loading
    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False
    # Test vector store
    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store:
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False
    # Test index building (small test)
    print("\nTesting index building...")
    try:
        # Use just a few personas for testing
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)
        vector_store = setup_vector_store()
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
        # BUG FIX: the Chroma store must go through a StorageContext;
        # from_documents() silently ignores a bare `vector_store=` kwarg.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            test_docs,
            storage_context=storage_context,
            embed_model=embed_model,
        )
        print("✅ Index building works")
        # Test a simple query (result unused; we only care that it runs)
        query_engine = index.as_query_engine(similarity_top_k=1)
        query_engine.query("software developer")
        print("✅ Query test passed")
        return True
    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False
if __name__ == "__main__":
    # Run the self-test with INFO logging so progress messages are visible.
    # (Redundant nested `import logging` removed — the module already
    # imports logging at the top of the file.)
    logging.basicConfig(level=logging.INFO)
    print("Testing My Persona Database System")
    print("=" * 40)
    success = test_my_personas()
    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")
    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")