# NOTE: the three lines below are residue from the HuggingFace Spaces page
# this file was copied from; kept as comments so the module stays valid Python.
# Isateles's picture
# Updated agent
# e01c471
"""
My Persona Database - RAG Implementation
This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together
The goal is to have a database I can query like "find me creative people"
and get back actual persona descriptions.
Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""
# Standard library
import logging
import os
from pathlib import Path
from typing import List, Optional

# Core LlamaIndex stuff
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# For embeddings and vector storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
# External stuff
# Optional third-party imports: probe availability at import time so the rest
# of the module can degrade gracefully (fallback personas, clear error logs)
# instead of crashing in a minimal environment.
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True  # HuggingFace `datasets` is importable
except ImportError:
    CAN_LOAD_DATASETS = False

try:
    import chromadb
    CHROMADB_WORKS = True  # ChromaDB is importable; vector store can be built
except ImportError:
    CHROMADB_WORKS = False

# Module-level logger; configured by the host app (or the __main__ block below).
logger = logging.getLogger(__name__)

# My settings
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"  # HF dataset id to stream personas from
MAX_PERSONAS = 300 # Keep it small for HF Spaces
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" # This one works well
CHUNK_SIZE = 400 # Smaller chunks work better
# Cache so I don't rebuild this every time (populated lazily by get_persona_index)
_my_persona_index = None
def make_sample_personas():
    """Return a small, hand-written list of persona description strings.

    Serves as an offline fallback whenever the HuggingFace dataset cannot
    be downloaded, so the agent always has something to index.
    """
    fallback_personas = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",
        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",
        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",
        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",
        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",
        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",
        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",
        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",
        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",
        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby.",
    ]
    logger.info(f"Created {len(fallback_personas)} backup personas")
    return fallback_personas
def download_personas():
    """Fetch persona descriptions, preferring the HuggingFace dataset.

    Streams up to MAX_PERSONAS records from PERSONA_DATASET. Any failure
    (missing `datasets` library, network error) — or an empty result —
    falls back to make_sample_personas(), so callers always get a
    non-empty list of strings.

    Returns:
        list[str]: persona description strings.
    """
    logger.info("Trying to download persona dataset...")
    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()
    try:
        # Streaming avoids materializing the whole dataset in memory.
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)
        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # Don't go over my limit
                break
            # BUG FIX: a missing or non-string "persona" field used to raise
            # inside this loop and throw away everything downloaded so far;
            # malformed records are now just skipped.
            persona_text = item.get("persona", "")
            if isinstance(persona_text, str) and persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")
            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")
        if not personas:
            # Robustness: an all-empty dataset would otherwise propagate an
            # empty list and make index building fail later.
            logger.warning("Dataset yielded no usable personas, using backups")
            return make_sample_personas()
        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas
    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()
def make_documents(personas):
    """Wrap each persona string in a LlamaIndex Document.

    Each document carries metadata ("source", "persona_id", "type") so
    retrieved chunks can be traced back to their originating persona.
    """
    logger.info(f"Making documents from {len(personas)} personas...")
    docs = [
        Document(
            text=persona_text,
            metadata={
                "source": f"persona_{idx}",
                "persona_id": idx,
                "type": "persona_description",
            },
        )
        for idx, persona_text in enumerate(personas)
    ]
    logger.info(f"Created {len(docs)} documents")
    return docs
def setup_vector_store():
    """Create an in-memory ChromaDB collection wrapped for LlamaIndex.

    In-memory storage keeps the module working inside HuggingFace Spaces,
    where persistent files are awkward.

    Returns:
        ChromaVectorStore on success, or None when chromadb is missing or
        setup fails (callers treat None as "no vector storage").
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None
    try:
        logger.info("Setting up in-memory vector store...")
        # In-memory client (no files to worry about)
        chroma_client = chromadb.Client()
        persona_collection = chroma_client.get_or_create_collection("my_personas")
        # Wrap it for LlamaIndex
        store = ChromaVectorStore(chroma_collection=persona_collection)
        logger.info("Vector store ready!")
        return store
    except Exception as exc:
        logger.error(f"Vector store setup failed: {exc}")
        return None
def build_persona_index():
    """Build the persona vector index from scratch.

    Pipeline: download personas -> wrap them as Documents -> create the
    Chroma vector store -> load the embedding model -> index everything.

    Returns:
        VectorStoreIndex on success, or None on any failure (all errors are
        logged rather than raised so the agent can keep running).
    """
    logger.info("Building persona index...")
    try:
        # Step 1: Get the persona data
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None
        # Step 2: Make documents
        documents = make_documents(personas)
        # Step 3: Set up vector storage
        vector_store = setup_vector_store()
        if not vector_store:
            logger.error("Can't create vector store")
            return None
        # Step 4: Set up embeddings
        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None
        # Step 5: Build the index.
        # BUG FIX: from_documents() silently ignores a bare `vector_store=`
        # kwarg, so the Chroma store was never actually used and LlamaIndex
        # fell back to its default in-memory store. The store must be handed
        # over through a StorageContext.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        logger.info("Creating vector index... this might take a moment")
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True,
        )
        logger.info("Persona index built successfully!")
        return index
    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None
def get_persona_index():
    """Return the module-wide persona index, building it on first use.

    The result is memoized in _my_persona_index; a failed build leaves the
    cache as None, so later calls retry the build.
    """
    global _my_persona_index
    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index
    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index
def get_persona_query_engine(llm=None):
    """Build a query engine over the persona index for use by agent tools.

    Args:
        llm: optional LLM passed through to LlamaIndex; None means its default.

    Returns:
        A query engine, or None when the index is unavailable or setup fails.
    """
    try:
        persona_index = get_persona_index()
        if persona_index is None:
            logger.warning("No persona index available")
            return None
        # Make the query engine
        engine = persona_index.as_query_engine(
            llm=llm,  # Use the LLM from my agent
            response_mode="tree_summarize",  # Good for combining multiple results
            similarity_top_k=3,  # Get top 3 matches
            streaming=False,
        )
        logger.info("Persona query engine ready")
        return engine
    except Exception as err:
        logger.error(f"Query engine creation failed: {err}")
        return None
def test_my_personas():
    """Smoke-test the persona pipeline end to end.

    Checks optional dependencies, persona loading, vector-store creation,
    and a tiny index build plus one query. Progress is printed; returns
    True only when every stage succeeds.
    """
    print("\n=== Testing My Persona Database ===")
    # Check dependencies
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")
    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False
    # Test data loading
    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False
    # Test vector store
    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store:
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False
    # Test index building (small test)
    print("\nTesting index building...")
    try:
        # Use just a few personas for testing
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)
        vector_store = setup_vector_store()
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
        # BUG FIX: the Chroma store must go through a StorageContext;
        # from_documents() silently ignores a bare `vector_store=` kwarg.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            test_docs,
            storage_context=storage_context,
            embed_model=embed_model,
        )
        print("✅ Index building works")
        # Test a simple query (result unused; we only care that it runs)
        query_engine = index.as_query_engine(similarity_top_k=1)
        query_engine.query("software developer")
        print("✅ Query test passed")
        return True
    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False
if __name__ == "__main__":
    # Run the self-test with INFO logging so progress messages are visible.
    # (Redundant nested `import logging` removed — the module already
    # imports logging at the top of the file.)
    logging.basicConfig(level=logging.INFO)
    print("Testing My Persona Database System")
    print("=" * 40)
    success = test_my_personas()
    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")
    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")