"""
My Persona Database - RAG Implementation
This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together
The goal is to have a database I can query like "find me creative people"
and get back actual persona descriptions.
Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""
import logging
import os
from pathlib import Path
from typing import List, Optional

# Core LlamaIndex pieces
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# Embeddings and vector storage backends
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# Optional third-party dependencies: record availability instead of failing
# at import time, so the rest of the module can fall back gracefully.
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True
except ImportError:
    CAN_LOAD_DATASETS = False

try:
    import chromadb
    CHROMADB_WORKS = True
except ImportError:
    CHROMADB_WORKS = False

logger = logging.getLogger(__name__)

# Tunables (kept small so everything fits in HuggingFace Spaces memory)
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"
MAX_PERSONAS = 300  # cap on downloaded personas
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # compact embedding model
CHUNK_SIZE = 400  # smaller chunks suit short persona texts

# Module-level cache so the index is built at most once per process
_my_persona_index = None
def make_sample_personas():
    """
    Return the hard-coded fallback persona descriptions.

    Used whenever the HuggingFace dataset can't be fetched (offline,
    missing `datasets` library, download error) so the agent always has
    something to search against.

    Returns:
        list[str]: ten first-person persona description strings.
    """
    backup_personas = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",
        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",
        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",
        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",
        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",
        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",
        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",
        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",
        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",
        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby.",
    ]
    logger.info(f"Created {len(backup_personas)} backup personas")
    return backup_personas
def download_personas():
    """
    Fetch persona descriptions, preferring the HuggingFace dataset.

    Streams up to MAX_PERSONAS records from PERSONA_DATASET (streaming mode
    keeps memory low for HF Spaces). On any failure — `datasets` library
    missing, network error, malformed records — falls back to the built-in
    sample personas so callers always get usable data.

    Returns:
        list[str]: persona descriptions, each prefixed "Person N: ...".
    """
    logger.info("Trying to download persona dataset...")
    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()
    try:
        # Streaming avoids materializing the whole dataset in memory.
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)
        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # respect the memory cap
                break
            # Guard against records where "persona" exists but is None:
            # dict.get's default only applies when the key is *absent*,
            # so the previous `item.get("persona", "")` could still hand
            # None to .strip() and crash.
            persona_text = item.get("persona") or ""
            if persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")
            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")
        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas
    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()
def make_documents(personas):
    """
    Wrap persona strings in LlamaIndex Document objects.

    Each document carries metadata (source tag, numeric id, type) so
    results can be traced back to the originating persona.

    Args:
        personas: list of persona description strings.

    Returns:
        list[Document]: one document per persona, in input order.
    """
    logger.info(f"Making documents from {len(personas)} personas...")
    docs = [
        Document(
            text=description,
            metadata={
                "source": f"persona_{idx}",
                "persona_id": idx,
                "type": "persona_description",
            },
        )
        for idx, description in enumerate(personas)
    ]
    logger.info(f"Created {len(docs)} documents")
    return docs
def setup_vector_store():
    """
    Create an in-memory ChromaDB-backed vector store.

    Everything stays in memory — no files — which is what keeps this
    working inside HuggingFace Spaces.

    Returns:
        ChromaVectorStore on success, None if chromadb is unavailable
        or setup fails for any reason.
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None
    try:
        logger.info("Setting up in-memory vector store...")
        # Ephemeral client: nothing touches the filesystem.
        chroma_client = chromadb.Client()
        chroma_collection = chroma_client.get_or_create_collection("my_personas")
        # Adapter so LlamaIndex can talk to the Chroma collection.
        store = ChromaVectorStore(chroma_collection=chroma_collection)
        logger.info("Vector store ready!")
        return store
    except Exception as e:
        logger.error(f"Vector store setup failed: {e}")
        return None
def build_persona_index():
    """
    Build the persona vector index end to end.

    Pipeline: fetch persona texts -> wrap them as Documents -> create the
    Chroma vector store -> load the embedding model -> index everything.

    Returns:
        VectorStoreIndex on success, None if any stage fails (each
        failure is logged rather than raised).
    """
    logger.info("Building persona index...")
    try:
        # Stage 1: persona data (downloaded or fallback)
        persona_texts = download_personas()
        if not persona_texts:
            logger.error("No persona data available")
            return None

        # Stage 2: LlamaIndex documents
        docs = make_documents(persona_texts)

        # Stage 3: vector storage
        store = setup_vector_store()
        if not store:
            logger.error("Can't create vector store")
            return None

        # Stage 4: embedding model (can fail on download / memory)
        try:
            embedder = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None

        # Stage 5: the actual index build (embeds every document)
        logger.info("Creating vector index... this might take a moment")
        index = VectorStoreIndex.from_documents(
            documents=docs,
            vector_store=store,
            embed_model=embedder,
            show_progress=True,
        )
        logger.info("Persona index built successfully!")
        return index
    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None
def get_persona_index():
    """
    Return the persona index, building it on first use.

    The result is cached in the module-level _my_persona_index so
    repeated calls don't rebuild; a failed build (None) is retried
    on the next call.
    """
    global _my_persona_index
    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index
    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index
def get_persona_query_engine(llm=None):
    """
    Build a query engine over the persona index for the agent's tools.

    Args:
        llm: optional LLM for response synthesis; None lets LlamaIndex
            use its default.

    Returns:
        A query engine, or None if the index can't be obtained or
        engine creation fails.
    """
    try:
        persona_index = get_persona_index()
        if persona_index is None:
            logger.warning("No persona index available")
            return None
        engine = persona_index.as_query_engine(
            llm=llm,
            response_mode="tree_summarize",  # merges evidence from several hits
            similarity_top_k=3,  # top 3 closest personas
            streaming=False,
        )
        logger.info("Persona query engine ready")
        return engine
    except Exception as e:
        logger.error(f"Query engine creation failed: {e}")
        return None
def test_my_personas():
    """
    Smoke-test the whole persona pipeline from the command line.

    Checks, in order: optional dependencies, persona loading, vector
    store creation, and a tiny end-to-end index build plus one query.

    Returns:
        bool: True when every stage works, False at the first failure.
    """
    print("\n=== Testing My Persona Database ===")

    # Stage 0: dependency report
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")
    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False

    # Stage 1: persona data
    print("\nTesting persona loading...")
    try:
        loaded = download_personas()
        print(f"✅ Got {len(loaded)} personas")
        if loaded:
            print(f"Sample: {loaded[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False

    # Stage 2: vector store
    print("\nTesting vector store...")
    try:
        if setup_vector_store():
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False

    # Stage 3: tiny end-to-end index build and query
    print("\nTesting index building...")
    try:
        small_set = make_sample_personas()[:3]  # 3 personas keeps this fast
        small_docs = make_documents(small_set)
        fresh_store = setup_vector_store()
        embedder = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
        test_index = VectorStoreIndex.from_documents(
            documents=small_docs,
            vector_store=fresh_store,
            embed_model=embedder,
        )
        print("✅ Index building works")
        engine = test_index.as_query_engine(similarity_top_k=1)
        engine.query("software developer")
        print("✅ Query test passed")
        return True
    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False
if __name__ == "__main__":
    # Manual smoke-test entry point. The redundant local `import logging`
    # was removed: the module already imports logging at the top.
    logging.basicConfig(level=logging.INFO)
    print("Testing My Persona Database System")
    print("=" * 40)
    success = test_my_personas()
    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")
    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")