""" My Persona Database - RAG Implementation This is where I build my persona database using what I learned about RAG. I'm using: - HuggingFace dataset with persona descriptions - ChromaDB for vector storage (learned this is good for small projects) - Embeddings to find similar personas - LlamaIndex to tie it all together The goal is to have a database I can query like "find me creative people" and get back actual persona descriptions. Note: I made this work in HuggingFace Spaces by keeping everything in memory and using a smaller dataset so it doesn't crash. """ import logging import os from typing import List, Optional from pathlib import Path # Core LlamaIndex stuff from llama_index.core.schema import Document from llama_index.core import VectorStoreIndex, SimpleDirectoryReader from llama_index.core.node_parser import SentenceSplitter # For embeddings and vector storage from llama_index.embeddings.huggingface import HuggingFaceEmbedding from llama_index.vector_stores.chroma import ChromaVectorStore # External stuff try: from datasets import load_dataset CAN_LOAD_DATASETS = True except ImportError: CAN_LOAD_DATASETS = False try: import chromadb CHROMADB_WORKS = True except ImportError: CHROMADB_WORKS = False logger = logging.getLogger(__name__) # My settings PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny" MAX_PERSONAS = 300 # Keep it small for HF Spaces EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" # This one works well CHUNK_SIZE = 400 # Smaller chunks work better # Cache so I don't rebuild this every time _my_persona_index = None def make_sample_personas(): """ Backup personas in case I can't download the real dataset These are just examples but at least my agent will work """ samples = [ "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.", "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.", "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.", "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.", "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.", "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.", "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.", "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.", "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.", "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby." ] logger.info(f"Created {len(samples)} backup personas") return samples def download_personas(): """ Try to get the real persona dataset from HuggingFace If that fails, use my backup personas """ logger.info("Trying to download persona dataset...") if not CAN_LOAD_DATASETS: logger.warning("Can't load datasets library, using backups") return make_sample_personas() try: # Load the dataset (streaming to save memory) dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True) personas = [] for i, item in enumerate(dataset): if i >= MAX_PERSONAS: # Don't go over my limit break persona_text = item.get("persona", "") if persona_text.strip(): personas.append(f"Person {i+1}: {persona_text}") if (i + 1) % 50 == 0: logger.info(f"Downloaded {i+1} personas...") logger.info(f"Got {len(personas)} personas from HuggingFace!") return personas except Exception as e: logger.warning(f"Download failed: {e}, using backups") return make_sample_personas() def make_documents(personas): """ Turn my persona strings into LlamaIndex documents """ logger.info(f"Making documents from {len(personas)} personas...") docs = [] for i, persona_text in enumerate(personas): doc = Document( text=persona_text, metadata={ "source": f"persona_{i}", "persona_id": i, "type": "persona_description" } ) docs.append(doc) logger.info(f"Created {len(docs)} documents") return docs def setup_vector_store(): """ Set up ChromaDB for storing my vectors Using in-memory so it works in HuggingFace Spaces """ if not CHROMADB_WORKS: logger.error("ChromaDB not available!") return None try: logger.info("Setting up in-memory vector store...") # In-memory client (no files to worry about) client = chromadb.Client() collection = client.get_or_create_collection("my_personas") # Wrap it for LlamaIndex vector_store = ChromaVectorStore(chroma_collection=collection) logger.info("Vector store ready!") return vector_store except Exception as e: logger.error(f"Vector store setup failed: {e}") return None def build_persona_index(): """ Build my persona index from scratch This might take a minute the first time """ logger.info("Building persona index...") try: # Step 1: Get the persona data personas = download_personas() if not personas: logger.error("No persona data available") return None # Step 2: Make documents documents = make_documents(personas) # Step 3: Set up vector storage vector_store = setup_vector_store() if not vector_store: logger.error("Can't create vector store") return None # Step 4: Set up embeddings try: embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL) logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}") except Exception as e: logger.error(f"Can't load embeddings: {e}") return None # Step 5: Build the index logger.info("Creating vector index... this might take a moment") index = VectorStoreIndex.from_documents( documents=documents, vector_store=vector_store, embed_model=embed_model, show_progress=True ) logger.info("Persona index built successfully!") return index except Exception as e: logger.error(f"Index building failed: {e}") return None def get_persona_index(): """ Get my persona index (builds it if needed, caches it if possible) """ global _my_persona_index if _my_persona_index is None: logger.info("Building persona index for the first time...") _my_persona_index = build_persona_index() else: logger.info("Using cached persona index") return _my_persona_index def get_persona_query_engine(llm=None): """ Get a query engine I can use to search my personas This is what gets called from my tools """ try: index = get_persona_index() if index is None: logger.warning("No persona index available") return None # Make the query engine query_engine = index.as_query_engine( llm=llm, # Use the LLM from my agent response_mode="tree_summarize", # Good for combining multiple results similarity_top_k=3, # Get top 3 matches streaming=False ) logger.info("Persona query engine ready") return query_engine except Exception as e: logger.error(f"Query engine creation failed: {e}") return None def test_my_personas(): """ Test that my persona system works """ print("\n=== Testing My Persona Database ===") # Check dependencies print(f"Datasets available: {CAN_LOAD_DATASETS}") print(f"ChromaDB available: {CHROMADB_WORKS}") if not CHROMADB_WORKS: print("❌ ChromaDB missing - persona database won't work") return False # Test data loading print("\nTesting persona loading...") try: personas = download_personas() print(f"✅ Got {len(personas)} personas") if personas: print(f"Sample: {personas[0][:100]}...") except Exception as e: print(f"❌ Persona loading failed: {e}") return False # Test vector store print("\nTesting vector store...") try: vector_store = setup_vector_store() if vector_store: print("✅ Vector store created") else: print("❌ Vector store failed") return False except Exception as e: print(f"❌ Vector store error: {e}") return False # Test index building (small test) print("\nTesting index building...") try: # Use just a few personas for testing test_personas = make_sample_personas()[:3] test_docs = make_documents(test_personas) vector_store = setup_vector_store() embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL) index = VectorStoreIndex.from_documents( documents=test_docs, vector_store=vector_store, embed_model=embed_model ) print("✅ Index building works") # Test a simple query query_engine = index.as_query_engine(similarity_top_k=1) results = query_engine.query("software developer") print("✅ Query test passed") return True except Exception as e: print(f"❌ Index test failed: {e}") return False if __name__ == "__main__": # Test my persona system import logging logging.basicConfig(level=logging.INFO) print("Testing My Persona Database System") print("=" * 40) success = test_my_personas() if success: print("\n✅ Persona database is working!") else: print("\n❌ Persona database has issues") print("\nThis system is optimized for HuggingFace Spaces:") print("- Uses in-memory storage (no files)") print("- Limited personas (saves memory)") print("- Fallback data (works offline)") print("- Fast startup (cached building)")