|
|
""" |
|
|
My Persona Database - RAG Implementation |
|
|
|
|
|
This is where I build my persona database using what I learned about RAG. |
|
|
I'm using: |
|
|
- HuggingFace dataset with persona descriptions |
|
|
- ChromaDB for vector storage (learned this is good for small projects) |
|
|
- Embeddings to find similar personas |
|
|
- LlamaIndex to tie it all together |
|
|
|
|
|
The goal is to have a database I can query like "find me creative people" |
|
|
and get back actual persona descriptions. |
|
|
|
|
|
Note: I made this work in HuggingFace Spaces by keeping everything in memory |
|
|
and using a smaller dataset so it doesn't crash. |
|
|
""" |
|
|
|
|
|
import logging
import os
from pathlib import Path
from typing import List, Optional

from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore
|
|
|
|
|
|
|
|
try: |
|
|
from datasets import load_dataset |
|
|
CAN_LOAD_DATASETS = True |
|
|
except ImportError: |
|
|
CAN_LOAD_DATASETS = False |
|
|
|
|
|
try: |
|
|
import chromadb |
|
|
CHROMADB_WORKS = True |
|
|
except ImportError: |
|
|
CHROMADB_WORKS = False |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny" |
|
|
MAX_PERSONAS = 300 |
|
|
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5" |
|
|
CHUNK_SIZE = 400 |
|
|
|
|
|
|
|
|
_my_persona_index = None |
|
|
|
|
|
def make_sample_personas():
    """
    Return a small hard-coded list of persona descriptions.

    Serves as an offline fallback so the agent still has data to search
    when the HuggingFace dataset cannot be downloaded.
    """
    fallback = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",
        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",
        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",
        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",
        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",
        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",
        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",
        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",
        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",
        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby."
    ]

    logger.info(f"Created {len(fallback)} backup personas")
    return fallback
|
|
|
|
|
def download_personas():
    """
    Fetch persona descriptions, falling back to built-in samples.

    Streams up to MAX_PERSONAS rows from the PERSONA_DATASET HuggingFace
    dataset. When the `datasets` library is missing, the download fails,
    or the dataset yields no usable text, the hard-coded sample personas
    are returned instead so callers always get a non-empty list.

    Returns:
        list[str]: persona description strings.
    """
    logger.info("Trying to download persona dataset...")

    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()

    try:
        # streaming=True avoids materializing the whole dataset in memory
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)

        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:
                break

            # `or ""` also guards against an explicit None stored under the key,
            # which `.get("persona", "")` would pass through and crash .strip()
            persona_text = item.get("persona") or ""
            if persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")

            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")

        # Bug fix: previously an empty result was returned as-is, which made
        # downstream index building fail; fall back to the samples instead.
        if not personas:
            logger.warning("Dataset contained no usable personas, using backups")
            return make_sample_personas()

        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas

    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()
|
|
|
|
|
def make_documents(personas):
    """
    Wrap raw persona strings in LlamaIndex Document objects.

    Each document carries metadata identifying which persona it came
    from, so query hits can be traced back to their source entry.
    """
    logger.info(f"Making documents from {len(personas)} personas...")

    docs = [
        Document(
            text=persona_text,
            metadata={
                "source": f"persona_{idx}",
                "persona_id": idx,
                "type": "persona_description"
            }
        )
        for idx, persona_text in enumerate(personas)
    ]

    logger.info(f"Created {len(docs)} documents")
    return docs
|
|
|
|
|
def setup_vector_store():
    """
    Create an in-memory ChromaDB-backed vector store.

    An in-memory (non-persistent) client keeps the app compatible with
    HuggingFace Spaces, where writable disk is limited.

    Returns:
        ChromaVectorStore on success; None when chromadb is missing or
        the setup fails.
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None

    try:
        logger.info("Setting up in-memory vector store...")
        chroma_client = chromadb.Client()
        persona_collection = chroma_client.get_or_create_collection("my_personas")
        store = ChromaVectorStore(chroma_collection=persona_collection)
        logger.info("Vector store ready!")
        return store
    except Exception as e:
        logger.error(f"Vector store setup failed: {e}")
        return None
|
|
|
|
|
def build_persona_index():
    """
    Build the persona vector index from scratch.

    Pipeline: download personas -> wrap them in Documents -> create the
    Chroma vector store -> embed and index the documents. May take a
    minute on first run (embedding model download + encoding).

    Returns:
        VectorStoreIndex on success; None when any stage fails.
    """
    logger.info("Building persona index...")

    try:
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None

        documents = make_documents(personas)

        vector_store = setup_vector_store()
        if not vector_store:
            logger.error("Can't create vector store")
            return None

        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None

        logger.info("Creating vector index... this might take a moment")

        # Bug fix: `from_documents` has no `vector_store` parameter, so the
        # Chroma store was never actually wired in (the kwarg is rejected or
        # ignored). The store must be attached through a StorageContext.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True
        )

        logger.info("Persona index built successfully!")
        return index

    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None
|
|
|
|
|
def get_persona_index():
    """
    Return the shared persona index, building it on first use.

    The result is memoized in the module-level `_my_persona_index`, so
    repeated calls are cheap after the first build.
    """
    global _my_persona_index

    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index

    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index
|
|
|
|
|
def get_persona_query_engine(llm=None):
    """
    Build a query engine over the persona index (the tool entry point).

    Args:
        llm: optional LLM used to synthesize answers; None lets
            LlamaIndex fall back to its configured default.

    Returns:
        A query engine, or None when the index is unavailable or the
        engine cannot be created.
    """
    try:
        persona_index = get_persona_index()
        if persona_index is None:
            logger.warning("No persona index available")
            return None

        engine = persona_index.as_query_engine(
            llm=llm,
            response_mode="tree_summarize",
            similarity_top_k=3,
            streaming=False
        )

        logger.info("Persona query engine ready")
        return engine

    except Exception as e:
        logger.error(f"Query engine creation failed: {e}")
        return None
|
|
|
|
|
def test_my_personas():
    """
    Smoke-test the persona pipeline end to end.

    Checks dependency availability, persona loading, vector store setup,
    and a tiny index build + query. Prints progress and returns True when
    everything works, False otherwise.
    """
    print("\n=== Testing My Persona Database ===")

    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")

    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False

    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False

    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store:
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False

    print("\nTesting index building...")
    try:
        # Only 3 sample personas so the embedding step stays fast.
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)

        vector_store = setup_vector_store()
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

        # Bug fix: `from_documents` does not accept a `vector_store` kwarg;
        # the store has to be attached via a StorageContext.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            test_docs,
            storage_context=storage_context,
            embed_model=embed_model
        )

        print("✅ Index building works")

        query_engine = index.as_query_engine(similarity_top_k=1)
        query_engine.query("software developer")
        print("✅ Query test passed")

        return True

    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False
|
|
|
|
|
if __name__ == "__main__":
    # Fix: removed a redundant `import logging` — the module already
    # imports it at the top of the file.
    logging.basicConfig(level=logging.INFO)

    print("Testing My Persona Database System")
    print("=" * 40)

    success = test_my_personas()

    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")

    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")