File size: 11,705 Bytes
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
 
 
 
e01c471
8a7b3d1
 
e01c471
8a7b3d1
e01c471
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
 
e01c471
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
 
e01c471
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
8a7b3d1
 
 
e01c471
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
8a7b3d1
 
e01c471
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
"""
My Persona Database - RAG Implementation

This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together

The goal is to have a database I can query like "find me creative people" 
and get back actual persona descriptions.

Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""

import logging
import os
from typing import List, Optional
from pathlib import Path

# Core LlamaIndex stuff
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# For embeddings and vector storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# External stuff
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True
except ImportError:
    CAN_LOAD_DATASETS = False

try:
    import chromadb
    CHROMADB_WORKS = True
except ImportError:
    CHROMADB_WORKS = False

logger = logging.getLogger(__name__)

# My settings
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"
MAX_PERSONAS = 300  # Keep it small for HF Spaces
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # This one works well
CHUNK_SIZE = 400  # Smaller chunks work better

# Cache so I don't rebuild this every time
_my_persona_index = None

def make_sample_personas():
    """
    Return a fixed list of fallback persona descriptions.

    Used whenever the real HuggingFace dataset cannot be downloaded, so the
    rest of the pipeline always has something to index and query.
    """
    fallback = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",

        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",

        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",

        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",

        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",

        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",

        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",

        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",

        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",

        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby."
    ]

    logger.info(f"Created {len(fallback)} backup personas")
    return fallback

def download_personas():
    """
    Fetch persona descriptions from the HuggingFace dataset.

    Streams up to MAX_PERSONAS records from PERSONA_DATASET and returns
    them as prefixed strings. Falls back to make_sample_personas() when
    the datasets library is missing or the download fails, so callers
    always get a non-empty list of strings.
    """
    logger.info("Trying to download persona dataset...")

    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()

    try:
        # Streaming avoids materializing the whole dataset in memory
        # (important for the HF Spaces memory budget).
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)

        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # Don't go over my limit
                break

            # A record may carry an explicit None for "persona"; `.get`'s
            # default only covers a *missing* key, so use `or ""` as well.
            # Otherwise .strip() raises and the broad except below silently
            # swaps in the backup data, masking a real dataset issue.
            persona_text = item.get("persona") or ""
            if persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")

            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")

        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas

    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()

def make_documents(personas):
    """
    Wrap each persona string in a LlamaIndex Document.

    The metadata records each persona's position in the input list so a
    search hit can be traced back to its source entry.
    """
    logger.info(f"Making documents from {len(personas)} personas...")

    docs = [
        Document(
            text=persona_text,
            metadata={
                "source": f"persona_{i}",
                "persona_id": i,
                "type": "persona_description",
            },
        )
        for i, persona_text in enumerate(personas)
    ]

    logger.info(f"Created {len(docs)} documents")
    return docs

def setup_vector_store():
    """
    Create an in-memory ChromaDB-backed vector store.

    Returns a ChromaVectorStore, or None when chromadb is unavailable or
    setup raises. In-memory mode means no files on disk, which keeps
    HuggingFace Spaces happy.
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None

    try:
        logger.info("Setting up in-memory vector store...")

        # Ephemeral client: everything lives in RAM, nothing is persisted.
        chroma_client = chromadb.Client()
        chroma_collection = chroma_client.get_or_create_collection("my_personas")

        # LlamaIndex adapter around the raw Chroma collection
        store = ChromaVectorStore(chroma_collection=chroma_collection)

        logger.info("Vector store ready!")
        return store

    except Exception as e:
        logger.error(f"Vector store setup failed: {e}")
        return None

def build_persona_index():
    """
    Build the persona vector index from scratch.

    Pipeline: download personas -> wrap in Documents -> create the Chroma
    vector store -> load the embedding model -> index everything.

    Returns the VectorStoreIndex, or None when any stage fails (every
    failure is logged rather than raised).
    """
    logger.info("Building persona index...")

    try:
        # Step 1: Get the persona data
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None

        # Step 2: Make documents
        documents = make_documents(personas)

        # Step 3: Set up vector storage
        vector_store = setup_vector_store()
        if not vector_store:
            logger.error("Can't create vector store")
            return None

        # Step 4: Set up embeddings
        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None

        # Step 5: Build the index.
        # from_documents() routes embeddings into an external store only via
        # a StorageContext; a bare `vector_store=` kwarg is not part of its
        # signature, so the Chroma store was previously being bypassed in
        # favor of the default in-memory SimpleVectorStore.
        logger.info("Creating vector index... this might take a moment")

        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True
        )

        logger.info("Persona index built successfully!")
        return index

    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None

def get_persona_index():
    """
    Return the persona index, building it lazily on first use.

    Caches the built index in a module-level global; a failed build leaves
    the cache empty so the next call retries.
    """
    global _my_persona_index

    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index

    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index

def get_persona_query_engine(llm=None):
    """
    Build a query engine over the persona index.

    Args:
        llm: optional LLM to answer with (passed straight through to
            ``as_query_engine``).

    Returns:
        The query engine, or None when the index is unavailable or
        engine construction fails. This is the entry point the agent
        tools call.
    """
    try:
        index = get_persona_index()
        if index is None:
            logger.warning("No persona index available")
            return None

        engine = index.as_query_engine(
            streaming=False,
            similarity_top_k=3,  # top 3 nearest personas
            response_mode="tree_summarize",  # merges several hits into one answer
            llm=llm,  # reuse the agent's LLM
        )

        logger.info("Persona query engine ready")
        return engine

    except Exception as e:
        logger.error(f"Query engine creation failed: {e}")
        return None

def test_my_personas():
    """
    Smoke-test the persona system end to end.

    Checks dependencies, data loading, vector-store creation, index
    building on a tiny sample, and one query. Prints progress and returns
    True on success, False on the first failure.
    """
    print("\n=== Testing My Persona Database ===")

    # Check dependencies
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")

    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False

    # Test data loading
    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False

    # Test vector store
    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store:
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False

    # Test index building (small test)
    print("\nTesting index building...")
    try:
        # Use just a few personas for testing
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)

        vector_store = setup_vector_store()
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

        # from_documents() only writes into an external store via a
        # StorageContext; the previous bare `vector_store=` kwarg is not in
        # its signature, so the test wasn't exercising the Chroma path.
        index = VectorStoreIndex.from_documents(
            documents=test_docs,
            storage_context=StorageContext.from_defaults(vector_store=vector_store),
            embed_model=embed_model
        )

        print("✅ Index building works")

        # Test a simple query (only care that it doesn't raise)
        query_engine = index.as_query_engine(similarity_top_k=1)
        query_engine.query("software developer")
        print("✅ Query test passed")

        return True

    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False

if __name__ == "__main__":
    # Manual smoke test entry point: runs the full persona pipeline once.
    # `logging` is already imported at module scope, so the redundant
    # function-local `import logging` that used to live here is gone.
    logging.basicConfig(level=logging.INFO)

    print("Testing My Persona Database System")
    print("=" * 40)

    success = test_my_personas()

    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")

    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")