File size: 11,705 Bytes
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
 
 
 
e01c471
8a7b3d1
 
e01c471
8a7b3d1
e01c471
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
e01c471
8a7b3d1
 
e01c471
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
 
e01c471
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
8a7b3d1
 
 
e01c471
 
 
8a7b3d1
e01c471
 
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
 
 
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
 
8a7b3d1
 
e01c471
8a7b3d1
 
 
e01c471
 
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
8a7b3d1
e01c471
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
8a7b3d1
 
e01c471
 
8a7b3d1
 
e01c471
 
 
 
 
 
 
 
 
8a7b3d1
e01c471
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
"""
My Persona Database - RAG Implementation

This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together

The goal is to have a database I can query like "find me creative people" 
and get back actual persona descriptions.

Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""

import logging
import os
from typing import List, Optional
from pathlib import Path

# Core LlamaIndex stuff
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# For embeddings and vector storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# External stuff
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True
except ImportError:
    CAN_LOAD_DATASETS = False

try:
    import chromadb
    CHROMADB_WORKS = True
except ImportError:
    CHROMADB_WORKS = False

logger = logging.getLogger(__name__)

# My settings
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"
MAX_PERSONAS = 300  # Keep it small for HF Spaces
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # This one works well
CHUNK_SIZE = 400  # Smaller chunks work better

# Cache so I don't rebuild this every time
_my_persona_index = None

def make_sample_personas():
    """
    Return a fixed list of fallback persona descriptions.

    Used whenever the real HuggingFace dataset cannot be downloaded, so the
    rest of the pipeline always has something to index and query.
    """
    fallback = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",

        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",

        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",

        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",

        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",

        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",

        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",

        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",

        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",

        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby."
    ]

    logger.info(f"Created {len(fallback)} backup personas")
    return fallback

def download_personas():
    """
    Fetch persona descriptions from the HuggingFace dataset.

    Streams up to MAX_PERSONAS records from PERSONA_DATASET and returns
    them as prefixed strings. Falls back to make_sample_personas() when
    the datasets library is missing or the download fails, so callers
    always get a non-empty list of strings.
    """
    logger.info("Trying to download persona dataset...")

    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()

    try:
        # Streaming avoids materializing the whole dataset in memory
        # (important for the HF Spaces memory budget).
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)

        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # Don't go over my limit
                break

            # A record may carry an explicit None for "persona"; `.get`'s
            # default only covers a *missing* key, so use `or ""` as well.
            # Otherwise .strip() raises and the broad except below silently
            # swaps in the backup data, masking a real dataset issue.
            persona_text = item.get("persona") or ""
            if persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")

            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")

        logger.info(f"Got {len(personas)} personas from HuggingFace!")
        return personas

    except Exception as e:
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()

def make_documents(personas):
    """
    Wrap each persona string in a LlamaIndex Document.

    The metadata records each persona's position in the input list so a
    search hit can be traced back to its source entry.
    """
    logger.info(f"Making documents from {len(personas)} personas...")

    docs = [
        Document(
            text=persona_text,
            metadata={
                "source": f"persona_{i}",
                "persona_id": i,
                "type": "persona_description",
            },
        )
        for i, persona_text in enumerate(personas)
    ]

    logger.info(f"Created {len(docs)} documents")
    return docs

def setup_vector_store():
    """
    Create an in-memory ChromaDB-backed vector store.

    Returns a ChromaVectorStore, or None when chromadb is unavailable or
    setup raises. In-memory mode means no files on disk, which keeps
    HuggingFace Spaces happy.
    """
    if not CHROMADB_WORKS:
        logger.error("ChromaDB not available!")
        return None

    try:
        logger.info("Setting up in-memory vector store...")

        # Ephemeral client: everything lives in RAM, nothing is persisted.
        chroma_client = chromadb.Client()
        chroma_collection = chroma_client.get_or_create_collection("my_personas")

        # LlamaIndex adapter around the raw Chroma collection
        store = ChromaVectorStore(chroma_collection=chroma_collection)

        logger.info("Vector store ready!")
        return store

    except Exception as e:
        logger.error(f"Vector store setup failed: {e}")
        return None

def build_persona_index():
    """
    Build the persona vector index from scratch.

    Pipeline: download personas -> wrap in Documents -> create the Chroma
    vector store -> load the embedding model -> index everything.

    Returns the VectorStoreIndex, or None when any stage fails (every
    failure is logged rather than raised).
    """
    logger.info("Building persona index...")

    try:
        # Step 1: Get the persona data
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None

        # Step 2: Make documents
        documents = make_documents(personas)

        # Step 3: Set up vector storage
        vector_store = setup_vector_store()
        if not vector_store:
            logger.error("Can't create vector store")
            return None

        # Step 4: Set up embeddings
        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None

        # Step 5: Build the index.
        # from_documents() routes embeddings into an external store only via
        # a StorageContext; a bare `vector_store=` kwarg is not part of its
        # signature, so the Chroma store was previously being bypassed in
        # favor of the default in-memory SimpleVectorStore.
        logger.info("Creating vector index... this might take a moment")

        storage_context = StorageContext.from_defaults(vector_store=vector_store)
        index = VectorStoreIndex.from_documents(
            documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True
        )

        logger.info("Persona index built successfully!")
        return index

    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None

def get_persona_index():
    """
    Return the persona index, building it lazily on first use.

    Caches the built index in a module-level global; a failed build leaves
    the cache empty so the next call retries.
    """
    global _my_persona_index

    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index

    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index

def get_persona_query_engine(llm=None):
    """
    Build a query engine over the persona index.

    Args:
        llm: optional LLM to answer with (passed straight through to
            ``as_query_engine``).

    Returns:
        The query engine, or None when the index is unavailable or
        engine construction fails. This is the entry point the agent
        tools call.
    """
    try:
        index = get_persona_index()
        if index is None:
            logger.warning("No persona index available")
            return None

        engine = index.as_query_engine(
            streaming=False,
            similarity_top_k=3,  # top 3 nearest personas
            response_mode="tree_summarize",  # merges several hits into one answer
            llm=llm,  # reuse the agent's LLM
        )

        logger.info("Persona query engine ready")
        return engine

    except Exception as e:
        logger.error(f"Query engine creation failed: {e}")
        return None

def test_my_personas():
    """
    Smoke-test the persona system end to end.

    Checks dependencies, data loading, vector-store creation, index
    building on a tiny sample, and one query. Prints progress and returns
    True on success, False on the first failure.
    """
    print("\n=== Testing My Persona Database ===")

    # Check dependencies
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")

    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False

    # Test data loading
    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False

    # Test vector store
    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store:
            print("✅ Vector store created")
        else:
            print("❌ Vector store failed")
            return False
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False

    # Test index building (small test)
    print("\nTesting index building...")
    try:
        # Use just a few personas for testing
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)

        vector_store = setup_vector_store()
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

        # from_documents() only writes into an external store via a
        # StorageContext; the previous bare `vector_store=` kwarg is not in
        # its signature, so the test wasn't exercising the Chroma path.
        index = VectorStoreIndex.from_documents(
            documents=test_docs,
            storage_context=StorageContext.from_defaults(vector_store=vector_store),
            embed_model=embed_model
        )

        print("✅ Index building works")

        # Test a simple query (only care that it doesn't raise)
        query_engine = index.as_query_engine(similarity_top_k=1)
        query_engine.query("software developer")
        print("✅ Query test passed")

        return True

    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False

if __name__ == "__main__":
    # Manual smoke test entry point: runs the full persona pipeline once.
    # `logging` is already imported at module scope, so the redundant
    # function-local `import logging` that used to live here is gone.
    logging.basicConfig(level=logging.INFO)

    print("Testing My Persona Database System")
    print("=" * 40)

    success = test_my_personas()

    if success:
        print("\n✅ Persona database is working!")
    else:
        print("\n❌ Persona database has issues")

    print("\nThis system is optimized for HuggingFace Spaces:")
    print("- Uses in-memory storage (no files)")
    print("- Limited personas (saves memory)")
    print("- Fallback data (works offline)")
    print("- Fast startup (cached building)")