"""
My Persona Database - RAG Implementation

This is where I build my persona database using what I learned about RAG.
I'm using:
- HuggingFace dataset with persona descriptions
- ChromaDB for vector storage (learned this is good for small projects)
- Embeddings to find similar personas
- LlamaIndex to tie it all together

The goal is to have a database I can query like "find me creative people" 
and get back actual persona descriptions.

Note: I made this work in HuggingFace Spaces by keeping everything in memory
and using a smaller dataset so it doesn't crash.
"""

import logging
import os
from pathlib import Path
from typing import List, Optional

# Core LlamaIndex stuff
from llama_index.core import SimpleDirectoryReader, StorageContext, VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import Document

# For embeddings and vector storage
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.vector_stores.chroma import ChromaVectorStore

# External stuff
# Optional dependency: if `datasets` is not installed, download_personas()
# skips the download and returns the built-in backup personas instead.
try:
    from datasets import load_dataset
    CAN_LOAD_DATASETS = True
except ImportError:
    CAN_LOAD_DATASETS = False

# Optional dependency: if `chromadb` is not installed, setup_vector_store()
# logs an error and returns None, so no index can be built.
try:
    import chromadb
    CHROMADB_WORKS = True
except ImportError:
    CHROMADB_WORKS = False

# Module-level logger; handlers/levels are left to the application
logger = logging.getLogger(__name__)

# My settings
# Dataset id handed to load_dataset() in download_personas()
PERSONA_DATASET = "dvilasuero/finepersonas-v0.1-tiny"
# Upper bound on how many dataset records download_personas() reads
MAX_PERSONAS = 300  # Keep it small for HF Spaces
# Model name passed to HuggingFaceEmbedding when building the index
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # This one works well
# NOTE(review): CHUNK_SIZE is not referenced by any function visible in this
# file - confirm whether it was meant to configure a node parser.
CHUNK_SIZE = 400  # Smaller chunks work better

# Cache so I don't rebuild this every time
# Filled by get_persona_index() on first use; stays None if the build fails
_my_persona_index = None

def make_sample_personas():
    """
    Return the built-in backup personas.

    These hand-written examples are what the rest of the module falls back
    to when the real dataset can't be downloaded, so there is always
    something to index.

    Returns:
        A new list of ten first-person persona descriptions.
    """
    backup_personas = [
        "I'm a 28-year-old software developer from Seattle. I love hiking on weekends, coding in Python, and playing indie video games. I work at a tech startup and dream of building my own app someday.",
        "I'm a 35-year-old high school teacher in Boston. I teach English literature and spend my free time writing poetry. I volunteer at the local animal shelter and love mystery novels.",
        "I'm a 42-year-old chef who owns a small Italian restaurant in Chicago. I learned to cook from my grandmother and love experimenting with fusion cuisine. I teach cooking classes on Sundays.",
        "I'm a 24-year-old graphic designer in Los Angeles. I freelance for indie game studios and love creating digital art. My hobbies include skateboarding and visiting coffee shops for inspiration.",
        "I'm a 39-year-old veterinarian in Denver. I specialize in wildlife rehabilitation and spend weekends hiking in the mountains. I volunteer at the local zoo and love photography.",
        "I'm a 31-year-old journalist in New York covering tech trends. I write a weekly newsletter about AI and automation. I practice yoga daily and love exploring the city's food scene.",
        "I'm a 45-year-old musician who plays guitar in a blues band. I teach music lessons during the day and perform at local venues on weekends. I collect vintage vinyl records.",
        "I'm a 27-year-old marine biologist studying coral reefs in San Diego. I love scuba diving and underwater photography. I'm passionate about ocean conservation and climate change.",
        "I'm a 33-year-old architect designing sustainable buildings in Portland. I believe in green construction and volunteer for Habitat for Humanity. I enjoy urban sketching.",
        "I'm a 29-year-old data scientist working in healthcare analytics in Austin. I love solving puzzles and play chess competitively. I brew craft beer as a hobby.",
    ]

    logger.info(f"Created {len(backup_personas)} backup personas")
    return backup_personas

def download_personas():
    """
    Try to get the real persona dataset from HuggingFace.

    Streams at most MAX_PERSONAS records from PERSONA_DATASET and keeps every
    record whose "persona" field is a non-blank string, prefixed with
    "Person <n>: " where n is the record's 1-based position in the stream.

    Falls back to make_sample_personas() when:
    - the `datasets` library is not installed,
    - loading or streaming the dataset raises, or
    - the stream produced no usable persona text.

    Returns:
        A list of persona description strings.
    """
    logger.info("Trying to download persona dataset...")

    if not CAN_LOAD_DATASETS:
        logger.warning("Can't load datasets library, using backups")
        return make_sample_personas()

    try:
        # Load the dataset (streaming to save memory)
        dataset = load_dataset(PERSONA_DATASET, split="train", streaming=True)

        personas = []
        for i, item in enumerate(dataset):
            if i >= MAX_PERSONAS:  # Don't go over my limit
                break

            persona_text = item.get("persona")
            # A missing/None/non-string field must only skip this record;
            # calling .strip() on it would raise and throw away everything
            # downloaded so far via the fallback below.
            if isinstance(persona_text, str) and persona_text.strip():
                personas.append(f"Person {i+1}: {persona_text}")

            if (i + 1) % 50 == 0:
                logger.info(f"Downloaded {i+1} personas...")

    except Exception as e:
        # Best effort by design: any download problem means "use backups"
        logger.warning(f"Download failed: {e}, using backups")
        return make_sample_personas()

    if not personas:
        # An empty result would leave the index with nothing to build from
        logger.warning("Dataset had no usable personas, using backups")
        return make_sample_personas()

    logger.info(f"Got {len(personas)} personas from HuggingFace!")
    return personas

def make_documents(personas):
    """
    Wrap each persona string in a LlamaIndex Document.

    Every document carries metadata with its 0-based position in the input:
    "source" ("persona_<i>"), "persona_id" (i) and a fixed "type".

    Args:
        personas: sequence of persona description strings.

    Returns:
        A list of Document objects, one per persona, in input order.
    """
    logger.info(f"Making documents from {len(personas)} personas...")

    persona_docs = [
        Document(
            text=description,
            metadata={
                "source": f"persona_{position}",
                "persona_id": position,
                "type": "persona_description",
            },
        )
        for position, description in enumerate(personas)
    ]

    logger.info(f"Created {len(persona_docs)} documents")
    return persona_docs

def setup_vector_store():
    """
    Create the ChromaDB-backed vector store used for persona embeddings.

    Opens (or creates) the "my_personas" collection on a default
    chromadb.Client() and wraps it in a ChromaVectorStore for LlamaIndex.

    Returns:
        The ChromaVectorStore, or None when chromadb is not installed or
        the setup raises (the error is logged in both cases).
    """
    if CHROMADB_WORKS:
        try:
            logger.info("Setting up in-memory vector store...")

            # Default client keeps everything in memory (no files to worry about)
            chroma_client = chromadb.Client()
            persona_collection = chroma_client.get_or_create_collection("my_personas")

            # Adapter that lets LlamaIndex read/write the Chroma collection
            store = ChromaVectorStore(chroma_collection=persona_collection)

            logger.info("Vector store ready!")
            return store

        except Exception as err:
            logger.error(f"Vector store setup failed: {err}")
            return None

    logger.error("ChromaDB not available!")
    return None

def build_persona_index():
    """
    Build my persona index from scratch.

    Pipeline: load personas (download or backups) -> wrap them in Documents
    -> create the Chroma vector store -> load the embedding model -> embed
    and index everything. This might take a minute the first time because
    the embedding model has to be loaded.

    Returns:
        The VectorStoreIndex, or None if any step fails. Failures are
        logged, never raised, so callers only need to check for None.
    """
    logger.info("Building persona index...")

    try:
        # Step 1: Get the persona data
        personas = download_personas()
        if not personas:
            logger.error("No persona data available")
            return None

        # Step 2: Make documents
        documents = make_documents(personas)

        # Step 3: Set up vector storage
        vector_store = setup_vector_store()
        if vector_store is None:
            logger.error("Can't create vector store")
            return None

        # Step 4: Set up embeddings
        try:
            embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)
            logger.info(f"Loaded embedding model: {EMBEDDING_MODEL}")
        except Exception as e:
            logger.error(f"Can't load embeddings: {e}")
            return None

        # Step 5: Build the index
        logger.info("Creating vector index... this might take a moment")

        # from_documents() has no `vector_store` parameter: passing one is
        # swallowed by **kwargs and the index quietly uses its default
        # in-memory store. The store has to be supplied through a
        # StorageContext for the Chroma collection to actually be used.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        index = VectorStoreIndex.from_documents(
            documents=documents,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True,
        )

        logger.info("Persona index built successfully!")
        return index

    except Exception as e:
        logger.error(f"Index building failed: {e}")
        return None

def get_persona_index():
    """
    Return the shared persona index, building it on first use.

    The result of build_persona_index() is stored in the module-level cache.
    A failed build leaves the cache as None, so the next call tries again.

    Returns:
        The cached index, or None if building it failed.
    """
    global _my_persona_index

    if _my_persona_index is not None:
        logger.info("Using cached persona index")
        return _my_persona_index

    logger.info("Building persona index for the first time...")
    _my_persona_index = build_persona_index()
    return _my_persona_index

def get_persona_query_engine(llm=None):
    """
    Build a query engine over the persona index.

    This is the entry point my tools call to search personas.

    Args:
        llm: LLM handed to the query engine; forwarded as-is (None included).

    Returns:
        A query engine configured for top-3 retrieval with tree_summarize
        responses, or None when no index is available or creation fails.
    """
    try:
        persona_index = get_persona_index()
        if persona_index is not None:
            engine_options = {
                "llm": llm,  # Use the LLM from my agent
                "response_mode": "tree_summarize",  # Good for combining multiple results
                "similarity_top_k": 3,  # Get top 3 matches
                "streaming": False,
            }
            engine = persona_index.as_query_engine(**engine_options)

            logger.info("Persona query engine ready")
            return engine

        logger.warning("No persona index available")
        return None

    except Exception as err:
        logger.error(f"Query engine creation failed: {err}")
        return None

def test_my_personas():
    """
    Smoke-test the persona system and print one pass/fail line per stage.

    Stages, in order:
    1. report which optional dependencies are importable,
    2. load personas (download or backups),
    3. create the vector store,
    4. build a tiny index from three backup personas and retrieve from it.

    Returns:
        True when every stage succeeds, False at the first failure.
    """
    print("\n=== Testing My Persona Database ===")

    # Check dependencies
    print(f"Datasets available: {CAN_LOAD_DATASETS}")
    print(f"ChromaDB available: {CHROMADB_WORKS}")

    if not CHROMADB_WORKS:
        print("❌ ChromaDB missing - persona database won't work")
        return False

    # Test data loading
    print("\nTesting persona loading...")
    try:
        personas = download_personas()
        print(f"✅ Got {len(personas)} personas")
        if personas:
            print(f"Sample: {personas[0][:100]}...")
    except Exception as e:
        print(f"❌ Persona loading failed: {e}")
        return False

    # Test vector store
    print("\nTesting vector store...")
    try:
        vector_store = setup_vector_store()
        if vector_store is None:
            print("❌ Vector store failed")
            return False
        print("✅ Vector store created")
    except Exception as e:
        print(f"❌ Vector store error: {e}")
        return False

    # Test index building (small test)
    print("\nTesting index building...")
    try:
        # Use just a few personas for testing
        test_personas = make_sample_personas()[:3]
        test_docs = make_documents(test_personas)

        vector_store = setup_vector_store()
        if vector_store is None:
            print("❌ Vector store failed")
            return False
        embed_model = HuggingFaceEmbedding(model_name=EMBEDDING_MODEL)

        # from_documents() ignores a `vector_store=` keyword, so the store
        # must go in through a StorageContext or Chroma is never exercised.
        # NOTE(review): setup_vector_store() always opens the "my_personas"
        # collection, so these sample documents may sit next to real
        # personas if in-memory Chroma state is shared within the process -
        # confirm before calling this from a long-running app.
        storage_context = StorageContext.from_defaults(vector_store=vector_store)

        index = VectorStoreIndex.from_documents(
            documents=test_docs,
            storage_context=storage_context,
            embed_model=embed_model,
        )

        print("✅ Index building works")

        # Test a simple query. Use a retriever rather than a query engine:
        # a query engine needs an LLM, and none is configured here, while
        # retrieval only needs the embedding model loaded above.
        retriever = index.as_retriever(similarity_top_k=1)
        hits = retriever.retrieve("software developer")
        if not hits:
            print("❌ Query test returned no matches")
            return False
        print("✅ Query test passed")

        return True

    except Exception as e:
        print(f"❌ Index test failed: {e}")
        return False

if __name__ == "__main__":
    # Script entry point: run the self-test with INFO logging so each
    # stage's log lines show up next to the printed results.
    import logging
    logging.basicConfig(level=logging.INFO)

    print("Testing My Persona Database System")
    print("=" * 40)

    verdict = (
        "\n✅ Persona database is working!"
        if test_my_personas()
        else "\n❌ Persona database has issues"
    )
    print(verdict)

    # Summary of the design choices made for HuggingFace Spaces
    for summary_line in (
        "\nThis system is optimized for HuggingFace Spaces:",
        "- Uses in-memory storage (no files)",
        "- Limited personas (saves memory)",
        "- Fallback data (works offline)",
        "- Fast startup (cached building)",
    ):
        print(summary_line)