Spaces:

Supra-Nexus
/

supra-nexus-o2

Sleeping

File size: 12,041 Bytes

#!/usr/bin/env python3
"""
SUPRA RAG System with CPU/MPS/CUDA Optimizations
Optimized for CPU (HF Spaces), MPS (Apple Silicon), and CUDA with efficient memory management
"""

import json
import chromadb
import torch
import os
from sentence_transformers import SentenceTransformer
from pathlib import Path
from typing import List, Dict, Any
import streamlit as st
import logging

# Configure logging: set up the root logger at INFO level when this module is
# imported, so the progress messages emitted by SupraRAG are not filtered out.
logging.basicConfig(level=logging.INFO)
# Module-level logger named after this module; used throughout SupraRAG.
logger = logging.getLogger(__name__)

class SupraRAG:
    """Index SUPRA seed documents in ChromaDB and build RAG-augmented prompts.

    Construction resolves the JSONL seed file, selects a torch device
    (MPS, then CUDA, then CPU), creates a ChromaDB client and makes sure the
    ``supra_knowledge`` collection holds the documents found in the seed file.
    """

    # Seed file location used when nothing else is found.
    _DEFAULT_RAG_PATH = "data/processed/rag_seeds/rag_seeds.jsonl"
    # Stored documents are cut to this many characters (plus "...").
    _MAX_DOC_CHARS = 2000
    # Retrieved documents are cut to this many characters (plus "...").
    _MAX_RETRIEVED_CHARS = 1500
    # Queries longer than this are truncated before being sent to ChromaDB.
    _MAX_QUERY_CHARS = 500
    # Upper bound on the number of results requested from ChromaDB.
    _MAX_RESULTS = 5
    # Number of documents sent to ``collection.add`` per call.
    _BATCH_SIZE = 50
    # Budget for the concatenated RAG context placed in the prompt.
    _MAX_CONTEXT_CHARS = 2000
    # Each context document contributes at most this many characters.
    _MAX_CHARS_PER_CONTEXT_DOC = 800
    # Model names handed to ``build_supra_prompt`` for chat-template selection.
    _LLAMA_MODEL_NAME = 'unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit'
    _MISTRAL_MODEL_NAME = 'unsloth/mistral-7b-instruct-v0.3-bnb-4bit'

    def __init__(self, rag_data_path: str = None):
        """Create the RAG helper and populate its knowledge collection.

        Args:
            rag_data_path: Path to the JSONL seed file. When ``None`` a few
                conventional locations are probed and the first existing one
                is used.
        """
        # Path of the JSONL seed file (may point at a file that does not exist).
        self.rag_data_path = self._resolve_rag_data_path(rag_data_path)

        # Sets self.device to "mps", "cuda" or "cpu".
        self._setup_device_optimizations()

        self.client = chromadb.Client()
        self.collection_name = "supra_knowledge"

        # NOTE(review): the collection below is created without an explicit
        # embedding function and no method of this class references
        # self.embedding_model, so add/query do not appear to use it.
        # Confirm whether it should be wired into the collection.
        self.embedding_model = SentenceTransformer(
            'all-MiniLM-L6-v2',
            device=self.device
        )

        self._init_collection()

    @classmethod
    def _resolve_rag_data_path(cls, rag_data_path) -> Path:
        """Return the seed file path, probing default locations when none is given."""
        if rag_data_path is not None:
            return Path(rag_data_path)

        candidates = (
            Path(cls._DEFAULT_RAG_PATH),
            Path(__file__).parent.parent / cls._DEFAULT_RAG_PATH,
            Path("rag_seeds.jsonl"),
        )
        # Fall back to the default location even if it is missing; the loader
        # reports the missing file later.
        fallback = Path(cls._DEFAULT_RAG_PATH)
        return next((path for path in candidates if path.exists()), fallback)

    def _init_collection(self):
        """Attach to the knowledge collection, (re)loading it when it is stale."""
        try:
            collection = self.client.get_collection(self.collection_name)
        except Exception:
            # The lookup failed, presumably because the collection does not
            # exist yet (the concrete exception type depends on the chromadb
            # version, hence the broad catch). Start from a fresh collection.
            self.collection = self.client.create_collection(self.collection_name)
            self._load_rag_documents()
            return

        self.collection = collection
        current_count = collection.count()
        # Count only records the loader would actually store, so malformed
        # lines in the seed file do not trigger a reload on every start.
        expected_count = sum(1 for _ in self._iter_rag_records(warn=False))

        if current_count != expected_count:
            logger.info(f"🔄 Reloading RAG documents (current: {current_count}, expected: {expected_count})")
            # Delete and recreate the collection so stale entries disappear.
            self.client.delete_collection(self.collection_name)
            self.collection = self.client.create_collection(self.collection_name)
            self._load_rag_documents()
        else:
            logger.info(f"✅ RAG knowledge base loaded ({current_count} documents)")

    def _setup_device_optimizations(self):
        """Pick the compute device (MPS > CUDA > CPU) and set related env vars.

        Sets ``self.device`` to ``"mps"``, ``"cuda"`` or ``"cpu"``.
        """
        logger.info("🔧 Setting up device optimizations...")

        os.environ["TOKENIZERS_PARALLELISM"] = "false"

        # Guard the attribute lookup so torch builds without an MPS backend
        # module simply fall through to the CUDA/CPU checks.
        mps_backend = getattr(torch.backends, "mps", None)
        if mps_backend is not None and mps_backend.is_available():
            logger.info("✅ MPS (Metal Performance Shaders) available - using MPS")
            self.device = "mps"
            os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
        elif torch.cuda.is_available():
            logger.info("✅ CUDA available - using GPU")
            self.device = "cuda"
        else:
            logger.info("💻 CPU detected - using CPU optimizations")
            self.device = "cpu"

        logger.info(f"🔧 Using device: {self.device}")

    @staticmethod
    def _metadata_value(value):
        """Coerce a metadata field to a scalar (``None`` becomes an empty string)."""
        if value is None:
            return ''
        if isinstance(value, (str, int, float, bool)):
            return value
        return str(value)

    def _iter_rag_records(self, warn: bool = True):
        """Yield ``(content, metadata, doc_id)`` for every usable seed-file line.

        Blank lines are ignored. Lines that are not valid JSON, are not JSON
        objects, lack ``id``/``content``, carry non-string content, or repeat
        an earlier id are skipped.

        Args:
            warn: When true, log a warning for every skipped line.
        """
        if not self.rag_data_path.exists():
            return

        seen_ids = set()
        with open(self.rag_data_path, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    continue

                reason = None
                try:
                    doc = json.loads(line)
                except json.JSONDecodeError as e:
                    reason = f"JSON decode error - {e}"
                else:
                    if (
                        not isinstance(doc, dict)
                        or doc.get('id') is None
                        or 'content' not in doc
                    ):
                        reason = "missing required fields"
                    elif not isinstance(doc['content'], str):
                        reason = "content is not a string"
                    elif str(doc['id']) in seen_ids:
                        reason = f"duplicate id {doc['id']!r}"

                if reason is not None:
                    if warn:
                        logger.warning(f"⚠️ Skipping line {line_num}: {reason}")
                    continue

                # Ids are stringified so numeric ids in the seed file are usable.
                doc_id = str(doc['id'])
                seen_ids.add(doc_id)

                # Truncate content for memory efficiency.
                content = doc['content']
                if len(content) > self._MAX_DOC_CHARS:
                    content = content[:self._MAX_DOC_CHARS] + "..."

                metadata = {
                    'title': self._metadata_value(doc.get('title', '')),
                    'type': self._metadata_value(doc.get('type', '')),
                    'source': self._metadata_value(doc.get('source', '')),
                    'word_count': len(content.split())
                }
                yield content, metadata, doc_id

    def _load_rag_documents(self):
        """Read the JSONL seed file and add its documents to ``self.collection``.

        Documents are added in batches of ``_BATCH_SIZE``. A missing file or a
        file without usable records is reported via the logger and the
        Streamlit UI, and leaves the collection untouched.
        """
        if not self.rag_data_path.exists():
            logger.warning("⚠️ RAG data file not found")
            st.warning("⚠️ RAG data file not found")
            return

        logger.info(f"📚 Loading RAG documents from {self.rag_data_path}")

        documents = []
        metadatas = []
        ids = []
        for content, metadata, doc_id in self._iter_rag_records():
            documents.append(content)
            metadatas.append(metadata)
            ids.append(doc_id)

        if not documents:
            logger.warning("⚠️ No valid documents found in RAG data file")
            st.warning("⚠️ No valid documents found in RAG data file")
            return

        batch_size = self._BATCH_SIZE
        total_batches = (len(documents) - 1) // batch_size + 1
        for batch_no, start in enumerate(range(0, len(documents), batch_size), 1):
            stop = start + batch_size
            self.collection.add(
                documents=documents[start:stop],
                metadatas=metadatas[start:stop],
                ids=ids[start:stop]
            )
            logger.info(f"📊 Processed batch {batch_no}/{total_batches}")

        logger.info(f"✅ Loaded {len(documents)} RAG documents")

    def retrieve_context(self, query: str, n_results: int = 3) -> List[Dict[str, Any]]:
        """Return the documents from the collection that best match ``query``.

        Args:
            query: Search text; only the first ``_MAX_QUERY_CHARS`` characters are used.
            n_results: Number of documents wanted, capped at ``_MAX_RESULTS``.
                Values below 1 yield an empty list.

        Returns:
            A list of dicts with ``content`` (truncated to
            ``_MAX_RETRIEVED_CHARS`` characters plus ``"..."``), ``metadata``
            and ``distance`` keys. Any retrieval error is logged, shown in the
            Streamlit UI, and results in an empty list.
        """
        try:
            # Nothing requested: skip the query instead of letting it fail.
            if n_results < 1:
                return []

            results = self.collection.query(
                query_texts=[query[:self._MAX_QUERY_CHARS]],
                n_results=min(n_results, self._MAX_RESULTS)
            )

            context_docs = []
            # Index 0 selects the hits for the single query text sent above.
            hits = zip(
                results['documents'][0],
                results['metadatas'][0],
                results['distances'][0],
            )
            for content, metadata, distance in hits:
                if len(content) > self._MAX_RETRIEVED_CHARS:
                    content = content[:self._MAX_RETRIEVED_CHARS] + "..."
                context_docs.append({
                    'content': content,
                    'metadata': metadata,
                    'distance': distance
                })

            logger.info(f"🔍 Retrieved {len(context_docs)} context documents")
            return context_docs

        except Exception as e:
            logger.error(f"RAG retrieval error: {e}")
            st.error(f"RAG retrieval error: {e}")
            return []

    def build_enhanced_prompt(self, user_query: str, context_docs: List[Dict[str, Any]]) -> str:
        """Combine the user query, detected facts and RAG context into one prompt.

        Args:
            user_query: The question as typed by the user.
            context_docs: Documents as returned by ``retrieve_context``.

        Returns:
            The prompt produced by ``supra_facts.build_supra_prompt``.
        """
        # Imported lazily, as in the rest of this module's package-local imports.
        from .supra_facts import build_supra_prompt, inject_facts_for_query

        rag_context = None
        if context_docs:
            # Take documents in order until the next one would exceed the
            # budget; each accepted chunk is followed by a blank line.
            chunks = []
            used_chars = 0
            for doc in context_docs:
                doc_text = doc['content'][:self._MAX_CHARS_PER_CONTEXT_DOC]
                if used_chars + len(doc_text) > self._MAX_CONTEXT_CHARS:
                    break
                chunks.append(doc_text + "\n\n")
                used_chars += len(doc_text) + 2

            context_text = "".join(chunks)
            rag_context = [context_text] if context_text else None

        # Auto-detect relevant facts from the query.
        facts = inject_facts_for_query(user_query)

        # The model name tells build_supra_prompt which chat template to use.
        from .model_loader import get_model_info
        try:
            model_info = get_model_info()
            base_model = model_info.get('base_model', '')
            if 'llama' in base_model.lower():
                model_name = self._LLAMA_MODEL_NAME
            else:
                model_name = model_info.get('model_name', self._MISTRAL_MODEL_NAME)
        except Exception:
            # Model info unavailable or malformed: default to the Llama template.
            model_name = self._LLAMA_MODEL_NAME

        return build_supra_prompt(
            user_query=user_query,
            facts=facts,
            rag_context=rag_context,
            model_name=model_name
        )

    def generate_response(self, query: str, model, tokenizer, max_new_tokens: int = 800) -> str:
        """Answer ``query`` with the given model, using retrieved context.

        Args:
            query: The user question.
            model: Model object forwarded to ``generate_response_optimized``.
            tokenizer: Tokenizer object forwarded to ``generate_response_optimized``.
            max_new_tokens: Generation length limit forwarded to the generator.

        Returns:
            The generated text, or an apology string containing the error
            message when anything in the pipeline raises.
        """
        try:
            logger.info(f"🤖 Generating response for query: {query[:50]}...")

            context_docs = self.retrieve_context(query, n_results=3)
            enhanced_prompt = self.build_enhanced_prompt(query, context_docs)

            from .model_loader import generate_response_optimized

            response = generate_response_optimized(
                model=model,
                tokenizer=tokenizer,
                prompt=enhanced_prompt,
                max_new_tokens=max_new_tokens,
                temperature=0.6,  # Lower temperature for more focused responses
                top_p=0.85  # Tighter sampling
            )

            logger.info(f"✅ Generated response ({len(response)} characters)")
            return response

        except Exception as e:
            logger.error(f"Error generating response: {e}")
            st.error(f"Error generating response: {e}")
            return f"I apologize, but I encountered an error while generating a response: {e}"

# Shared RAG instance: wrapped in Streamlit's resource cache so SupraRAG is built once
@st.cache_resource
def get_supra_rag():
    """Return the cached SupraRAG instance, constructed with default arguments."""
    rag_instance = SupraRAG()
    return rag_instance

# Legacy entry point retained so older imports of this name keep working
def get_supra_rag_m2max():
    """Return the same instance as ``get_supra_rag`` (kept for old call sites)."""
    shared_rag = get_supra_rag()
    return shared_rag