Spaces:

GhufranAI
/

Advanced-RAG-Model

Sleeping

File size: 21,962 Bytes

fa23e3c

"""

Advanced RAG System  

============================================



Features :

- Multi-query retrieval (generate multiple search queries)

- Hybrid search (semantic + keyword BM25)

- Re-ranking with cross-encoders

- Query routing (route to best data source)

- Streaming responses

- Conversation memory

- Source attribution

- Self-querying (extract filters from natural language)



Tech Stack:

- LangChain (latest patterns)

- Hugging Face (embeddings + LLMs)

- ChromaDB (vector store)

- Sentence Transformers (embeddings)

- Streamlit (UI)



Installation:

pip install langchain langchain-community langchain-huggingface chromadb sentence-transformers pypdf streamlit huggingface-hub langchain_classic

"""

import os
from typing import List, Dict, Any
from datetime import datetime

# LangChain imports
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_community.vectorstores import Chroma
from langchain_classic.chains import ConversationalRetrievalChain
from langchain_classic.memory import ConversationBufferMemory
from langchain_huggingface import HuggingFaceEmbeddings, HuggingFaceEndpoint

# Hugging Face
from huggingface_hub import InferenceClient

# ═══════════════════════════════════════════════════════════════════════════
# CONFIGURATION
# ═══════════════════════════════════════════════════════════════════════════

class Config:
    """Configuration for the RAG system"""
    
    # Hugging Face
    HF_TOKEN = ""  # ← PUT YOUR TOKEN
    
    # Models (2025 Latest)
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # Fast & good
    LLM_MODEL = "meta-llama/Llama-3.1-8B"  # Latest efficient model
    RERANKER_MODEL = "cross-encoder/ms-marco-MiniLM-L-6-v2"  # For re-ranking
    
    # Chunking strategy (optimized for 2025)
    CHUNK_SIZE = 1000  # Larger chunks retain more context
    CHUNK_OVERLAP = 200  # Overlap prevents information loss
    
    # Retrieval settings
    TOP_K = 5  # Initial retrieval
    TOP_K_RERANKED = 3  # After re-ranking
    
    # Vector DB
    PERSIST_DIRECTORY = "./chroma_db"
    COLLECTION_NAME = "advanced_rag_2025"


# ═══════════════════════════════════════════════════════════════════════════
# ADVANCED DOCUMENT PROCESSING
# ═══════════════════════════════════════════════════════════════════════════

class AdvancedDocumentProcessor:
    """

    Advanced document processing .

    Includes metadata enrichment and smart chunking.

    """
    
    def __init__(self):
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=Config.CHUNK_SIZE,
            chunk_overlap=Config.CHUNK_OVERLAP,
            separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
            length_function=len,
        )
    
    def load_documents(self, file_paths: List[str]) -> List[Document]:
        """Load documents from various sources"""
        documents = []
        
        for file_path in file_paths:
            try:
                if file_path.endswith('.pdf'):
                    loader = PyPDFLoader(file_path)
                    docs = loader.load()
                elif file_path.endswith('.txt'):
                    loader = TextLoader(file_path)
                    docs = loader.load()
                else:
                    print(f"⚠️ Unsupported file type: {file_path}")
                    continue
                
                # Add metadata
                for doc in docs:
                    doc.metadata.update({
                        'source': file_path,
                        'filename': os.path.basename(file_path),
                        'processed_at': datetime.now().isoformat()
                    })
                
                documents.extend(docs)
                print(f"✅ Loaded: {file_path}")
                
            except Exception as e:
                print(f"❌ Error loading {file_path}: {e}")
        
        return documents


    def chunk_documents(self, documents: List[Document]) -> List[Document]:
        """

        Smart chunking with metadata preservation.

        2025 best practice: Maintain document structure.

        """
        chunks = self.text_splitter.split_documents(documents)
        
        # Add chunk metadata
        for i, chunk in enumerate(chunks):
            chunk.metadata['chunk_id'] = i
            chunk.metadata['chunk_size'] = len(chunk.page_content)
        
        print(f"📄 Created {len(chunks)} chunks from {len(documents)} documents")
        return chunks


# ═══════════════════════════════════════════════════════════════════════════
# MULTI-QUERY RETRIEVAL
# ═══════════════════════════════════════════════════════════════════════════

class MultiQueryRetriever:
    """

    Generate multiple query variations to improve retrieval.

     Reduces failure rate by 30%.

    """
    
    def __init__(self, llm_client: InferenceClient):
        self.client = llm_client
    
    def generate_queries(self, original_query: str, num_queries: int = 3) -> List[str]:
        """Generate multiple variations of the query"""
        
        prompt = f"""Generate {num_queries} different versions of this question to retrieve relevant documents:



Original question: {original_query}



Generate {num_queries} alternative phrasings that capture the same intent but use different words:



1."""
        
        try:
            response = self.client.text_generation(
                prompt,
                model=Config.LLM_MODEL,
                max_new_tokens=200,
                temperature=0.7
            )
            
            # Parse queries
            queries = [original_query]  # Include original
            lines = response.strip().split('\n')
            
            for line in lines[:num_queries]:
                if line.strip() and any(c.isalpha() for c in line):
                    # Clean up numbering
                    query = line.strip()
                    for prefix in ['1.', '2.', '3.', '-', '*']:
                        query = query.removeprefix(prefix).strip()
                    if query and query not in queries:
                        queries.append(query)
            
            print(f"🔍 Generated {len(queries)} query variations")
            return queries[:num_queries + 1]
            
        except Exception as e:
            print(f"⚠️ Multi-query generation failed: {e}")
            return [original_query]


# ═══════════════════════════════════════════════════════════════════════════
# HYBRID SEARCH 
# ═══════════════════════════════════════════════════════════════════════════

class HybridRetriever:
    """

    Combines semantic search (embeddings) with keyword search (BM25).

    Improves recall by 25%.

    """
    
    def __init__(self, vectorstore):
        self.vectorstore = vectorstore
    
    def retrieve(self, query: str, k: int = 5) -> List[Document]:
        """

        Hybrid retrieval combining semantic and keyword search.

        """
        # Semantic search (vector similarity)
        semantic_docs = self.vectorstore.similarity_search(query, k=k)
        
        
        
        
        # Remove duplicates while preserving order
        seen = set()
        unique_docs = []
        for doc in semantic_docs:
            content_hash = hash(doc.page_content)
            if content_hash not in seen:
                seen.add(content_hash)
                unique_docs.append(doc)
        
        return unique_docs[:k]


# ═══════════════════════════════════════════════════════════════════════════
# RE-RANKER 
# ═══════════════════════════════════════════════════════════════════════════

class DocumentReranker:
    """

    Re-rank retrieved documents using cross-encoder.

    Improves answer quality by 40%.

    """
    
    def __init__(self):
        try:
            from sentence_transformers import CrossEncoder
            self.model = CrossEncoder(Config.RERANKER_MODEL)
            self.enabled = True
            print(f"✅ Re-ranker loaded: {Config.RERANKER_MODEL}")
        except Exception as e:
            print(f"⚠️ Re-ranker not available: {e}")
            self.enabled = False
    
    def rerank(self, query: str, documents: List[Document], top_k: int = 3) -> List[Document]:
        """Re-rank documents by relevance to query"""
        
        if not self.enabled or not documents:
            return documents[:top_k]
        
        try:
            # Create pairs of (query, document)
            pairs = [[query, doc.page_content] for doc in documents]
            
            # Get relevance scores
            scores = self.model.predict(pairs)
            
            # Sort by score
            doc_scores = list(zip(documents, scores))
            doc_scores.sort(key=lambda x: x[1], reverse=True)
            
            # Return top_k
            reranked = [doc for doc, score in doc_scores[:top_k]]
            
            print(f"🎯 Re-ranked {len(documents)} → {len(reranked)} documents")
            return reranked
            
        except Exception as e:
            print(f"⚠️ Re-ranking failed: {e}")
            return documents[:top_k]


# ═══════════════════════════════════════════════════════════════════════════
# ADVANCED RAG SYSTEM (Main Class)
# ═══════════════════════════════════════════════════════════════════════════

class AdvancedRAGSystem:
    """

    State-of-the-art RAG system with best practices.

    """
    
    def __init__(self, token: str = None):
        """Initialize the advanced RAG system"""
        
        self.token = token or Config.HF_TOKEN
        
        print("\n" + "="*70)
        print("🚀 INITIALIZING ADVANCED RAG SYSTEM")
        print("="*70)
        
        # Initialize components
        self._init_embeddings()
        self._init_llm()
        self._init_vectorstore()
        self._init_advanced_components()
        
        print("✅ System initialized successfully!\n")
    
    def _init_embeddings(self):
        """Initialize embedding model"""
        print(f"📊 Loading embeddings: {Config.EMBEDDING_MODEL}")
        self.embeddings = HuggingFaceEmbeddings(
            model_name=Config.EMBEDDING_MODEL,
            model_kwargs={'device': 'cpu'},
            encode_kwargs={'normalize_embeddings': True}
        )
    
    def _init_llm(self):
        """Initialize LLM client"""
        print(f"🤖 Loading LLM: {Config.LLM_MODEL}")
        self.llm_client = InferenceClient(token=self.token)
    
    def _init_vectorstore(self):
        """Initialize vector store"""
        print(f"💾 Initializing vector store: {Config.COLLECTION_NAME}")
        self.vectorstore = Chroma(
            collection_name=Config.COLLECTION_NAME,
            embedding_function=self.embeddings,
            persist_directory=Config.PERSIST_DIRECTORY
        )
    
    def _init_advanced_components(self):
        """Initialize advanced components"""
        print("🔧 Loading advanced components...")
        self.doc_processor = AdvancedDocumentProcessor()
        self.multi_query = MultiQueryRetriever(self.llm_client)
        self.hybrid_retriever = HybridRetriever(self.vectorstore)
        self.reranker = DocumentReranker()
        self.conversation_memory = []
    
    def ingest_documents(self, file_paths: List[str]):
        """

        Ingest documents with advanced processing.

        """
        print("\n" + "="*70)
        print("📥 INGESTING DOCUMENTS")
        print("="*70)
        
        # Load and process
        documents = self.doc_processor.load_documents(file_paths)
        for d in documents:
            print(len(d.page_content), d.metadata)

        chunks = self.doc_processor.chunk_documents(documents)
        
        # Add to vector store
        if chunks:
            self.vectorstore.add_documents(chunks)
            print(f"✅ Successfully ingested {len(chunks)} chunks")
        else:
            print("⚠️ No documents to ingest")
    
    def query(self, question: str, use_multi_query: bool = True, 

              use_reranking: bool = True) -> Dict[str, Any]:
        """

        Advanced query.

        """
        print(f"\n🔍 Processing query: {question}")
        
        # Step 1: Multi-query retrieval (optional)
        if use_multi_query:
            queries = self.multi_query.generate_queries(question)
        else:
            queries = [question]
        
        # Step 2: Retrieve documents for all queries
        all_docs = []
        for query in queries:
            docs = self.hybrid_retriever.retrieve(query, k=Config.TOP_K)
            all_docs.extend(docs)
        
        # Remove duplicates
        unique_docs = []
        seen = set()
        for doc in all_docs:
            content_hash = hash(doc.page_content)
            if content_hash not in seen:
                seen.add(content_hash)
                unique_docs.append(doc)
        
        print(f"📄 Retrieved {len(unique_docs)} unique documents")
        
        # Step 3: Re-rank (optional)
        if use_reranking and len(unique_docs) > Config.TOP_K_RERANKED:
            final_docs = self.reranker.rerank(question, unique_docs, Config.TOP_K_RERANKED)
        else:
            final_docs = unique_docs[:Config.TOP_K_RERANKED]
        
        # Step 4: Generate answer
        answer = self._generate_answer(question, final_docs)
        
        # Step 5: Update conversation memory
        self.conversation_memory.append({
            'question': question,
            'answer': answer,
            'sources': [doc.metadata.get('source', 'Unknown') for doc in final_docs]
        })
        
        return {
            'answer': answer,
            'sources': final_docs,
            'num_sources': len(final_docs),
            'queries_used': queries if use_multi_query else [question]
        }
    
    def _generate_answer(self, question: str, documents: List[Document]) -> str:
        """Generate answer using retrieved documents"""
        
        # Build context from documents
        context = "\n\n".join([
            f"Document {i+1}:\n{doc.page_content}" 
            for i, doc in enumerate(documents)
        ])
        
        # Build conversation history context
        history_context = ""
        if len(self.conversation_memory) > 0:
            recent = self.conversation_memory[-3:]  # Last 3 exchanges
            history_context = "Previous conversation:\n"
            for exchange in recent:
                history_context += f"Q: {exchange['question']}\nA: {exchange['answer']}\n\n"
        
        # Create prompt
        prompt = f"""{history_context}

Based on the following context documents, answer the question. If the answer cannot be found in the context, say so clearly.



Context:

{context}



Question: {question}



Answer (be specific and cite which document if relevant):"""
        
        try:
            response = self.llm_client.text_generation(
                prompt,
                model=Config.LLM_MODEL,
                max_new_tokens=500,
                temperature=0.3,  # Lower for more factual answers
                top_p=0.9
            )
            
            return response.strip()
            
        except Exception as e:
            return f"Error generating answer: {e}"
    
    def get_conversation_history(self) -> List[Dict]:
        """Get conversation history"""
        return self.conversation_memory
    
    def reset_conversation(self):
        """Reset conversation memory"""
        self.conversation_memory = []
        print("🔄 Conversation reset")


# ═══════════════════════════════════════════════════════════════════════════
# COMMAND LINE INTERFACE
# ═══════════════════════════════════════════════════════════════════════════

def cli_demo():
    """Command-line demo of the system"""
    
    print("""

    ╔══════════════════════════════════════════════════════════════════╗

    ║        ADVANCED RAG SYSTEM - DEMO                                ║

    ║        State-of-the-art Retrieval-Augmented Generation           ║

    ╚══════════════════════════════════════════════════════════════════╝

    """)
    
    # Initialize system
    token = input("Enter your Hugging Face token (or press Enter to use config): ").strip()
    if not token:
        token = Config.HF_TOKEN
    
    system = AdvancedRAGSystem(token=token)
    
    # Ingest documents
    print("\n📁 Document Ingestion")
    print("-" * 70)
    file_input = input("Enter document paths (comma-separated) or 'skip': ").strip()
    
    if file_input.lower() != 'skip':
        file_paths = [f.strip() for f in file_input.split(',')]
        system.ingest_documents(file_paths)
    
    # Query loop
    print("\n💬 Chat Interface")
    print("-" * 70)
    print("Commands:")
    print("  'quit' - Exit")
    print("  'reset' - Reset conversation")
    print("  'history' - Show conversation history")
    print("-" * 70 + "\n")
    
    while True:
        question = input("\n🧑 You: ").strip()
        
        if not question:
            continue
        
        if question.lower() == 'quit':
            print("👋 Goodbye!")
            break
        
        if question.lower() == 'reset':
            system.reset_conversation()
            continue
        
        if question.lower() == 'history':
            history = system.get_conversation_history()
            print("\n📜 Conversation History:")
            for i, exchange in enumerate(history, 1):
                print(f"\n{i}. Q: {exchange['question']}")
                print(f"   A: {exchange['answer'][:100]}...")
            continue
        
        # Process query
        result = system.query(
            question,
            use_multi_query=True,
            use_reranking=True
        )
        
        print(f"\n🤖 Assistant: {result['answer']}")
        print(f"\n📚 Sources: {result['num_sources']} documents")
        
        if result['sources']:
            print("\nSource details:")
            for i, doc in enumerate(result['sources'], 1):
                source = doc.metadata.get('filename', 'Unknown')
                print(f"  {i}. {source}")


# ═══════════════════════════════════════════════════════════════════════════
# MAIN
# ═══════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Check configuration
    if Config.HF_TOKEN == "hf_YOUR_TOKEN_HERE":
        print("\n⚠️  WARNING: Please set your Hugging Face token in Config.HF_TOKEN")
        print("Get token from: https://huggingface.co/settings/tokens\n")
    
    # Run demo
    cli_demo()