Upload folder using huggingface_hub
advanced_rag.py  +150 -133  CHANGED
@@ -19,14 +19,13 @@ from langchain_community.retrievers import BM25Retriever
 from langchain.retrievers import EnsembleRetriever
 from langchain.prompts import ChatPromptTemplate
 from langchain.schema import StrOutputParser, Document
-from langchain_core.runnables import RunnableParallel,
+from langchain_core.runnables import RunnableParallel, RunnableLambda
 from transformers.quantizers.auto import AutoQuantizationConfig
 import gradio as gr
 import requests
 
 # Add Mistral imports with fallback handling
 try:
-    # Try importing from the latest package structure
     from mistralai import Mistral
     MISTRAL_AVAILABLE = True
     debug_print = lambda msg: print(f"[{datetime.datetime.now().isoformat()}] {msg}")
@@ -36,14 +35,13 @@ except ImportError:
     debug_print = lambda msg: print(f"[{datetime.datetime.now().isoformat()}] {msg}")
     debug_print("Mistral client library not found. Install with: pip install mistralai")
 
-# Debug print function (already defined above in the try block)
 def debug_print(message: str):
     print(f"[{datetime.datetime.now().isoformat()}] {message}")
 
 def word_count(text: str) -> int:
     return len(text.split())
 
-# Initialize tokenizer for counting
+# Initialize a tokenizer for token counting (using gpt2 as a generic fallback)
 def initialize_tokenizer():
     try:
         return AutoTokenizer.from_pretrained("gpt2")
@@ -61,7 +59,20 @@ def count_tokens(text: str) -> int:
         return len(text.split())
     return len(text.split())
 
-
+def truncate_prompt(prompt: str, max_tokens: int = 4096) -> str:
+    if global_tokenizer:
+        try:
+            tokens = global_tokenizer.encode(prompt)
+            if len(tokens) > max_tokens:
+                tokens = tokens[-max_tokens:]  # keep the last max_tokens tokens
+            return global_tokenizer.decode(tokens)
+        except Exception as e:
+            debug_print("Truncation error: " + str(e))
+    words = prompt.split()
+    if len(words) > max_tokens:
+        return " ".join(words[-max_tokens:])
+    return prompt
+
 default_prompt = """\
 {conversation_history}
 Use the following context to provide a detailed technical answer to the user's question.
@@ -75,7 +86,6 @@ User's question:
 {question}
 """
 
-# Helper function to load TXT files from URL with error checking
 def load_txt_from_url(url: str) -> Document:
     response = requests.get(url)
     if response.status_code == 200:
@@ -86,18 +96,10 @@ def load_txt_from_url(url: str) -> Document:
     else:
         raise Exception(f"Failed to load {url} with status {response.status_code}")
 
-
 class ElevatedRagChain:
     def __init__(self, llm_choice: str = "Meta-Llama-3", prompt_template: str = default_prompt,
                  bm25_weight: float = 0.6, temperature: float = 0.5, top_p: float = 0.95) -> None:
         debug_print(f"Initializing ElevatedRagChain with model: {llm_choice}")
-
-        # Check for required API keys based on model choice
-        if "mistral-api" in llm_choice.lower() and not os.environ.get("MISTRAL_API_KEY"):
-            debug_print("WARNING: Mistral API selected but MISTRAL_API_KEY environment variable not set")
-            if not MISTRAL_AVAILABLE:
-                debug_print("WARNING: Mistral API package not installed. Install with: pip install mistralai")
-
         self.embed_func = HuggingFaceEmbeddings(
             model_name="sentence-transformers/all-MiniLM-L6-v2",
             model_kwargs={"device": "cpu"}
@@ -110,29 +112,45 @@ class ElevatedRagChain:
         self.top_p = top_p
         self.prompt_template = prompt_template
         self.context = ""
-        self.conversation_history: List[Dict[str, str]] = []
+        self.conversation_history: List[Dict[str, str]] = []
+        self.raw_data = None
+        self.split_data = None
+        self.elevated_rag_chain = None
+
+    # Instance method to capture context and conversation history
+    def capture_context(self, result):
+        self.context = "\n".join([str(doc) for doc in result["context"]])
+        result["context"] = self.context
+        history_text = (
+            "\n".join([f"Q: {conv['query']}\nA: {conv['response']}" for conv in self.conversation_history])
+            if self.conversation_history else ""
+        )
+        result["conversation_history"] = history_text
+        return result
+
+    # Instance method to extract question from input data
+    def extract_question(self, input_data):
+        return input_data["question"]
 
     def create_llm_pipeline(self):
-
+        normalized = self.llm_choice.lower()
+        if "remote" in normalized:
             debug_print("Creating remote Meta-Llama-3 pipeline via Hugging Face Inference API...")
             from huggingface_hub import InferenceClient
             repo_id = "meta-llama/Meta-Llama-3-8B-Instruct"
             hf_api_token = os.environ.get("HF_API_TOKEN")
             if not hf_api_token:
                 raise ValueError("Please set the HF_API_TOKEN environment variable to use remote inference.")
-            client = InferenceClient(token=hf_api_token)
-
+            client = InferenceClient(token=hf_api_token, timeout=180)
             def remote_generate(prompt: str) -> str:
                 response = client.text_generation(
                     prompt,
                     model=repo_id,
-                    # max_new_tokens=512,
                     temperature=self.temperature,
                     top_p=self.top_p,
                     repetition_penalty=1.1
                 )
                 return response
-
             from langchain.llms.base import LLM
             class RemoteLLM(LLM):
                 @property
@@ -145,76 +163,94 @@ class ElevatedRagChain:
                 return {"model": repo_id}
             debug_print("Remote Meta-Llama-3 pipeline created successfully.")
             return RemoteLLM()
-        elif "mistral-api" in
+        elif "mistral-api" in normalized:
             debug_print("Creating Mistral API pipeline...")
-
             mistral_api_key = os.environ.get("MISTRAL_API_KEY")
             if not mistral_api_key:
                 raise ValueError("Please set the MISTRAL_API_KEY environment variable to use Mistral API.")
-
             if not MISTRAL_AVAILABLE:
                 raise ImportError("Mistral client library not installed. Install with: pip install mistralai")
-
-            # Initialize the Mistral client with latest API
-            mistral_client = Mistral(api_key=mistral_api_key)
-
-            # Define the model to use - updated to match current model names
-            mistral_model = "mistral-small-latest"
-
             from langchain.llms.base import LLM
             class MistralLLM(LLM):
                 temperature: float = 0.7
                 top_p: float = 0.95
-
+                _client: Any = None
                 def __init__(self, api_key: str, temperature: float = 0.7, top_p: float = 0.95):
-                    super().__init__()
-                    self.
+                    super().__init__()
+                    self._client = Mistral(api_key=api_key)
                     self.temperature = temperature
                     self.top_p = top_p
-
                 @property
                 def _llm_type(self) -> str:
                     return "mistral_llm"
-
                 def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
-                    response = self.
-                        model="mistral-small-latest",
+                    response = self._client.chat.complete(
+                        model="mistral-small-latest",
                         messages=[{"role": "user", "content": prompt}],
                         temperature=self.temperature,
                         top_p=self.top_p,
                         max_tokens=512
                     )
                     return response.choices[0].message.content
-
                 @property
                 def _identifying_params(self) -> dict:
                     return {"model": "mistral-small-latest"}
-
-            # Initialize and return the MistralLLM instance
             mistral_llm = MistralLLM(api_key=mistral_api_key, temperature=self.temperature, top_p=self.top_p)
             debug_print("Mistral API pipeline created successfully.")
             return mistral_llm
-
         else:
+            # Default branch: assume Llama
             model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
-
-
-
-            model_id = "gemini/flash-1.5"
-        elif "mistralai" in self.llm_choice.lower():
-            model_id = "mistralai/Mistral-Small-24B-Instruct-2501"
-
+            extra_kwargs = {}
+            if "llama" in normalized or model_id.startswith("meta-llama"):
+                extra_kwargs["max_length"] = 4096
             pipe = pipeline(
                 "text-generation",
                 model=model_id,
                 model_kwargs={"torch_dtype": torch.bfloat16},
-                max_length=4096,
                 do_sample=True,
                 temperature=self.temperature,
                 top_p=self.top_p,
-                device=-1
+                device=-1,
+                **extra_kwargs
             )
-
+            from langchain.llms.base import LLM
+            class LocalLLM(LLM):
+                @property
+                def _llm_type(self) -> str:
+                    return "local_llm"
+                def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
+                    return pipe(prompt)[0]["generated_text"]
+                @property
+                def _identifying_params(self) -> dict:
+                    return {"model": model_id, "max_length": extra_kwargs.get("max_length")}
+            debug_print("Local Llama pipeline created successfully with max_length=4096.")
+            return LocalLLM()
+
+    def update_llm_pipeline(self, new_model_choice: str, temperature: float, top_p: float, prompt_template: str, bm25_weight: float):
+        debug_print(f"Updating chain with new model: {new_model_choice}")
+        self.llm_choice = new_model_choice
+        self.temperature = temperature
+        self.top_p = top_p
+        self.prompt_template = prompt_template
+        self.bm25_weight = bm25_weight
+        self.faiss_weight = 1.0 - bm25_weight
+        self.llm = self.create_llm_pipeline()
+        def format_response(response: str) -> str:
+            input_tokens = count_tokens(self.context + self.prompt_template)
+            output_tokens = count_tokens(response)
+            formatted = f"### Response\n\n{response}\n\n---\n"
+            formatted += f"- **Input tokens:** {input_tokens}\n"
+            formatted += f"- **Output tokens:** {output_tokens}\n"
+            formatted += f"- **Generated using:** {self.llm_choice}\n"
+            formatted += f"\n**Conversation History:** {len(self.conversation_history)} conversation(s) considered.\n"
+            return formatted
+        base_runnable = RunnableParallel({
+            "context": RunnableLambda(self.extract_question) | self.ensemble_retriever,
+            "question": RunnableLambda(self.extract_question)
+        }) | self.capture_context
+        self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
+        debug_print("Chain updated successfully with new LLM pipeline.")
 
     def add_pdfs_to_vectore_store(self, file_links: List[str]) -> None:
         debug_print(f"Processing files using {self.llm_choice}")
@@ -222,7 +258,6 @@
         for link in file_links:
             if link.lower().endswith(".pdf"):
                 debug_print(f"Loading PDF: {link}")
-                # Ensure that the PDF loader returns a non-empty list.
                 loaded_docs = OnlinePDFLoader(link).load()
                 if loaded_docs:
                     self.raw_data.append(loaded_docs[0])
@@ -236,79 +271,49 @@
                 debug_print(f"Error loading TXT file {link}: {e}")
             else:
                 debug_print(f"File type not supported for URL: {link}")
-
         if not self.raw_data:
             raise ValueError("No files were successfully loaded. Please check the URLs and file formats.")
-
         debug_print("Files loaded successfully.")
-
         debug_print("Starting text splitting...")
         self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=100)
         self.split_data = self.text_splitter.split_documents(self.raw_data)
         if not self.split_data:
             raise ValueError("Text splitting resulted in no chunks. Check the file contents.")
         debug_print(f"Text splitting completed. Number of chunks: {len(self.split_data)}")
-
         debug_print("Creating BM25 retriever...")
         self.bm25_retriever = BM25Retriever.from_documents(self.split_data)
         self.bm25_retriever.k = self.top_k
         debug_print("BM25 retriever created.")
-
         debug_print("Embedding chunks and creating FAISS vector store...")
        self.vector_store = FAISS.from_documents(self.split_data, self.embed_func)
         self.faiss_retriever = self.vector_store.as_retriever(search_kwargs={"k": self.top_k})
         debug_print("FAISS vector store created successfully.")
-
-        ensemble = EnsembleRetriever(
+        self.ensemble_retriever = EnsembleRetriever(
             retrievers=[self.bm25_retriever, self.faiss_retriever],
             weights=[self.bm25_weight, self.faiss_weight]
         )
-
-        def capture_context(result):
-            # Convert each Document to a string and update the context.
-            self.context = "\n".join([str(doc) for doc in result["context"]])
-            result["context"] = self.context
-            # Add conversation_history from self.conversation_history (if any) as a string.
-            history_text = (
-                "\n".join([f"Q: {conv['query']}\nA: {conv['response']}" for conv in self.conversation_history])
-                if self.conversation_history else ""
-            )
-            result["conversation_history"] = history_text
-            return result
-
-        def extract_question(input_data):
-            # Expecting input_data to be a dict with a key "question"
-            return input_data["question"]
-
-        # Build the chain so that the ensemble (BM25 + FAISS) gets only the question string.
         base_runnable = RunnableParallel({
-            "context": RunnableLambda(extract_question) |
-            "question": RunnableLambda(extract_question)
-        }) | capture_context
-
+            "context": RunnableLambda(self.extract_question) | self.ensemble_retriever,
+            "question": RunnableLambda(self.extract_question)
+        }) | self.capture_context
         self.rag_prompt = ChatPromptTemplate.from_template(self.prompt_template)
         self.str_output_parser = StrOutputParser()
         debug_print("Selecting LLM pipeline based on choice: " + self.llm_choice)
         self.llm = self.create_llm_pipeline()
-
         def format_response(response: str) -> str:
             input_tokens = count_tokens(self.context + self.prompt_template)
             output_tokens = count_tokens(response)
-            # Format the response as Markdown for better visual rendering
             formatted = f"### Response\n\n{response}\n\n---\n"
             formatted += f"- **Input tokens:** {input_tokens}\n"
             formatted += f"- **Output tokens:** {output_tokens}\n"
             formatted += f"- **Generated using:** {self.llm_choice}\n"
-            # Append conversation history summary
             formatted += f"\n**Conversation History:** {len(self.conversation_history)} conversation(s) considered.\n"
             return formatted
-
         self.elevated_rag_chain = base_runnable | self.rag_prompt | self.llm | format_response
         debug_print("Elevated RAG chain successfully built and ready to use.")
-
+
     def get_current_context(self) -> str:
-
-        base_context = "\n".join([str(doc) for doc in self.split_data[:3]]) if hasattr(self, "split_data") and self.split_data else "No context available."
+        base_context = "\n".join([str(doc) for doc in self.split_data[:3]]) if self.split_data else "No context available."
         history_summary = "\n\n---\n**Recent Conversations (last 3):**\n"
         recent = self.conversation_history[-3:]
         if recent:
@@ -332,23 +337,33 @@ def load_pdfs_updated(file_links, model_choice, prompt_template, bm25_weight, te
     try:
         links = [link.strip() for link in file_links.split("\n") if link.strip()]
         global rag_chain
-        rag_chain
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if rag_chain.raw_data:
+            rag_chain.update_llm_pipeline(model_choice, temperature, top_p, prompt_template, bm25_weight)
+            context_display = rag_chain.get_current_context()
+            response_msg = f"Files already loaded. Chain updated with model: {model_choice}"
+            return (
+                response_msg,
+                f"Word count: {word_count(rag_chain.context)}",
+                f"Model used: {rag_chain.llm_choice}",
+                f"Context:\n{context_display}"
+            )
+        else:
+            rag_chain = ElevatedRagChain(
+                llm_choice=model_choice,
+                prompt_template=prompt_template,
+                bm25_weight=bm25_weight,
+                temperature=temperature,
+                top_p=top_p
+            )
+            rag_chain.add_pdfs_to_vectore_store(links)
+            context_display = rag_chain.get_current_context()
+            response_msg = f"Files loaded successfully. Using model: {model_choice}"
+            return (
+                response_msg,
+                f"Word count: {word_count(rag_chain.context)}",
+                f"Model used: {rag_chain.llm_choice}",
+                f"Context:\n{context_display}"
+            )
     except Exception as e:
         error_msg = traceback.format_exc()
         debug_print("Could not load files. Error: " + error_msg)
@@ -359,6 +374,16 @@ def load_pdfs_updated(file_links, model_choice, prompt_template, bm25_weight, te
         "Context: N/A"
     )
 
+def update_model(new_model: str):
+    global rag_chain
+    if rag_chain and rag_chain.raw_data:
+        rag_chain.update_llm_pipeline(new_model, rag_chain.temperature, rag_chain.top_p,
+                                      rag_chain.prompt_template, rag_chain.bm25_weight)
+        debug_print(f"Model updated to {rag_chain.llm_choice}")
+        return f"Model updated to: {rag_chain.llm_choice}"
+    else:
+        return "No files loaded; please load files first."
+
 def submit_query_updated(query):
     debug_print("Inside submit_query function.")
     if not query:
@@ -366,20 +391,15 @@ def submit_query_updated(query):
         return "Please enter a non-empty query", "Word count: 0", f"Model used: {rag_chain.llm_choice}", ""
     if hasattr(rag_chain, 'elevated_rag_chain'):
         try:
-
-            history_text = ""
-            if rag_chain.conversation_history:
-                history_text = "\n".join([f"Q: {conv['query']}\nA: {conv['response']}" for conv in rag_chain.conversation_history])
-
-            # Build the prompt variables dictionary for the chain.
+            history_text = "\n".join([f"Q: {conv['query']}\nA: {conv['response']}" for conv in rag_chain.conversation_history]) if rag_chain.conversation_history else ""
             prompt_variables = {
                 "conversation_history": history_text,
                 "context": rag_chain.context,
                 "question": query
             }
-
+            if "llama" in rag_chain.llm_choice.lower():
+                prompt_variables["context"] = truncate_prompt(prompt_variables["context"], max_tokens=4096)
             response = rag_chain.elevated_rag_chain.invoke(prompt_variables)
-            # Save the current conversation to history
             rag_chain.conversation_history.append({"query": query, "response": response})
             input_token_count = count_tokens(query)
             output_token_count = count_tokens(response)
@@ -419,11 +439,9 @@ def reset_app_updated():
 # Gradio Interface Setup
 # ----------------------------
 custom_css = """
-
-
-
-    font-weight: bold !important;
-    color: blue !important;
+textarea {
+  overflow-y: scroll !important;
+  max-height: 200px;
 }
 """
 
@@ -435,31 +453,24 @@ with gr.Blocks(css=custom_css) as app:
     - 🇺🇸 Remote Meta-Llama-3
     - 🇪🇺 Mistral-API
 
-    **🔥 Randomness (Temperature):**
+    **🔥 Randomness (Temperature):** Adjusts output predictability.
 
-    **🎯 Word Variety (Top-p):**
+    **🎯 Word Variety (Top-p):** Limits word choices to a set probability percentage.
 
-    **✏️ Prompt Template:** Edit
+    **✏️ Prompt Template:** Edit as desired.
 
-    **📄 File URLs:** Enter one
+    **📄 File URLs:** Enter one URL per line (.pdf or .txt).
 
-    **⚖️ Weight
+    **⚖️ BM25 Weight:** Adjust Lexical vs Semantics.
 
     **🔍 Query:** Enter your query below.
 
-    The response displays the model used, word count, and
-
-    ''')
+    The response displays the model used, word count, and current context (with conversation history).
+    ''')
     with gr.Row():
         with gr.Column():
            model_dropdown = gr.Dropdown(
-                choices=[
-                    "🇺🇸 Remote Meta-Llama-3",
-                    "🇪🇺 Mistral-API"
-                    # "DeepSeek-R1", # Option commented out
-                    # "Gemini Flash 1.5", # Option commented out
-                    # "Mistralai/Mistral-Small-24B-Instruct-2501" # Option commented out
-                ],
+                choices=["🇺🇸 Remote Meta-Llama-3", "🇪🇺 Mistral-API"],
                 value="🇺🇸 Remote Meta-Llama-3",
                 label="Select Model"
             )
@@ -535,6 +546,12 @@ The response displays the model used, word count, and the current context (inclu
         inputs=[],
         outputs=[response_output, context_output, model_output]
     )
+
+    model_dropdown.change(
+        fn=update_model,
+        inputs=model_dropdown,
+        outputs=model_output
+    )
 
 if __name__ == "__main__":
     debug_print("Launching Gradio interface.")
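
A minimal standalone sketch of the tail-truncation idea behind the new truncate_prompt helper in this commit: keep only the most recent max_tokens tokens of an over-long prompt, falling back to a whitespace split when no tokenizer is usable. The function name tail_truncate, the explicit tokenizer argument, and the use of the gpt2 tokenizer (the one initialize_tokenizer loads) are illustrative assumptions, not part of the committed code.

# Sketch: keep only the last `max_tokens` tokens of a prompt, falling back
# to a whitespace-based cut when no tokenizer is available.
from transformers import AutoTokenizer

def tail_truncate(prompt: str, max_tokens: int = 4096, tokenizer=None) -> str:
    if tokenizer is not None:
        try:
            tokens = tokenizer.encode(prompt)
            if len(tokens) > max_tokens:
                tokens = tokens[-max_tokens:]  # keep the most recent context
            return tokenizer.decode(tokens)
        except Exception:
            pass  # fall through to the word-based fallback
    words = prompt.split()
    return " ".join(words[-max_tokens:]) if len(words) > max_tokens else prompt

if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")  # same tokenizer initialize_tokenizer() loads
    print(tail_truncate("question " * 10000, max_tokens=16, tokenizer=tok))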