# -*- coding: utf-8 -*-
"""app.py

CitizenClimate: RAG for climate citizen assemblies.

Master module for the pipeline: wires together the embedding model, the
ChromaDB vector store, the vLLM-served chat model, interaction logging and
the Gradio UI. The helper .py files also live in their own individual files
so they can run independently; everything here is meant to be callable as
functions in a smooth pipeline.
"""

# ---- Package loading -------------------------------------------------------
# LangChain / Transformers stack
#import langchain
#import

# HuggingFace integrations
from langchain_huggingface import HuggingFacePipeline  # simple pipeline wrapper
from langchain_huggingface.embeddings import HuggingFaceEmbeddings  # embeddings
from langchain_huggingface.embeddings import HuggingFaceEndpointEmbeddings  # endpoint-backed embeddings
from transformers import AutoModelForCausalLM, AutoTokenizer,pipeline  # raw HF pipelines
from transformers import BitsAndBytesConfig  # quantisation config
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint  # chat templates

# Text loading and ingestion (LlamaIndex)
from llama_index.core import SimpleDirectoryReader  # loads documents
from llama_index.core import Document  # document objects created by the reader
from llama_index.embeddings.huggingface import HuggingFaceEmbedding  # embedding wrapper actually used below
from llama_index.core.node_parser import SentenceSplitter  # splits the text into sentences
from llama_index.core.ingestion import IngestionPipeline  # ingestion pipeline

# Vector storage and retrieval
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore  # stores the embeddings
from llama_index.core import VectorStoreIndex  # stores the index
from llama_index.core import StorageContext, load_index_from_storage, Settings
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters
from llama_index.core.memory import ChatSummaryMemoryBuffer, Memory  # bounded chat memory
from llama_index.core.indices.prompt_helper import PromptHelper  # token budgeting for prompts

# LlamaIndex LLM wrappers
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI  # unused now; kept in case removal breaks something
from llama_index.llms.huggingface import HuggingFaceLLM  # likewise kept for safety
from llama_index.core import ChatPromptTemplate
from huggingface_hub import CommitScheduler  # periodically commits the log folder to a Hub dataset

# Local reranker used to improve retrieval outputs
from reranker_v1 import rerank_documents

# vLLM client/server plumbing
from llama_index.llms.openai_like import OpenAILike
from llama_index.core import Settings  # NOTE(review): duplicate import — Settings is already imported above
from llama_index.llms.vllm import Vllm, VllmServer
from vllm_server import start_vllm, wait_for_vllm  # local helper module

# Evaluation and debugging
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.callbacks import CallbackManager, LlamaDebugHandler  # debugger
from llama_index.core.callbacks.schema import EventPayload  # payload keys for reading debugger events

# General-purpose packages
import gradio as gr  # I/O front-end
import os
import json
import time
import subprocess
import threading
import requests
from datetime import datetime, timezone
from pathlib import Path
from uuid import uuid4
#from datasets import load_dataset  # dataset upload via the Hub didn't work; files are uploaded directly instead

# vLLM endpoint; queries fail until the server below is up
VLLM_URL = os.getenv("VLLM_URL", "http://localhost:8000/v1")

# Start the vLLM server (background process handle kept for its lifetime)
vllm_process = start_vllm()

# Embedding model — top of the embedding charts as of 28/10/2025.
# NOTE(review): runs on CPU; an 8B embedding model on CPU may be slow — confirm this is intended.
embedding_model = HuggingFaceEmbedding(model_name = 'Qwen/Qwen3-Embedding-8B', device= 'cpu')

# Sentence-level ChromaDB store for the climate document collection.
# Ingestion is not re-run here; the collection is assumed to be pre-populated
# (re-ingest only when new documents arrive).
climate_repo_path = ('./Working_Climate_Database_Sentence')
clim_db_sentence = chromadb.PersistentClient(path=climate_repo_path)
climate_collection_sentence = clim_db_sentence.get_or_create_collection('CitizenClimate_Sentence')
vector_store_sentence = ChromaVectorStore(chroma_collection = climate_collection_sentence)

"""## Retrival logic and prompt engineering

Let's make sure that it can load the model, then test out the prompts.
Then we can make this a pipeline.
"""

# Chat model served by vLLM through its OpenAI-compatible API
query_model = OpenAILike(
    model='kosbu/Llama-3.3-70B-Instruct-AWQ',
    api_base='http://localhost:8000/v1',
    api_key='dummy',# vLLM doesn't require real API key shooketh
    max_tokens = 2048,
    temperature = 0.5,
    is_chat_model = True
)

# Storage locations consumed by initialize_query_engine()
CHROMA_DB_PATH = './Working_Climate_Database_Sentence' # The path where Chroma's data is stored
COLLECTION_NAME = 'CitizenClimate_Sentence' # The name of one of many existing Chroma collections

# Interaction log: one JSONL file per process, pushed to the Hub by the scheduler
log_files = Path('chatbot_interactions')
log_files.mkdir(parents=True, exist_ok=True)
log_files_path = log_files / f"train-{uuid4()}.jsonl"
scheduler = CommitScheduler(
    repo_id = 'CitizenClimate',
    repo_type = 'dataset',
    folder_path= str(log_files_path.parent),
    path_in_repo = 'data',
)

# Module-global so data_collection_log() can read condensed queries out of the
# debug events recorded during initialize_query_engine()'s callback setup.
debug_handler = None
def initialize_query_engine():
    """Connect to ChromaDB, load the index, and create the chat engine.

    Side effects: installs a global LlamaDebugHandler into Settings.callback_manager
    (read later by data_collection_log). Raises ValueError if the Chroma
    collection cannot be reached.
    """
    print(f'Connecting to ChromaDB at: {CHROMA_DB_PATH}...')
    # Debug handler is stored globally so the logger can inspect LLM events later
    global debug_handler
    debug_handler = LlamaDebugHandler(print_trace_on_end=True)
    callback_manager = CallbackManager([debug_handler])
    Settings.callback_manager = callback_manager
    # PersistentClient because the DB is saved locally
    db = chromadb.PersistentClient(path=CHROMA_DB_PATH)
    # Get the existing (pre-populated) collection
    try:
        chroma_collection = db.get_collection(COLLECTION_NAME)
    except Exception as e:
        raise ValueError(
            f"Could not find or connect to Chroma collection '{COLLECTION_NAME}'. "
            f'Error: {e}'
        )
    # Rebuild the vector store and index views over the existing collection
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embedding_model)
    # Memory buffer: summarises the chat beforehand so memory stays bounded
    memory = ChatSummaryMemoryBuffer.from_defaults(
        llm=query_model, # we summarise with our own model
        token_limit=800,
        count_initial_tokens=True)
    # PromptHelper budgets the token usage of each request
    prompt_helper = PromptHelper(
        context_window=4096, # matches the vLLM server configuration
        num_output=512, # reserve space for the response
        chunk_overlap_ratio=0.1, # overlap between repacked chunks (per the docs)
        chunk_size_limit=None)
    # Retrieval and system-prompt configuration
    RETRIEVAL_TOP_K = 4
    KIRKLEES_SYSTEM_PROMPT = ('You are a friendly, intelligent chatbot designed to answer user questions about climate issues and solutions in Kirklees. Ensure your response is viable in Kirklees unless it is a general query. Keep responses clear, factual, and easy to understand. Refer to the documents as your knowledge base e.g. According to my database, instead of according to the documents.')
    # Create the chat engine (condense_plus_context: rewrites the question, then retrieves)
    query_engine = index.as_chat_engine(
        llm=query_model,
        verbose=True,
        memory=memory,
        similarity_top_k=RETRIEVAL_TOP_K,
        system_prompt = KIRKLEES_SYSTEM_PROMPT,
        streaming=True,
        prompt_helper=prompt_helper,
        chat_mode='condense_plus_context')
    print('ChromaDB index, callback manager and query engine initialized successfully!')
    return query_engine

def data_collection_log (message, full_response,source_nodes, duration):
    """Save one chat interaction to the JSONL log for later analysis.

    Args:
        message: the raw user query.
        full_response: the complete streamed bot answer.
        source_nodes: retrieved (and reranked) nodes backing the answer; may be None.
        duration: wall-clock latency of the exchange, in seconds.
    """
    # Read the condensed (rewritten) query out of the debug handler's LLM events;
    # fall back to the raw message when no condensation happened.
    global debug_handler
    condensed_query = message
    if debug_handler:
        try:
            llm_events = debug_handler.get_llm_inputs_outputs() # (start, end) event pairs per LLM call
            for event_pair in llm_events:
                if len(event_pair) >= 1:
                    start_event = event_pair[0]
                    if start_event.payload:
                        if EventPayload.MESSAGES in start_event.payload:
                            # Chat-style call: the last message carries the condensed query
                            messages = start_event.payload.get(EventPayload.MESSAGES, [])
                            if messages and hasattr(messages[-1], "content"):
                                condensed_query = messages[-1].content
                                break
                        elif EventPayload.PROMPT in start_event.payload:
                            # Completion-style call: the prompt itself is the condensed query
                            condensed_query = start_event.payload.get(EventPayload.PROMPT, message)
                            break
        except Exception as e:
            print(f'Failed to extract condensed query: {e}')
    # Assemble the log record; getattr guards keep malformed nodes from crashing logging
    log_entry = {
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'user_query': message,
        'condensed_query': condensed_query,
        'bot_response': full_response,
        'latency_seconds': round(duration, 2),
        'sources': [
            {'file': str(getattr(n, 'metadata', {}).get('file_name', 'N/A')),
             'page': str(getattr(n, 'metadata', {}).get('page_label', 'N/A')),
             'score': float(getattr(n, 'score', 0.0) or 0.0),
             'text_chunk': (getattr(n, 'text', '')[:2000] + '...') if getattr(n, 'text', None) else ''
             } for n in (source_nodes or [])
        ]
    }
    # Append under the scheduler's lock (when available) so a Hub commit never
    # snapshots a half-written line
    try:
        if hasattr(scheduler, "lock"):
            with scheduler.lock:
                with log_files_path.open('a', encoding='utf-8') as f:
                    f.write(json.dumps(log_entry) + '\n')
        else:
            with log_files_path.open('a', encoding='utf-8') as f:
                f.write(json.dumps(log_entry) + '\n')
    except Exception as e:
        print(f'Failed to log interaction: {e}')
    # Clear the events after logging so the handler's buffer does not grow unboundedly
    if debug_handler:
        debug_handler.flush_event_logs()

# Initialize the engine globally; None signals the UI to show the failure path
try:
    RAG_CHAT_ENGINE = initialize_query_engine()
except Exception as e:
    print(f'Error during initialization: {e}')
    RAG_CHAT_ENGINE = None

def background_wait():
    """Block until the vLLM server responds, then report readiness.

    Intended to run in a daemon thread before launch (see the commented-out
    threading.Thread call in the __main__ guard).
    """
    wait_for_vllm()
    print('vLLM is ready!!')

def rag_chat_function(message: str, history: list):
    """Main chat function connected to the Gradio interface.

    Args:
        message (str): The user's latest message.
        history (list): The full chat history provided by Gradio.

    Yields:
        str: Progressively longer partial responses while streaming, then the
        final full response (generator — Gradio renders each yield).
    """
    # Guard against initialization failure
    if RAG_CHAT_ENGINE is None:
        yield 'RAG system not ready. Please check setup.'
        return
    # Start the latency clock
    start_time = time.time()
    #sources_text = ''  # kept for the disabled source-display block below
    try:
        # Stream the response back to Gradio token by token
        response = RAG_CHAT_ENGINE.stream_chat(message)
        full_response = ''
        for token in response.response_gen:
            full_response += token
            yield full_response
        # Source nodes (i.e. where the text came from) are only available once
        # streaming completes; rerank them before logging
        source_nodes = response.source_nodes
        if source_nodes:
            source_nodes = rerank_documents(message, source_nodes, top_k=4)
        # Source display removed from the UI to keep responses palatable:
        #if source_nodes:
        #    sources_text = '\n\n**Sources:**\n'
        #    for i, node in enumerate(source_nodes[:5]): # Display top 5 sources
        #        metadata = node.metadata
        #        f_name = str(metadata.get('file_name', 'N/A'))
        #        p_label = str(metadata.get('page_label', 'N/A'))
        #        sources_text += f"- **Source {i+1}**: File: `{f_name}`, Page: `{p_label}`\n"
        # Finalise logging
        duration = time.time() - start_time
        data_collection_log(message, full_response, source_nodes, duration)
        # Yield the final response (sources suppressed, see above)
        yield full_response #+ sources_text
    except Exception as e:
        print(f'An error occurred during query: {e}')
        yield 'Sorry, an error occurred while processing your request. Please try again.'
        return

# Build the Gradio UI only when the engine came up; `demo` is the app entry point
if RAG_CHAT_ENGINE is not None:
    with gr.Blocks() as demo:
        gr.ChatInterface(
            fn=rag_chat_function,
            title='CitizenClimate: Kirklees',
            description = 'This is a chatbot that can be used to answer your climate questions and helps you with practical climate solutions for you and your community.',
            theme=gr.themes.Soft(),
            textbox=gr.Textbox(
                placeholder = 'Enter your question here please...',
                container=False,
                scale=7))
        gr.Markdown('---')
        gr.Markdown('Although care has been taken to ensure that the outputs are as accurate as possible, the system may occasionally produce harmful instructions or biased content and may occasionally generate incorrect information.')
# Fallback UI: the `with gr.Blocks() as demo:` above only runs when the RAG
# engine initialized, so define an error stub otherwise. Previously this
# fallback hung off the __main__ guard's `else`, which (a) left `demo`
# undefined after a failed init and crashed demo.launch() with NameError,
# and (b) clobbered a healthy `demo` with the error stub whenever the
# module was imported (e.g. by a HuggingFace Space runner).
if RAG_CHAT_ENGINE is None:
    demo = gr.Interface(
        fn=lambda x: 'RAG System initialization failed. Check logs.',
        inputs='text',
        outputs='text',
        title='RAG Prototype ERROR')

if __name__ == "__main__":
    #threading.Thread(target=background_wait, daemon=True).start()  # preload vLLM in the background
    # Launch for local testing; `demo` is guaranteed to exist on both paths now.
    demo.launch(server_name="0.0.0.0", server_port=7860)