"""Generate evaluation answers with either a plain LLM or the RAG pipeline.

Both entry points read questions from a JSON file, answer them in batches
(pausing between batches to respect the provider's rate limit), and write
the resulting question/answer pairs to a JSON output file.
"""

import json
import time
from typing import Any

from tqdm import tqdm

from astra import astra_rag_eval
from chroma import search_eval
from llm import groq_chat, CHAT_MODEL

# Prompt preamble asking the model for a short (<= 60 word) answer.
LLM_ANSWER_GEN_TEMPLATE = """\
Generate one brief and informative answer to the following question: {question}. \
The answer should be concise, relevant, and not exceed 60 words in length. """


def generate_responses_llm(questions_file: str,
                           output_file: str,
                           model: CHAT_MODEL = "mixtral-8x7b-32768",
                           batch_size: int = 30,
                           delay_between_batches: int = 10) -> None:
    """Generate responses using the LLM for each question in the input file
    and save them to the output file.

    Args:
        questions_file: Path to a JSON file with a top-level "question" list.
        output_file: Path where question/answer pairs are written as JSON.
        model: Chat model identifier forwarded to ``groq_chat``.
        batch_size: Questions answered between rate-limit pauses.
        delay_between_batches: Seconds slept after each batch.
    """
    responses = []  # List of {"question": ..., "answer": ...} pairs.
    with open(questions_file, 'r') as f_questions:
        data = json.load(f_questions)
    questions = data["question"]
    num_questions = len(questions)
    # Ceiling division so the progress bar also counts a final partial batch.
    num_batches = (num_questions + batch_size - 1) // batch_size
    for i in tqdm(range(0, num_questions, batch_size),
                  desc="Generating responses", total=num_batches):
        for question in questions[i:i + batch_size]:
            # Ask the chat model directly, with the short-answer preamble.
            answer = groq_chat(
                message=question,
                preamble=LLM_ANSWER_GEN_TEMPLATE,
                model=model,
            ).choices[0].message.content
            responses.append({"question": question, "answer": answer})
        # Pause between batches to stay under the provider's rate limit.
        time.sleep(delay_between_batches)

    with open(output_file, 'w') as f_output:
        json.dump(responses, f_output, indent=4)


def generate_responses_rag(questions_file: str,
                           output_file: str,
                           model: CHAT_MODEL = "mixtral-8x7b-32768",
                           batch_size: int = 30,
                           delay_between_batches: int = 10) -> None:
    """Generate responses using the RAG pipeline for each question in the
    input file and save them to the output file.

    Args:
        questions_file: Path to a JSON file containing a list of objects,
            each with a "question" key.
        output_file: Path where question/answer pairs are written as JSON.
        model: Kept for signature parity with ``generate_responses_llm``;
            the RAG path does not currently use it.
        batch_size: Questions answered between rate-limit pauses.
        delay_between_batches: Seconds slept after each batch.
    """
    responses = []  # List of {"question": ..., "answer": ...} pairs.
    with open(questions_file, 'r') as f_questions:
        data = json.load(f_questions)
    num_questions = len(data)
    # Ceiling division so the progress bar also counts a final partial batch.
    num_batches = (num_questions + batch_size - 1) // batch_size
    for i in tqdm(range(0, num_questions, batch_size),
                  desc="Generating responses", total=num_batches):
        for idx, item in enumerate(data[i:i + batch_size]):
            question = item["question"]
            print(question)
            # Retrieve top-3 supporting documents for the question.
            context = search_eval(query=question, k=3,
                                  model_name_or_path="models/bge-large_finetuned")
            if not context:
                # No retrieved documents: fall back to a canned apology.
                answer = ("I'm sorry, I don't have any information on that. "
                          "Feel free to ask me anything else.")
            else:
                answer = astra_rag_eval(
                    prompt=question,
                    context=[result["doc"] for result in context],
                )
            responses.append({"question": question, "answer": answer})
            print(f"{i + idx + 1} questions answered")
        # Pause between batches to stay under the provider's rate limit.
        time.sleep(delay_between_batches)

    with open(output_file, 'w') as f_output:
        json.dump(responses, f_output, indent=4)


if __name__ == "__main__":
    # Guarded so importing this module no longer kicks off a full evaluation.
    generate_responses_rag(
        questions_file='app/evaluations/eval_data/question_answer_pairs-min.json',
        output_file='app/evaluations/eval_data/rag_bge_large_finetuned_response_qa.json',
    )