import os
import json
import time
import requests
import numpy as np
from dotenv import load_dotenv
from sentence_transformers import SentenceTransformer
from together import Together
from openai import OpenAI

# Module-level state, populated by launch_depression_assistant().
# (A module-level `global` statement is a no-op; kept for documentation value.)
global db, referenced_tables_db, embedder, index, llm_client


def load_json_to_db(file_path):
    """Load a JSON chunk database from *file_path* and return the parsed object."""
    with open(file_path) as f:
        db = json.load(f)
    return db


# -------- Embedding Functions --------
def make_embeddings(embedder, embedder_name, db):
    """Encode the 'text' field of every chunk in *db* with *embedder*.

    Returns an (n_chunks, dim) numpy array. batch_size=1 keeps peak memory
    low (the model is loaded on CPU) at the cost of throughput.
    """
    texts = [chunk['text'] for chunk in db]
    embeddings = embedder.encode(
        texts, convert_to_numpy=True, batch_size=1, show_progress_bar=True
    )
    return embeddings


def get_project_root():
    """Return the absolute path of the project root (parent of this file's dir)."""
    return os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))


def save_embeddings(embedder_name, embeddings):
    """Persist *embeddings* to data/embeddings/<sanitized-model-name>.npy."""
    root = get_project_root()
    file_path = os.path.join(
        root, "data", "embeddings", f"{embedder_name.replace('/', '_')}.npy"
    )
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    np.save(file_path, embeddings)
    print(f"Saved embeddings to: {file_path}")


def load_embeddings(embedder_name):
    """Load cached embeddings for *embedder_name*; recompute and cache on miss.

    NOTE(review): the cache-miss fallback reads the module-level globals
    ``embedder`` and ``db``, so launch_depression_assistant() must have
    initialized them first — confirm call order before reusing standalone.
    """
    root = get_project_root()
    file_path = os.path.join(
        root, "data", "embeddings", f"{embedder_name.replace('/', '_')}.npy"
    )
    try:
        embeddings = np.load(file_path, allow_pickle=True)
        print(f"Loaded embeddings from: {file_path}")
    except FileNotFoundError:
        print(f"Embeddings not found. Recomputing for: {embedder_name}")
        embeddings = make_embeddings(embedder, embedder_name, db)
        save_embeddings(embedder_name, embeddings)
    return embeddings


def load_embedder_with_fallbacks(embedder_name):
    """Instantiate a SentenceTransformer on CPU with left padding."""
    print(f"Loading embedder {embedder_name}")
    model = SentenceTransformer(
        embedder_name,
        trust_remote_code=True,
        tokenizer_kwargs={"padding_side": "left"},
        device='cpu'
    )
    return model


# -------- Cosine Similarity Index (no FAISS) --------
def build_cosine_index(embeddings):
    """L2-normalize each row so plain dot products become cosine similarities.

    Fix: zero-norm rows are divided by 1 instead of 0 — a single degenerate
    embedding would otherwise turn into NaNs and poison every search.
    """
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    return embeddings / norms


def load_cosine_index(embedder_name):
    """Load (or build) the embeddings for *embedder_name* and normalize them."""
    embeddings = load_embeddings(embedder_name)
    normalized_embeddings = build_cosine_index(embeddings)
    return normalized_embeddings


# -------- Cosine Similarity Search (Brute Force) --------
def vector_search(query, embedder, db, index, referenced_table_db, k=6):
    """Return the top-*k* chunks most similar to *query*, plus referenced tables.

    Chunks whose metadata lists referenced tables pull in the matching table
    chunks from *referenced_table_db*, unless that table already appeared in
    the top-k results.
    """
    def get_detailed_instruct(task_description: str, query: str) -> str:
        # NOTE(review): no space after "Query:" — looks unintentional for an
        # instruct-style embedder, but changing it would invalidate the
        # cached embeddings; kept byte-identical.
        return f'Instruct: {task_description}\nQuery:{query}'

    task = 'Given a search query, retrieve relevant passages that answer the query'
    query_embedding = embedder.encode(
        [get_detailed_instruct(task, query)], convert_to_numpy=True
    )
    query_vec = query_embedding / np.linalg.norm(query_embedding)
    cosine_similarities = np.dot(index, query_vec.T).flatten()
    top_k_indices = np.argsort(-cosine_similarities)[:k]

    results = []
    referenced_tables = set()
    existed_tables = set()
    for i in top_k_indices:
        results.append({
            "text": db[i]['text'],
            "section": db[i]['metadata']['section'],
            "chunk_id": db[i]['metadata']['chunk_id'],
            "similarity": float(cosine_similarities[i]),
        })
        if db[i]['metadata']['referee_id']:
            existed_tables.add(db[i]['metadata']['referee_id'])
        # 'referenced_tables' is optional metadata; .get avoids the broad
        # try/except that previously also masked errors inside update().
        refs = db[i]['metadata'].get('referenced_tables')
        if refs:
            referenced_tables.update(refs)

    # Append referenced table chunks that were not already retrieved.
    table_to_add = [t for t in referenced_tables if t not in existed_tables]
    for chunk in referenced_table_db:
        if chunk['metadata']['referee_id'] in table_to_add:
            results.append({
                "text": chunk['text'],
                "section": chunk['metadata']['section'],
                "chunk_id": chunk['metadata']['chunk_id'],
            })
    return results


def load_together_llm_client():
    """Build a Together client from TOGETHER_API_KEY in the environment/.env."""
    load_dotenv()
    return Together(api_key=os.getenv("TOGETHER_API_KEY"))


def load_nvidia_llm_client():
    """Build an OpenAI-compatible client pointed at the NVIDIA NIM endpoint."""
    load_dotenv()
    return OpenAI(
        base_url="https://integrate.api.nvidia.com/v1",
        api_key=os.getenv("NVIDIA_API_KEY"),
    )


# -------- Prompt Construction --------
def construct_prompt(query, faiss_results):
    """Assemble system prompt + user query + retrieved context into one string."""
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()
    prompt = f"""
### System Prompt
{system_prompt}

### User Query
{query}

### Clinical Guidelines Context
"""
    for res in faiss_results:
        prompt += f"- reference: {res['section']}\n- This paragraph is from section: {res['text']}\n"
    return prompt


def construct_prompt_with_memory(query, faiss_results, chat_history=None, history_limit=4):
    """Like construct_prompt, but prepends up to *history_limit* chat turns."""
    with open("src/system_prompt.txt", "r") as f:
        system_prompt = f.read().strip()
    prompt = f"### System Prompt\n{system_prompt}\n\n"
    if chat_history:
        prompt += "### Chat History\n"
        for m in chat_history[-history_limit:]:
            prompt += f"{m['role'].title()}: {m['content']}\n"
        prompt += "\n"
    prompt += f"### User Query\n{query}\n\n"
    prompt += "### Clinical Guidelines Context\n"
    for res in faiss_results:
        prompt += f"- reference: {res['section']}\n- This paragraph is from section: {res['text']}\n"
    return prompt


def call_llm(llm_client, prompt, stream_flag=False, max_tokens=500, temperature=0.05,
             top_p=0.9, model_name="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"):
    """Call a Together-style chat completion.

    Returns a generator of text deltas when *stream_flag* is True, otherwise
    the complete response string. Exceptions are logged and re-raised.
    """
    print(f"Calling LLM with model: {model_name}")
    try:
        if stream_flag:
            def stream_generator():
                response = llm_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    max_tokens=max_tokens,
                    temperature=temperature,
                    top_p=top_p,
                    stream=True,
                )
                for chunk in response:
                    if chunk.choices and chunk.choices[0].delta.content:
                        yield chunk.choices[0].delta.content
            return stream_generator()
        else:
            response = llm_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stream=False,
            )
            return response.choices[0].message.content
    except Exception as e:
        print("Error in call_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise


def call_nvidia_llm(llm_client, prompt, stream_flag=False, max_tokens=4096,
                    temperature=0.6, top_p=0.7, model_name="openai/gpt-oss-20b"):
    """Call the NVIDIA NIM chat completion endpoint; mirrors call_llm's contract."""
    print(f"Calling NVIDIA LLM with model: {model_name}")
    try:
        if stream_flag:
            def stream_generator():
                completion = llm_client.chat.completions.create(
                    model=model_name,
                    messages=[{"role": "user", "content": prompt}],
                    temperature=temperature,
                    top_p=top_p,
                    max_tokens=max_tokens,
                    stream=True
                )
                for chunk in completion:
                    if chunk.choices[0].delta.content is not None:
                        yield chunk.choices[0].delta.content
            return stream_generator()
        else:
            completion = llm_client.chat.completions.create(
                model=model_name,
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
                top_p=top_p,
                max_tokens=max_tokens,
                stream=False
            )
            return completion.choices[0].message.content
    except Exception as e:
        print("Error in call_nvidia_llm:", str(e))
        import traceback
        traceback.print_exc()
        raise


def call_ollama(prompt, model="mistral", stream_flag=False, max_tokens=500,
                temperature=0.05, top_p=0.9):
    """Stream a generation from a local Ollama server, yielding text pieces.

    NOTE(review): *stream_flag* is accepted but unused — the payload always
    requests a stream and this function is always a generator. Kept as-is so
    the call sites that pass stream_flag keep working.
    """
    url = "http://localhost:11434/api/generate"
    payload = {
        "model": model,
        "prompt": prompt,
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        "stream": True
    }
    with requests.post(url, json=payload, stream=True) as response:
        for line in response.iter_lines():
            if line:
                # Tolerate malformed/partial JSON lines in the stream.
                try:
                    data = json.loads(line.decode("utf-8"))
                    yield data["response"]
                except Exception:
                    continue


# -------- Main Assistant Entry Points --------
def launch_depression_assistant(embedder_name, designated_client=None):
    """Initialize module-level state: chunk DBs, embedder, index, LLM client.

    Tries the NVIDIA client first, then Together as fallback, unless a
    *designated_client* is supplied. llm_client may end up None, in which
    case depression_assistant() will raise on use.
    """
    global db, referenced_tables_db, embedder, index, llm_client
    db = load_json_to_db("data/processed/guideline_db.json")
    referenced_tables_db = load_json_to_db("data/processed/referenced_table_chunks.json")
    embedder = load_embedder_with_fallbacks(embedder_name)
    index = load_cosine_index(embedder_name)

    if designated_client is None:
        print("Attempting to load NVIDIA LLM client...")
        try:
            llm_client = load_nvidia_llm_client()
            print("Successfully loaded NVIDIA LLM client.")
        except Exception as e:
            print(f"Failed to load NVIDIA LLM client: {e}")
            print("Attempting to load Together LLM client as a fallback...")
            try:
                llm_client = load_together_llm_client()
                print("Successfully loaded Together LLM client.")
            except Exception as e:
                print(f"Failed to load Together LLM client: {e}")
                llm_client = None
    else:
        llm_client = designated_client
        print(f"Using designated client: {type(llm_client).__name__}")

    if llm_client is None:
        print("Warning: No LLM client could be loaded. The assistant will not be able to generate responses.")
    print("---------Depression Assistant is ready to use!--------------\n\n")


def depression_assistant(query, model_name=None, max_tokens=None, temperature=None,
                         top_p=None, stream_flag=False, chat_history=None):
    """Retrieve context for *query* and dispatch to the configured LLM backend.

    Returns (retrieved_results, llm_response). Raises ValueError when no LLM
    client was initialized.
    """
    results = vector_search(query, embedder, db, index, referenced_tables_db, k=3)
    prompt = construct_prompt_with_memory(query, results, chat_history=chat_history)

    # Only forward explicitly-set overrides; `is not None` (fix: was truthiness
    # for max_tokens/top_p, which silently dropped 0-valued overrides).
    kwargs = {}
    if model_name:
        kwargs['model_name'] = model_name
    if max_tokens is not None:
        kwargs['max_tokens'] = max_tokens
    if temperature is not None:
        kwargs['temperature'] = temperature
    if top_p is not None:
        kwargs['top_p'] = top_p

    if llm_client == "Run Ollama Locally":
        # call_ollama names its model parameter 'model', not 'model_name'.
        if 'model_name' in kwargs:
            kwargs['model'] = kwargs.pop('model_name')
        return results, call_ollama(prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, OpenAI):  # NVIDIA Client
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    elif isinstance(llm_client, Together):  # Together Client
        return results, call_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)
    else:
        if llm_client is None:
            raise ValueError("LLM client not initialized. Please check API keys.")
        # Fallback to NVIDIA as requested
        return results, call_nvidia_llm(llm_client, prompt, stream_flag=stream_flag, **kwargs)


def load_queries_and_answers(query_file, answers_file):
    """Read evaluation queries and gold answers, one per line (newlines kept)."""
    with open(query_file, 'r') as f:
        queries = f.readlines()
    with open(answers_file, 'r') as f:
        answers = f.readlines()
    return queries, answers


def write_batched_results(embedder_name, result_path):
    """Run the assistant over the eval queries and write two markdown reports.

    Writes retrieved chunks and LLM responses to separate files under
    *result_path*. Currently stops after the first query (deliberate `break`).
    """
    launch_depression_assistant(embedder_name)
    queries, answers = load_queries_and_answers("data/raw/queries.txt", "data/raw/answers.txt")
    embedder_filename = embedder_name.replace('/', '_')
    with open(f"{result_path}Retrieved_Results_by_{embedder_filename}.md", "w") as f1, \
         open(f"{result_path}Response_by_{embedder_filename}.md", "w") as f2:
        for i, query in enumerate(queries):
            result, response = depression_assistant(query)
            f1.write(f"## Query {i+1}\n{query.strip()}\n\n## Answer\n{answers[i].strip()}\n\n## Retrieved Results\n")
            for res in result:
                f1.write(f"\n\n#### {res['section']}\n\n{res['text']}\n")
            f1.write("\n\n---\n\n")
            f2.write(f"## Query {i+1}\n{query.strip()}\n\n## Answer\n{answers[i].strip()}\n\n## Response\n{response}\n\n---\n\n")
            break  # remove this `break` if you want to process all queries