import re

import faiss
import numpy as np
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from pdfminer.high_level import extract_text
from up_config import Config
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.docstore.in_memory import InMemoryDocstore
# from langchain_ollama import ChatOllama


def _invoke_llm(prompt_text: str, text: str) -> str:
    """Run *text* through the configured Gemini chat model with *prompt_text*.

    Shared plumbing for ``document_description`` and ``doc_summarizer``
    (previously duplicated in both).

    Args:
        prompt_text: A prompt template containing a ``{text}`` placeholder.
        text: The document text to substitute into the template.

    Returns:
        The first non-empty line of the model response, or a fallback
        message when the response contains no non-empty line.
    """
    llm = ChatGoogleGenerativeAI(model=Config.MODEL_NAME)
    # llm = ChatOllama(model="llama3.2:latest")
    summary_prompt = PromptTemplate.from_template(prompt_text)
    chain = summary_prompt | llm | StrOutputParser()
    response = chain.invoke({"text": text})
    # Models sometimes lead with blank lines; keep the first line of substance.
    return next(
        (line for line in response.split('\n') if line.strip()),
        "No description could be generated.",
    )


def document_description(text: str) -> str:
    """Return a one-paragraph, elaborate description of *text* via the LLM."""
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph elaborate description. Text: {text} Description:'''
    return _invoke_llm(prompt_text, text)


def doc_summarizer(text: str) -> str:
    """Return a one-paragraph summary of *text* via the LLM."""
    # NOTE: fixed typo "unessary" -> "unnecessary" in the instruction string.
    prompt_text = '''You are an expert AI assistant specialized in document understanding. Your task is to understand the text and create a general elaboration about the document content, providing a concise, one-paragraph summary. You can remove the unnecessary texts. Respond with only required answers. 
Text: {text} Description:'''
    return _invoke_llm(prompt_text, text)


def text_cleaning(text: str) -> str:
    """Lowercase and scrub extracted PDF text.

    Removes URLs, bracketed/parenthesised citations, simple tags, e-mail
    addresses, control characters, and hyphenated line breaks, then
    collapses whitespace to single spaces.
    """
    new_text = text.lower()
    # BUGFIX (ordering): de-hyphenation and control-char removal must happen
    # BEFORE the whitespace collapse below. In the original these ran after
    # `\s+` -> ' ', by which point no '\n', '\x0b' or '\x0c' remained, so the
    # replacements were dead code.
    new_text = new_text.replace('-\n', '')
    new_text = new_text.replace('\x0b', '').replace('\x0c', '')
    new_text = re.sub(r'https?://\S+|www\.\S+', '', new_text)
    new_text = re.sub(r"\[[a-z0-9,\s]+\]", '', new_text)
    new_text = re.sub(r"\([a-z0-9\s]+\)", '', new_text)
    new_text = re.sub(r"<[a-z0-9]+>", '', new_text)
    # BUGFIX (regex): original pattern `[a-z.]+@[a-z.]+.(...)` left the dot
    # unescaped and its `(...)` group swallowed three arbitrary characters
    # following every address. Use a properly anchored, escaped pattern.
    new_text = re.sub(r"[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}", '', new_text)
    # Replace anything outside the allowed character set with a space.
    new_text = re.sub(r'[^a-z0-9.\s:\\{}_\[\]^,;\'\-+=!@$%&*()]', ' ', new_text)
    new_text = re.sub(r'\s+', ' ', new_text)
    return new_text


def text_processing(file_path: str) -> list[str]:
    """Extract text from the PDF at *file_path*, clean it, and chunk it.

    Returns a list of whitespace-normalised, stripped text chunks suitable
    for embedding.
    """
    text = extract_text(file_path)
    docs = text_cleaning(text)
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=Config.CHUNK_SIZE,
        chunk_overlap=Config.CHUNK_OVERLAP,
        add_start_index=True,
        separators=["\n\n", "\n", ".", " "],
    )
    raw_docs = splitter.split_text(docs)
    return [re.sub(r'\s+', ' ', doc).strip() for doc in raw_docs]


def ingest_into_vector_db_hnsw(file_path: str):
    """Ingest a PDF into an in-memory FAISS HNSW vector store.

    Extracts and chunks the document, generates an LLM summary of it,
    embeds the chunks, and builds a FAISS ``IndexHNSWFlat`` wrapped in a
    LangChain vectorstore.

    Args:
        file_path: Path to the PDF file to ingest.

    Returns:
        A ``(retriever, summary)`` tuple: a similarity retriever over the
        chunks and the one-paragraph document summary.
    """
    splited_docs = text_processing(file_path)
    description_text = " ".join(splited_docs)
    doc_desc = document_description(description_text)
    summ_docs = doc_summarizer(doc_desc)
    print("doc summ: ", summ_docs)

    embeddings = HuggingFaceEmbeddings(model_name=Config.EMBEDDING_MODEL)
    vectors = embeddings.embed_documents(splited_docs)

    d = len(vectors[0])     # embedding dimensionality
    M = 32                  # HNSW graph connectivity
    index = faiss.IndexHNSWFlat(d, M)
    index.hnsw.efConstruction = 200  # build-time beam width (recall/speed)
    index.hnsw.efSearch = 100        # query-time beam width
    # faiss requires float32 input.
    index.add(np.array(vectors, dtype="float32"))

    docs = [Document(page_content=doc) for doc in splited_docs]
    # LangChain docstore ids are conventionally strings; map vector position
    # -> string id (the original used raw ints, which only works incidentally).
    docstore = InMemoryDocstore({str(i): doc for i, doc in enumerate(docs)})
    index_to_docstore_id = {i: str(i) for i in range(len(docs))}

    vectorstore = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=docstore,
        index_to_docstore_id=index_to_docstore_id,
    )
    retriever = vectorstore.as_retriever(
        search_type="similarity",
        search_kwargs={
            'k': Config.RETRIEVER_K,
            # 'score_threshold': 0.4
        },
    )
    return retriever, summ_docs