import io
import logging
import os
import time
from urllib.parse import urljoin

import docx
import pdfplumber
import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
#from PyPDF2 import PdfReader
from langchain.chains.question_answering import load_qa_chain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def fetch_and_process_pdf(url, timeout=30):
    """Download the PDF at *url* and return its extracted text.

    This is a best-effort helper: any failure is logged and an empty string
    is returned so that one bad document does not abort a batch.

    Args:
        url: Address of the PDF to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text extracted by ``process_pdf``, or ``""`` when the download
        or the PDF parsing fails.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # RequestException is the base class of HTTPError and also covers
        # connection failures and timeouts.
        logging.error(f"Failed to fetch PDF from {url}. Error: {e}")
        return ""

    try:
        return process_pdf(io.BytesIO(response.content))
    except Exception as e:
        # The payload may not be a valid PDF (for example an HTML page served
        # under a ".pdf" link); the parser's exception types are not part of
        # this file, so this boundary catches broadly and logs.
        logging.error(f"Failed to parse PDF from {url}. Error: {e}")
        return ""

def process_pdf(pdf):
    """Return the concatenated text of every page in *pdf*.

    *pdf* is handed straight to ``pdfplumber.open``. Pages for which
    ``extract_text()`` yields nothing contribute an empty string. The
    elapsed wall-clock time is logged at INFO level.
    """
    started = time.time()
    with pdfplumber.open(pdf) as document:
        page_texts = [page.extract_text() or "" for page in document.pages]
    text = "".join(page_texts)
    elapsed = time.time() - started
    logging.info(f"Processed PDF in {elapsed} seconds")
    return text

def display_chat_history():
    """Render the session's chat history in a read-only style text area.

    Creates an empty ``chat_history`` list in the Streamlit session state
    when none exists yet, then formats every stored entry as a
    question / answer / timestamp record.
    """
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    records = [
        f"Q: {chat['question']}\nA: {chat['answer']}\n{chat['time']}\n---\n"
        for chat in st.session_state.chat_history
    ]
    st.text_area("Chat History", "".join(records), height=300)

def update_chat_history(question, answer):
    """Append a question/answer pair, stamped with the local time, to the
    ``chat_history`` list kept in the Streamlit session state.

    The list is created on first use.
    """
    if 'chat_history' not in st.session_state:
        st.session_state.chat_history = []

    entry = {
        "question": question,
        "answer": answer,
        "time": time.strftime("%Y-%m-%d %H:%M:%S"),
    }
    st.session_state.chat_history.append(entry)


def read_pdf(file_path):
    """Return the concatenated text of every page of the PDF at *file_path*.

    Uses ``pdfplumber``, which this module already imports; the previous
    implementation relied on ``PdfReader``, whose import is commented out,
    so every call raised ``NameError``.

    Args:
        file_path: Path of the PDF file on disk.

    Returns:
        The extracted text; pages with no extractable text contribute an
        empty string.
    """
    with pdfplumber.open(file_path) as pdf_reader:
        # extract_text() may yield None for pages without a text layer.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

def read_word(file_path):
    """Return the text of the Word document at *file_path*.

    Every paragraph's text is followed by a newline, including the last.
    """
    document = docx.Document(file_path)
    return "".join(f"{para.text}\n" for para in document.paragraphs)
	
def read_documents_from_directory(directory):
    """Concatenate the text of all ``.pdf`` and ``.docx`` entries in *directory*.

    Entries are visited in the order ``os.listdir`` reports them; names with
    any other suffix are ignored. PDFs go through ``read_pdf`` and Word
    documents through ``read_word``.
    """
    pieces = []
    for entry in os.listdir(directory):
        full_path = os.path.join(directory, entry)
        if entry.endswith(".pdf"):
            pieces.append(read_pdf(full_path))
        elif entry.endswith(".docx"):
            pieces.append(read_word(full_path))
    return "".join(pieces)
	
def get_pdf_links_from_dataset(url, timeout=30):
    """Collect absolute URLs of the PDF links found on the page at *url*.

    Args:
        url: Address of the HTML page to scan.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute URLs, one per ``<a>`` element whose ``href``
        contains ``.pdf``, in document order. An empty list is returned
        when the page cannot be fetched.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as e:
        # Covers HTTP error statuses as well as connection errors/timeouts.
        logging.error(f"Failed to get PDF links from dataset. Error: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors without an href attribute, for which
    # link.get('href') would be None and the substring test would raise.
    # urljoin resolves root-relative hrefs against the page's host and
    # leaves already-absolute hrefs untouched.
    # NOTE(review): links on a repository listing page may lead to HTML
    # viewer pages rather than raw files — confirm they download as PDFs.
    return [
        urljoin(url, link['href'])
        for link in soup.find_all('a', href=True)
        if '.pdf' in link['href']
    ]

        
#train_directory = r'C:\Users\writa\Downloads\Crypto'
# Listing page of the Hugging Face dataset that holds the source PDFs.
# NOTE(review): no code visible in this file reads this module-level name;
# main() defines its own dataset_url with the same value — confirm whether
# this constant is still needed.
url = "https://huggingface.co/datasets/Writo/realestate_data/tree/main"

def _build_docsearch(dataset_url):
    """Download the PDFs linked from *dataset_url* and index their text.

    Returns a FAISS vector store built from the extracted text, or ``None``
    when no text could be gathered (no links found, or every download or
    extraction came back empty).
    """
    pdf_links = get_pdf_links_from_dataset(dataset_url)
    text = "".join(fetch_and_process_pdf(link) for link in pdf_links)
    if not text.strip():
        return None

    char_text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000,
                                               chunk_overlap=200, length_function=len)
    text_chunks = char_text_splitter.split_text(text)
    if not text_chunks:
        return None
    return FAISS.from_texts(text_chunks, OpenAIEmbeddings())


def main():
    """Run the EstateSphere Streamlit page.

    Loads environment variables, builds (once per session) a vector index
    over the dataset's PDFs, and answers the user's question with a
    question-answering chain, recording each exchange in the chat history.
    """
    load_dotenv()
    st.set_page_config(page_title="EstateSphere")
    st.header("🏢 EstateSphere")

    dataset_url = 'https://huggingface.co/datasets/Writo/realestate_data/tree/main'

    # Streamlit re-executes this script on every interaction, so the index
    # is kept in session state instead of being re-downloaded and
    # re-embedded for each question. A failed build is not cached, which
    # lets the next rerun retry.
    if 'docsearch' in st.session_state:
        docsearch = st.session_state.docsearch
    else:
        with st.spinner("Processing PDFs, please wait..."):
            docsearch = _build_docsearch(dataset_url)
        if docsearch is not None:
            st.session_state.docsearch = docsearch

    if docsearch is None:
        # Previously this path crashed on an unbound ``text`` variable.
        st.error("No document text could be loaded, so questions cannot be answered right now.")
    else:
        llm = OpenAI()
        # "stuff" places all retrieved chunks into a single prompt; the
        # former value "appropriate_type" is not a valid chain type.
        chain = load_qa_chain(llm, chain_type="stuff")

        # Chat interface
        query = st.text_input("Type your question:", key="query")

        if query:
            with st.spinner("Finding your answer..."):
                try:
                    docs = docsearch.similarity_search(query)
                    response = chain.run(input_documents=docs, question=query)
                    update_chat_history(query, response)
                    display_chat_history()
                except Exception as e:
                    st.error(f"An error occurred: {e}")

    # Help and support in the sidebar
    st.sidebar.header("Help & Support")
    st.sidebar.write("Need assistance? Reach out to our support team.")

    # Footer
    st.sidebar.text("© 2024 MosaicAI")

if __name__ == "__main__":
    main()