"""Streamlit app that summarises plain text, PDF, and Word documents
using the facebook/bart-large-cnn model."""

import os

import docx
import PyPDF2
import streamlit as st
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

checkpoint = "facebook/bart-large-cnn"


@st.cache_resource
def load_model():
    # Cached so the model is only downloaded/loaded once per session.
    return AutoModelForSeq2SeqLM.from_pretrained(checkpoint)


@st.cache_resource
def load_tokenizer():
    return AutoTokenizer.from_pretrained(checkpoint)


def load_text_file(file):
    # Uploaded files arrive as bytes; assume UTF-8 encoded text.
    return file.getvalue().decode("utf-8")


def load_pdf_file(file):
    pdf_reader = PyPDF2.PdfReader(file)
    pdf_text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for pages with no extractable text.
        pdf_text += page.extract_text() or ""
    return pdf_text


def load_word_file(file):
    doc = docx.Document(file)
    return "\n".join(p.text for p in doc.paragraphs)


def split_text_into_chunks(text, max_chunk_length):
    # Greedily pack whole words into chunks of at most max_chunk_length characters.
    chunks = []
    current_chunk = ""
    for word in text.split():
        if len(current_chunk) + len(word) + 1 <= max_chunk_length:
            current_chunk += word + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = word + " "
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks


def main():
    st.set_page_config(
        page_title="Summarisation Tool",
        page_icon="🧊",
        layout="wide",
        initial_sidebar_state="expanded",
    )

    model = load_model()
    print("Model's maximum sequence length:", model.config.max_position_embeddings)
    tokenizer = load_tokenizer()
    print("Tokenizer's maximum sequence length:", tokenizer.model_max_length)

    st.title("Summarisation Tool")
    st.write(
        f"Performs basic summarisation of text and audio using the '{checkpoint}' model."
    )

    st.sidebar.title("Options")
    summary_balance = st.sidebar.select_slider(
        "Output Summarisation Detail:",
        options=["concise", "balanced", "detailed"],
        value="balanced",
    )

    text_tab, doc_tab, audio_tab = st.tabs(["Plain Text", "Text Document", "Audio File"])

    with text_tab:
        sentence = st.text_area(
            "Paste text to be summarised:",
            help="Paste text into the text area and hit the Summarise button",
            height=300,
        )
        st.write(f"{len(sentence)} characters and {len(sentence.split())} words")

    with doc_tab:
        uploaded_file = st.file_uploader("Select a file to be summarised:")
        if uploaded_file is not None:
            file_name = os.path.basename(uploaded_file.name)
            _, file_ext = os.path.splitext(file_name)
            # Pick a loader based on the file extension; anything that is not
            # PDF or Word is treated as plain text.
            if "pdf" in file_ext:
                sentence = load_pdf_file(uploaded_file)
            elif "docx" in file_ext:
                sentence = load_word_file(uploaded_file)
            else:
                sentence = load_text_file(uploaded_file)
            st.write(f"{len(sentence)} characters and {len(sentence.split())} words")
            # st.write(sentence)

    with audio_tab:
        st.text("Yet to be implemented...")

    button = st.button("Summarise")
    st.divider()

    with st.spinner("Generating Summary..."):
        if button and sentence:
            chunks = split_text_into_chunks(sentence, 100000)
            print(f"Split into {len(chunks)} chunks")

            # Derive target summary lengths from the input word count; word
            # counts are used here as a rough proxy for token counts.
            text_words = len(sentence.split())
            if summary_balance == "concise":
                min_tokens = int(text_words * 0.1)
                max_tokens = int(text_words * 0.3)
            elif summary_balance == "detailed":
                min_tokens = int(text_words * 0.5)
                max_tokens = int(text_words * 0.8)
            else:  # balanced
                min_tokens = int(text_words * 0.2)
                max_tokens = int(text_words * 0.4)
            # Clamp to the model's 1024-token generation window.
            if max_tokens > 1024:
                max_tokens = 1024
                min_tokens = 512
            print(f"Summary min tokens {min_tokens}, max tokens {max_tokens}")

            # Tokenize all chunks as one padded batch, truncating each chunk
            # to the model's maximum input length.
            inputs = tokenizer(
                chunks,
                max_length=model.config.max_position_embeddings,
                return_tensors="pt",
                truncation=True,
                padding=True,
            )
            summary_ids = model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                min_new_tokens=min_tokens,
                max_new_tokens=max_tokens,
                do_sample=False,
            )
            # Decode every chunk's summary, not just the first one.
            summary = "\n\n".join(
                tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
            )
            st.write(summary)
            st.write(f"{len(summary)} characters and {len(summary.split())} words")


if __name__ == "__main__":
    main()
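
# A rough sketch of how to run this app locally; the filename "app.py" is an
# assumption, not part of the original source:
#
#   pip install streamlit transformers torch PyPDF2 python-docx
#   streamlit run app.py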