"""GitaGPT: a Streamlit chatbot that answers questions from an uploaded
Bhagavad Gita PDF via sentence-embedding similarity search."""

import os

import nltk
import PyPDF2
import streamlit as st
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Register the user-writable data dir with NLTK *before* the lookup so a
# previously downloaded 'punkt' there is found instead of re-downloaded
# on every cold start (the original appended the path only after the
# download, so the find() below could never see it).
_NLTK_DIR = os.path.expanduser('~/nltk_data')
if _NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(_NLTK_DIR)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True, download_dir=_NLTK_DIR)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated extractable text of every page of *pdf_file*.

    Pages that yield no text (e.g. scanned images) are skipped.

    Args:
        pdf_file: a file-like object (as produced by st.file_uploader).

    Returns:
        str: all extractable page text joined together.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    pages = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text)
    # join once — avoids quadratic repeated string concatenation
    return "".join(pages)


@st.cache_resource
def load_model():
    """Load (and cache across Streamlit reruns) the embedding model."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def process_text(text, model):
    """Split *text* into sentences and embed each one.

    Args:
        text: raw document text.
        model: a SentenceTransformer instance.

    Returns:
        tuple: (sentences, embeddings) with embeddings[i] matching
        sentences[i].
    """
    sentences = sent_tokenize(text)  # NLTK gives better sentence splits than naive '.'
    embeddings = model.encode(sentences, show_progress_bar=True)
    return sentences, embeddings


# ---- Streamlit UI (top-level script body; reruns on each interaction) ----
st.title("GitaGPT: Bhagavad Gita Chatbot")
st.write("Upload the Bhagavad Gita PDF file and ask questions based on its teachings!")

uploaded_file = st.file_uploader("Upload Bhagavad Gita PDF", type=["pdf"])

if uploaded_file:
    with st.spinner("Extracting text and processing..."):
        # Step 1: extract text, bailing out early on image-only PDFs
        raw_text = extract_text_from_pdf(uploaded_file)
        if not raw_text.strip():
            st.error("The uploaded PDF does not contain extractable text.")
            st.stop()

        # Step 2: load model and embed the document's sentences
        model = load_model()
        sentences, embeddings = process_text(raw_text, model)

    # BUG FIX: the original literal contained a raw line break inside the
    # quotes (a syntax error); keep the message on one line.
    st.success("PDF processed successfully! Ask your questions below.")

    # Step 3: answer user queries by cosine similarity against the corpus
    user_query = st.text_input("Ask your question:")
    if user_query:
        with st.spinner("Finding the best answer..."):
            query_embedding = model.encode(user_query)
            scores = util.cos_sim(query_embedding, embeddings).flatten()
            top_indices = scores.argsort(descending=True)[:5]
            top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]

        st.write("**Top Responses:**")
        for rank, (response, score) in enumerate(top_matches, start=1):
            st.write(f"{rank}. {response} (Score: {score:.4f})")
else:
    st.info("Please upload a PDF file to begin.")