"""GitaGPT: a Streamlit chatbot that answers questions from an uploaded
Bhagavad Gita PDF via sentence-embedding similarity search."""

import os

import nltk
import PyPDF2
import streamlit as st
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

# Register the user-writable data dir with NLTK *before* the lookup so a
# previously downloaded 'punkt' there is found instead of re-downloaded
# on every cold start (the original appended the path only after the
# download, so the find() below could never see it).
_NLTK_DIR = os.path.expanduser('~/nltk_data')
if _NLTK_DIR not in nltk.data.path:
    nltk.data.path.append(_NLTK_DIR)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt', quiet=True, download_dir=_NLTK_DIR)


def extract_text_from_pdf(pdf_file):
    """Return the concatenated extractable text of every page of *pdf_file*.

    Pages that yield no text (e.g. scanned images) are skipped.

    Args:
        pdf_file: a file-like object (as produced by st.file_uploader).

    Returns:
        str: all extractable page text joined together.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    pages = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            pages.append(page_text)
    # join once — avoids quadratic repeated string concatenation
    return "".join(pages)


@st.cache_resource
def load_model():
    """Load (and cache across Streamlit reruns) the embedding model."""
    return SentenceTransformer('all-MiniLM-L6-v2')


def process_text(text, model):
    """Split *text* into sentences and embed each one.

    Args:
        text: raw document text.
        model: a SentenceTransformer instance.

    Returns:
        tuple: (sentences, embeddings) with embeddings[i] matching
        sentences[i].
    """
    sentences = sent_tokenize(text)  # NLTK gives better sentence splits than naive '.'
    embeddings = model.encode(sentences, show_progress_bar=True)
    return sentences, embeddings


# ---- Streamlit UI (top-level script body; reruns on each interaction) ----
st.title("GitaGPT: Bhagavad Gita Chatbot")
st.write("Upload the Bhagavad Gita PDF file and ask questions based on its teachings!")

uploaded_file = st.file_uploader("Upload Bhagavad Gita PDF", type=["pdf"])

if uploaded_file:
    with st.spinner("Extracting text and processing..."):
        # Step 1: extract text, bailing out early on image-only PDFs
        raw_text = extract_text_from_pdf(uploaded_file)
        if not raw_text.strip():
            st.error("The uploaded PDF does not contain extractable text.")
            st.stop()

        # Step 2: load model and embed the document's sentences
        model = load_model()
        sentences, embeddings = process_text(raw_text, model)

    # BUG FIX: the original literal contained a raw line break inside the
    # quotes (a syntax error); keep the message on one line.
    st.success("PDF processed successfully! Ask your questions below.")

    # Step 3: answer user queries by cosine similarity against the corpus
    user_query = st.text_input("Ask your question:")
    if user_query:
        with st.spinner("Finding the best answer..."):
            query_embedding = model.encode(user_query)
            scores = util.cos_sim(query_embedding, embeddings).flatten()
            top_indices = scores.argsort(descending=True)[:5]
            top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]

        st.write("**Top Responses:**")
        for rank, (response, score) in enumerate(top_matches, start=1):
            st.write(f"{rank}. {response} (Score: {score:.4f})")
else:
    st.info("Please upload a PDF file to begin.")