# KrishnConnect — app.py
# Author: ojas121
# Last change: "Update app.py" (commit 07c9a4a, verified)
import streamlit as st
from sentence_transformers import SentenceTransformer, util
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
import os
# Ensure the NLTK sentence-tokenizer data is available before sent_tokenize runs.
# Fixes two issues with the previous setup:
#   1. The download dir was appended to nltk.data.path only AFTER downloading
#      (and only on the except path), so a fresh environment could download
#      'punkt' and still fail the subsequent lookup.
#   2. NLTK >= 3.9 splits the Punkt model into a separate 'punkt_tab' resource,
#      which sent_tokenize also requires.
_NLTK_DATA_DIR = os.path.expanduser('~/nltk_data')
if _NLTK_DATA_DIR not in nltk.data.path:
    # Register the user-writable data dir first so lookups can see downloads.
    nltk.data.path.append(_NLTK_DATA_DIR)
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True, download_dir=_NLTK_DATA_DIR)
# Function to extract text from the uploaded PDF
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A file-like object (e.g. a Streamlit upload) readable
            by ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with newlines. Pages yielding no
        extractable text are skipped; an empty string is returned when
        nothing could be extracted at all.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Join pages with "\n" so the last word of one page does not fuse with
    # the first word of the next (which would corrupt downstream sentence
    # tokenization), and use str.join instead of quadratic += concatenation.
    page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(text for text in page_texts if text)
# Cached function to load the transformer model
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once per server process.

    ``@st.cache_resource`` makes Streamlit reuse the same
    SentenceTransformer instance across script reruns instead of
    reloading the weights every time.
    """
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return embedder
# Function to process text into sentences and embeddings
def process_text(text, model):
sentences = sent_tokenize(text) # Use NLTK for better sentence splitting
embeddings = model.encode(sentences, show_progress_bar=True)
return sentences, embeddings
# Streamlit UI
# ---- Streamlit UI ----
# NOTE(review): Streamlit re-runs this whole script top-to-bottom on every
# user interaction, so the PDF is re-extracted and re-embedded on each rerun;
# only the model itself is cached (st.cache_resource on load_model).
st.title("GitaGPT: Bhagavad Gita Chatbot")
st.write("Upload the Bhagavad Gita PDF file and ask questions based on its teachings!")
# Upload PDF file
uploaded_file = st.file_uploader("Upload Bhagavad Gita PDF", type=["pdf"])
if uploaded_file:
    with st.spinner("Extracting text and processing..."):
        # Step 1: Extract text from the uploaded PDF.
        raw_text = extract_text_from_pdf(uploaded_file)
        if not raw_text.strip():
            st.error("The uploaded PDF does not contain extractable text.")
            st.stop()  # halts this script run; nothing below executes
        # Step 2: Load the (cached) model and embed every sentence.
        model = load_model()
        sentences, embeddings = process_text(raw_text, model)
    st.success("PDF processed successfully! Ask your questions below.")
    # Step 3: Free-text query input.
    user_query = st.text_input("Ask your question:")
    if user_query:
        with st.spinner("Finding the best answer..."):
            # Embed the user query with the same model as the corpus.
            query_embedding = model.encode(user_query)
            # Cosine similarity of the query against every sentence;
            # flatten() turns the 1xN score matrix into a vector of N scores.
            scores = util.cos_sim(query_embedding, embeddings).flatten()
            # Indices of the 5 most similar sentences, best first.
            top_indices = scores.argsort(descending=True)[:5]
            top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]
        # Display the top matches with their similarity scores.
        st.write("**Top Responses:**")
        for idx, (response, score) in enumerate(top_matches):
            st.write(f"{idx + 1}. {response} (Score: {score:.4f})")
else:
    st.info("Please upload a PDF file to begin.")