# KrishnConnect — app.py
# Author: ojas121
# Last change: "Update app.py" (commit 07c9a4a, verified)
import streamlit as st
from sentence_transformers import SentenceTransformer, util
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
import os
# Ensure the NLTK sentence-tokenizer data is available before sent_tokenize runs.
# Fixes two issues with the previous setup:
#   1. The download dir was appended to nltk.data.path only AFTER downloading
#      (and only on the except path), so a fresh environment could download
#      'punkt' and still fail the subsequent lookup.
#   2. NLTK >= 3.9 splits the Punkt model into a separate 'punkt_tab' resource,
#      which sent_tokenize also requires.
_NLTK_DATA_DIR = os.path.expanduser('~/nltk_data')
if _NLTK_DATA_DIR not in nltk.data.path:
    # Register the user-writable data dir first so lookups can see downloads.
    nltk.data.path.append(_NLTK_DATA_DIR)
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        nltk.download(_resource, quiet=True, download_dir=_NLTK_DATA_DIR)
# Function to extract text from the uploaded PDF
def extract_text_from_pdf(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: A file-like object (e.g. a Streamlit upload) readable
            by ``PyPDF2.PdfReader``.

    Returns:
        str: The page texts joined with newlines. Pages yielding no
        extractable text are skipped; an empty string is returned when
        nothing could be extracted at all.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # Join pages with "\n" so the last word of one page does not fuse with
    # the first word of the next (which would corrupt downstream sentence
    # tokenization), and use str.join instead of quadratic += concatenation.
    page_texts = [page.extract_text() for page in reader.pages]
    return "\n".join(text for text in page_texts if text)
# Cached function to load the transformer model
@st.cache_resource
def load_model():
    """Load the sentence-embedding model once per server process.

    ``@st.cache_resource`` makes Streamlit reuse the same
    SentenceTransformer instance across script reruns instead of
    reloading the weights every time.
    """
    embedder = SentenceTransformer('all-MiniLM-L6-v2')
    return embedder
# Function to process text into sentences and embeddings
def process_text(text, model):
sentences = sent_tokenize(text) # Use NLTK for better sentence splitting
embeddings = model.encode(sentences, show_progress_bar=True)
return sentences, embeddings
# Streamlit UI
# ---- Streamlit UI ----
# NOTE(review): Streamlit re-runs this whole script top-to-bottom on every
# user interaction, so the PDF is re-extracted and re-embedded on each rerun;
# only the model itself is cached (st.cache_resource on load_model).
st.title("GitaGPT: Bhagavad Gita Chatbot")
st.write("Upload the Bhagavad Gita PDF file and ask questions based on its teachings!")
# Upload PDF file
uploaded_file = st.file_uploader("Upload Bhagavad Gita PDF", type=["pdf"])
if uploaded_file:
    with st.spinner("Extracting text and processing..."):
        # Step 1: Extract text from the uploaded PDF.
        raw_text = extract_text_from_pdf(uploaded_file)
        if not raw_text.strip():
            st.error("The uploaded PDF does not contain extractable text.")
            st.stop()  # halts this script run; nothing below executes
        # Step 2: Load the (cached) model and embed every sentence.
        model = load_model()
        sentences, embeddings = process_text(raw_text, model)
    st.success("PDF processed successfully! Ask your questions below.")
    # Step 3: Free-text query input.
    user_query = st.text_input("Ask your question:")
    if user_query:
        with st.spinner("Finding the best answer..."):
            # Embed the user query with the same model as the corpus.
            query_embedding = model.encode(user_query)
            # Cosine similarity of the query against every sentence;
            # flatten() turns the 1xN score matrix into a vector of N scores.
            scores = util.cos_sim(query_embedding, embeddings).flatten()
            # Indices of the 5 most similar sentences, best first.
            top_indices = scores.argsort(descending=True)[:5]
            top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]
        # Display the top matches with their similarity scores.
        st.write("**Top Responses:**")
        for idx, (response, score) in enumerate(top_matches):
            st.write(f"{idx + 1}. {response} (Score: {score:.4f})")
else:
    st.info("Please upload a PDF file to begin.")