Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| from sentence_transformers import SentenceTransformer, util | |
| import PyPDF2 | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
| import os | |
# Ensure the NLTK tokenizer resources are available. sent_tokenize needs
# 'punkt', and NLTK >= 3.8.2 additionally looks for 'punkt_tab'; checking
# both keeps the app working across NLTK versions. The user-level data dir
# is added to the search path first so a prior download there is found
# without re-downloading.
_nltk_dir = os.path.expanduser('~/nltk_data')
nltk.data.path.append(_nltk_dir)
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.data.find(f'tokenizers/{_resource}')
    except LookupError:
        # quiet=True: on older NLTK without 'punkt_tab' in its index this
        # simply returns False instead of crashing the app at startup.
        nltk.download(_resource, quiet=True, download_dir=_nltk_dir)
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Parameters
    ----------
    pdf_file : file-like object
        An open binary PDF stream (e.g. a Streamlit ``UploadedFile``).

    Returns
    -------
    str
        All extractable page text joined together. May be empty when the
        PDF has no text layer (e.g. scanned images) — callers should check.
    """
    reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() can return None for pages without a text layer, hence
    # the `or ""` guard. A single join avoids quadratic `+=` accumulation.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Cached function to load the transformer model.
@st.cache_resource(show_spinner=False)
def load_model():
    """Load (and cache) the sentence-embedding model.

    ``st.cache_resource`` keeps one model instance for the app's lifetime;
    without it, Streamlit's rerun-on-every-interaction model would reload
    the transformer weights each time the user types a question.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
def process_text(text, model):
    """Split *text* into sentences and embed each one.

    Returns a ``(sentences, embeddings)`` pair where ``embeddings[i]`` is
    the vector for ``sentences[i]``.
    """
    # NLTK's tokenizer handles abbreviations and punctuation far better
    # than a naive split on periods.
    sentence_list = sent_tokenize(text)
    vectors = model.encode(sentence_list, show_progress_bar=True)
    return sentence_list, vectors
# ---------------------------------------------------------------- Streamlit UI
st.title("GitaGPT: Bhagavad Gita Chatbot")
st.write("Upload the Bhagavad Gita PDF file and ask questions based on its teachings!")

# Upload PDF file
uploaded_file = st.file_uploader("Upload Bhagavad Gita PDF", type=["pdf"])
if uploaded_file:
    # Streamlit reruns this entire script on every widget interaction, so
    # the expensive extract/encode work is cached in session_state and only
    # redone when a *different* file is uploaded. Otherwise every question
    # the user asks would re-extract the PDF and re-embed every sentence.
    file_key = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("pdf_key") != file_key:
        with st.spinner("Extracting text and processing..."):
            # Step 1: Extract text
            raw_text = extract_text_from_pdf(uploaded_file)
            if not raw_text.strip():
                st.error("The uploaded PDF does not contain extractable text.")
                st.stop()
            # Step 2: Load model and process text (model is kept in
            # session_state so it is loaded at most once per session).
            if "model" not in st.session_state:
                st.session_state["model"] = load_model()
            sentences, embeddings = process_text(raw_text, st.session_state["model"])
            st.session_state["pdf_key"] = file_key
            st.session_state["sentences"] = sentences
            st.session_state["embeddings"] = embeddings

    sentences = st.session_state["sentences"]
    embeddings = st.session_state["embeddings"]
    model = st.session_state["model"]
    st.success("PDF processed successfully! Ask your questions below.")

    # Step 3: Input for user query
    user_query = st.text_input("Ask your question:")
    if user_query:
        with st.spinner("Finding the best answer..."):
            # Embed the query and rank every sentence by cosine similarity.
            query_embedding = model.encode(user_query)
            scores = util.cos_sim(query_embedding, embeddings).flatten()
            # Top-5 most similar sentences, best first.
            top_indices = scores.argsort(descending=True)[:5]
            top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]
        # Display top matches
        st.write("**Top Responses:**")
        for rank, (response, score) in enumerate(top_matches, start=1):
            st.write(f"{rank}. {response} (Score: {score:.4f})")
else:
    st.info("Please upload a PDF file to begin.")