ojas121 committed on
Commit
cd874ef
·
verified ·
1 Parent(s): 3602c46

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -17
app.py CHANGED
@@ -1,24 +1,32 @@
1
- import os
2
- os.system("pip install huggingface_hub==0.14.1")
3
- os.system("pip install sentence-transformers==2.2.2")
4
  import streamlit as st
5
  from sentence_transformers import SentenceTransformer, util
6
  import PyPDF2
 
 
 
 
 
7
 
8
  # Function to extract text from the uploaded PDF
9
  def extract_text_from_pdf(pdf_file):
10
  reader = PyPDF2.PdfReader(pdf_file)
11
  text = ""
12
  for page in reader.pages:
13
- text += page.extract_text()
 
 
14
  return text
15
 
 
 
 
 
 
16
  # Function to process text into sentences and embeddings
17
- def process_text(text):
18
- sentences = [sentence.strip() for sentence in text.split("\n") if sentence.strip()]
19
- model = SentenceTransformer('all-MiniLM-L6-v2') # A lightweight transformer model
20
  embeddings = model.encode(sentences, show_progress_bar=True)
21
- return sentences, embeddings, model
22
 
23
  # Streamlit UI
24
  st.title("GitaGPT: Bhagavad Gita Chatbot")
@@ -31,10 +39,14 @@ if uploaded_file:
31
  with st.spinner("Extracting text and processing..."):
32
  # Step 1: Extract text
33
  raw_text = extract_text_from_pdf(uploaded_file)
 
 
 
 
 
 
 
34
 
35
- # Step 2: Process text to generate embeddings
36
- sentences, embeddings, model = process_text(raw_text)
37
-
38
  st.success("PDF processed successfully! Ask your questions below.")
39
 
40
  # Step 3: Input for user query
@@ -45,12 +57,13 @@ if uploaded_file:
45
  # Compute embedding for the user query
46
  query_embedding = model.encode(user_query)
47
  # Compute similarity scores
48
- scores = util.cos_sim(query_embedding, embeddings)
49
- best_match_idx = scores.argmax()
50
- # Fetch the best matching sentence
51
- response = sentences[best_match_idx]
52
 
53
- st.write(f"**Answer:** {response}")
 
 
 
54
  else:
55
  st.info("Please upload a PDF file to begin.")
56
-
 
 
 
 
1
  import streamlit as st
2
  from sentence_transformers import SentenceTransformer, util
3
  import PyPDF2
4
+ import nltk
5
+ from nltk.tokenize import sent_tokenize
6
+
7
# Ensure the NLTK sentence-tokenizer data is available.
# NLTK >= 3.9 makes sent_tokenize load the "punkt_tab" resource instead of the
# pickled "punkt" models, so fetching only "punkt" leaves newer installs
# failing with LookupError. Download both so either NLTK generation works.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
9
 
10
  # Function to extract text from the uploaded PDF
11
  def extract_text_from_pdf(pdf_file):
12
  reader = PyPDF2.PdfReader(pdf_file)
13
  text = ""
14
  for page in reader.pages:
15
+ page_text = page.extract_text()
16
+ if page_text:
17
+ text += page_text
18
  return text
19
 
20
+ # Cached function to load the transformer model
21
@st.cache_resource
def load_model(model_name='all-MiniLM-L6-v2'):
    """Load a SentenceTransformer model, cached via ``st.cache_resource``.

    Args:
        model_name: Model identifier passed to ``SentenceTransformer``.
            Defaults to ``'all-MiniLM-L6-v2'``, the model this app has
            always used, so existing ``load_model()`` calls are unchanged.

    Returns:
        The ``SentenceTransformer`` instance for ``model_name``.
    """
    return SentenceTransformer(model_name)
24
+
25
  # Function to process text into sentences and embeddings
26
def process_text(text, model):
    """Split text into sentences and embed each one.

    Args:
        text: Raw text to split with NLTK's ``sent_tokenize``.
        model: Object exposing ``encode(sentences, show_progress_bar=...)``;
            the app passes the model returned by ``load_model()``.

    Returns:
        A ``(sentences, embeddings)`` pair: the list of sentences and
        whatever ``model.encode`` returns for that list.
    """
    # NLTK sentence splitting rather than naive line splitting.
    split_sentences = sent_tokenize(text)
    sentence_vectors = model.encode(split_sentences, show_progress_bar=True)
    return split_sentences, sentence_vectors
30
 
31
  # Streamlit UI
32
  st.title("GitaGPT: Bhagavad Gita Chatbot")
 
39
  with st.spinner("Extracting text and processing..."):
40
  # Step 1: Extract text
41
  raw_text = extract_text_from_pdf(uploaded_file)
42
+ if not raw_text.strip():
43
+ st.error("The uploaded PDF does not contain extractable text.")
44
+ st.stop()
45
+
46
+ # Step 2: Load model and process text
47
+ model = load_model()
48
+ sentences, embeddings = process_text(raw_text, model)
49
 
 
 
 
50
  st.success("PDF processed successfully! Ask your questions below.")
51
 
52
  # Step 3: Input for user query
 
57
  # Compute embedding for the user query
58
  query_embedding = model.encode(user_query)
59
  # Compute similarity scores
60
+ scores = util.cos_sim(query_embedding, embeddings).flatten()
61
+ top_indices = scores.argsort(descending=True)[:5]
62
+ top_matches = [(sentences[idx], scores[idx].item()) for idx in top_indices]
 
63
 
64
+ # Display top matches
65
+ st.write("**Top Responses:**")
66
+ for idx, (response, score) in enumerate(top_matches):
67
+ st.write(f"{idx + 1}. {response} (Score: {score:.4f})")
68
  else:
69
  st.info("Please upload a PDF file to begin.")