SujathaL committed on
Commit
23be0f8
·
verified ·
1 Parent(s): 8a1d8cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -39
app.py CHANGED
@@ -1,57 +1,49 @@
1
  import streamlit as st
2
- from transformers import pipeline
3
  import pdfplumber
4
- import re
 
 
 
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
- from sentence_transformers import SentenceTransformer, util
7
 
8
- # Load Hugging Face Question Answering model
9
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
10
-
11
- # Load Embeddings Model for Better Context Matching
12
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
13
 
14
- # Function to Extract and Clean Text from PDF
15
  def extract_clean_text(pdf_path):
16
  text = ""
17
  with pdfplumber.open(pdf_path) as pdf:
18
  for page in pdf.pages:
19
  text += page.extract_text() + "\n"
20
-
21
- # Remove extra spaces and newlines
22
- text = re.sub(r'\s+', ' ', text) # Replace multiple spaces/newlines with a single space
23
- text = text.replace(" .", ".") # Fix misplaced spaces before periods
24
-
25
- # Add section headers where possible
26
- text = re.sub(r'(?<=\n)([A-Z][a-z]+.*?):', r'\n\n## \1\n', text) # Convert labels into headings
27
-
28
- return text
29
 
30
  # Function to Split Text into Chunks
31
- def split_text(text):
32
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
33
- chunks = text_splitter.split_text(text)
34
- return chunks
35
-
36
- # Function to Find the Most Relevant Chunk Using Embeddings
37
- def find_best_chunk(question, chunks):
38
- question_embedding = embedding_model.encode(question, convert_to_tensor=True)
39
- chunk_embeddings = [embedding_model.encode(chunk, convert_to_tensor=True) for chunk in chunks]
40
-
41
- # Compute similarity between question and each chunk
42
- similarities = [util.pytorch_cos_sim(question_embedding, chunk_emb).item() for chunk_emb in chunk_embeddings]
43
-
44
- # Find the most relevant chunk
45
- best_chunk_index = similarities.index(max(similarities))
46
- return chunks[best_chunk_index]
47
 
48
  # Streamlit UI
49
- st.title("Chat with AWS Restart PDF")
50
 
51
- # Load and Process PDF
52
- pdf_path = "AWS restart program information.docx.pdf" # Change to your uploaded file
53
- pdf_text = extract_clean_text(pdf_path) # Extract & clean text
54
- chunks = split_text(pdf_text) # Split into chunks
 
55
 
56
  st.write("✅ PDF Loaded Successfully!")
57
 
@@ -59,6 +51,6 @@ st.write("✅ PDF Loaded Successfully!")
59
  question = st.text_input("Ask a question about AWS Restart program:")
60
 
61
  if st.button("Get Answer") and question:
62
- relevant_chunk = find_best_chunk(question, chunks) # Retrieve the best chunk
63
- response = qa_pipeline(question=question, context=relevant_chunk) # Ask the model
64
  st.write("Answer:", response['answer'])
 
1
  import streamlit as st
 
2
  import pdfplumber
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from transformers import pipeline
7
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
8
 
9
# Load Models
# Extractive QA pipeline: given (question, context) it returns the answer
# span found inside the context.
qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")

# Sentence-embedding model used to embed both the PDF chunks and the
# user's question for nearest-neighbour retrieval.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Extract the text of every page of the PDF at *pdf_path*.

    Page texts are concatenated (one trailing newline per page, matching
    the original behaviour) and then all newlines are flattened to spaces
    so the downstream splitter works on one continuous string.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The flattened text of the whole document ("" for an empty PDF).
    """
    parts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for image-only/empty pages;
            # the original `text += page.extract_text() + "\n"` raised
            # TypeError on such pages. Treat them as empty strings.
            parts.append((page.extract_text() or "") + "\n")
    # "".join avoids the quadratic `text +=` string build.
    return "".join(parts).replace("\n", " ")
 
 
 
 
 
 
 
 
20
 
21
# Function to Split Text into Chunks
def split_text(text, chunk_size=500):
    """Break *text* into retrieval-sized chunks.

    Chunks are at most *chunk_size* characters long, with a fixed
    50-character overlap between consecutive chunks so answers that
    straddle a boundary are not lost.

    Args:
        text: The full document text.
        chunk_size: Maximum characters per chunk (default 500).

    Returns:
        A list of chunk strings.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
    )
    chunks = splitter.split_text(text)
    return chunks
25
+
26
# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Embed *chunks* and build an exact-L2 FAISS index over them.

    Args:
        chunks: Non-empty list of text chunks to index.

    Returns:
        A ``(index, chunks, embeddings)`` tuple: the FAISS index, the
        chunks passed in (unchanged), and the float32 embedding matrix.

    Raises:
        ValueError: If *chunks* is empty (the original code failed here
            with an opaque IndexError on ``embeddings.shape[1]``).
    """
    if not chunks:
        raise ValueError("create_faiss_index() requires at least one chunk")
    # Batch-encode all chunks in ONE call instead of one encode() call per
    # chunk -- same vectors, far less per-call model overhead.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    # FAISS expects float32 input; IndexFlatL2 performs exact L2 search.
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings
32
+
33
# Function to Find Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest (L2) to *question*.

    Args:
        question: The user's question text.
        index: FAISS index built over the chunk embeddings.
        chunks: The chunk strings, in the order they were indexed.
        embeddings: Chunk embedding matrix (unused here; kept so the
            signature stays compatible with existing callers).

    Returns:
        The single best-matching chunk string.
    """
    query = embedding_model.encode(question)
    query = np.array(query).reshape(1, -1)
    _distances, neighbors = index.search(query, 1)
    best = neighbors[0][0]
    return chunks[best]
38
 
39
# Streamlit UI
st.title("Chat with AWS Restart PDF (Like ChatPDF)")

# Load & Process PDF
# NOTE(review): the PDF path is hard-coded and re-processed on every
# Streamlit rerun; assumes the file ships alongside the app -- confirm,
# and consider @st.cache_data for the extraction/indexing steps.
pdf_path = "AWS restart program information.docx.pdf"
pdf_text = extract_clean_text(pdf_path)
chunks = split_text(pdf_text)
index, chunks, embeddings = create_faiss_index(chunks)

st.write("✅ PDF Loaded Successfully!")

question = st.text_input("Ask a question about AWS Restart program:")

# Run retrieval + extractive QA only once the user has both typed a
# question and clicked the button.
if st.button("Get Answer") and question:
    relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
    response = qa_pipeline(question=question, context=relevant_chunk)
    st.write("Answer:", response['answer'])