SujathaL committed on
Commit
5752742
·
verified ·
1 Parent(s): 6c5b356

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -11
app.py CHANGED
@@ -1,22 +1,30 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
- import PyPDF2
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from sentence_transformers import SentenceTransformer, util
6
 
7
- # Load the Question Answering Model
8
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
9
 
10
  # Load Embeddings Model for Better Context Matching
11
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
- # Function to Extract Text from PDF
14
- def extract_text_from_pdf(pdf_path):
15
- with open(pdf_path, "rb") as f:
16
- pdf_reader = PyPDF2.PdfReader(f)
17
- text = ""
18
- for page in pdf_reader.pages:
19
  text += page.extract_text() + "\n"
 
 
 
 
 
 
 
 
20
  return text
21
 
22
  # Function to Split Text into Chunks
@@ -41,9 +49,9 @@ def find_best_chunk(question, chunks):
41
  st.title("Chat with AWS Restart PDF")
42
 
43
  # Load and Process PDF
44
- pdf_path = "AWS restart program information.docx.pdf"
45
- pdf_text = extract_text_from_pdf(pdf_path)
46
- chunks = split_text(pdf_text)
47
 
48
  st.write("✅ PDF Loaded Successfully!")
49
 
 
1
  import streamlit as st
2
  from transformers import pipeline
3
+ import pdfplumber
4
+ import re
5
  from langchain.text_splitter import RecursiveCharacterTextSplitter
6
  from sentence_transformers import SentenceTransformer, util
7
 
8
+ # Load Hugging Face Question Answering model
9
  qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
10
 
11
  # Load Embeddings Model for Better Context Matching
12
  embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
13
 
14
+ # Function to Extract and Clean Text from PDF
15
def extract_clean_text(pdf_path):
    """Extract the text of every page in a PDF and normalise its whitespace.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A single-line string: the text of all pages, with every run of
        whitespace (including newlines) collapsed to one space and any
        space directly before a period removed.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None for a page with no extractable text
        # (e.g. a scanned image) in some pdfplumber versions; treat that as
        # empty instead of raising TypeError on `None + "\n"`.
        page_texts = [page.extract_text() or "" for page in pdf.pages]

    # Each page contributes its text followed by a newline, as before.
    text = "".join(f"{page_text}\n" for page_text in page_texts)

    # Collapse every run of whitespace (spaces, tabs, newlines) to one space.
    text = re.sub(r"\s+", " ", text)
    # Remove a space that sits directly before a period.
    text = text.replace(" .", ".")

    # NOTE(review): intended to turn "Label:" lines into "## Label" headings,
    # but the pattern requires a preceding "\n" and all newlines were collapsed
    # above, so this substitution currently never matches. Kept unchanged so
    # the output stays the same; to enable headings it would have to run
    # before the whitespace collapse (and the collapse would need to preserve
    # the inserted newlines).
    text = re.sub(r'(?<=\n)([A-Z][a-z]+.*?):', r'\n\n## \1\n', text)

    return text
29
 
30
  # Function to Split Text into Chunks
 
49
  st.title("Chat with AWS Restart PDF")
50
 
51
  # Load and Process PDF
52
+ pdf_path = "AWS restart program information.docx.pdf" # Change to your uploaded file
53
+ pdf_text = extract_clean_text(pdf_path) # Extract & clean text
54
+ chunks = split_text(pdf_text) # Split into chunks
55
 
56
  st.write("✅ PDF Loaded Successfully!")
57