Spaces:

SujathaL
/

AWS_Restart_Program_Chatbot

Sleeping

SujathaL committed on Mar 3, 2025

Commit

5752742

verified ·

1 Parent(s): 6c5b356

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,22 +1,30 @@
 import streamlit as st
 from transformers import pipeline
-import PyPDF2
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer, util
-# Load the Question Answering Model
 qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
 # Load Embeddings Model for Better Context Matching
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
-# Function to Extract Text from PDF
-def extract_text_from_pdf(pdf_path):
-    with open(pdf_path, "rb") as f:
-        pdf_reader = PyPDF2.PdfReader(f)
-        text = ""
-        for page in pdf_reader.pages:
             text += page.extract_text() + "\n"
     return text
 # Function to Split Text into Chunks
@@ -41,9 +49,9 @@ def find_best_chunk(question, chunks):
 st.title("Chat with AWS Restart PDF")
 # Load and Process PDF
-pdf_path = "AWS restart program information.docx.pdf"
-pdf_text = extract_text_from_pdf(pdf_path)
-chunks = split_text(pdf_text)
 st.write("✅ PDF Loaded Successfully!")

 import streamlit as st
 from transformers import pipeline
+import pdfplumber
+import re
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from sentence_transformers import SentenceTransformer, util
+# Load Hugging Face Question Answering model
 qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
 # Load Embeddings Model for Better Context Matching
 embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
+# Function to Extract and Clean Text from PDF
+def extract_clean_text(pdf_path):
+    text = ""
+    with pdfplumber.open(pdf_path) as pdf:
+        for page in pdf.pages:
             text += page.extract_text() + "\n"
+    # Remove extra spaces and newlines
+    text = re.sub(r'\s+', ' ', text)  # Replace multiple spaces/newlines with a single space
+    text = text.replace(" .", ".")  # Fix misplaced spaces before periods
+    # Add section headers where possible
+    text = re.sub(r'(?<=\n)([A-Z][a-z]+.*?):', r'\n\n## \1\n', text)  # Convert labels into headings
     return text
 # Function to Split Text into Chunks
 st.title("Chat with AWS Restart PDF")
 # Load and Process PDF
+pdf_path = "AWS restart program information.docx.pdf"  # Change to your uploaded file
+pdf_text = extract_clean_text(pdf_path)  # Extract & clean text
+chunks = split_text(pdf_text)  # Split into chunks
 st.write("✅ PDF Loaded Successfully!")