File size: 2,708 Bytes
9e5a092
5752742
23be0f8
 
 
a1e9850
4c3d6b3
9e5a092
4c3d6b3
 
 
2a2dd2f
5f0b01f
6c5b356
 
23be0f8
5752742
 
 
 
5f0b01f
 
 
 
9e5a092
6c5b356
23be0f8
 
 
 
 
 
5f0b01f
 
 
23be0f8
 
 
 
5f0b01f
23be0f8
5f0b01f
 
 
23be0f8
 
a1e9850
4c3d6b3
2a2dd2f
4c3d6b3
 
2a2dd2f
9e5a092
ce554ac
3a62d0c
23be0f8
 
 
 
 
a1e9850
5f0b01f
 
 
 
9e5a092
3a62d0c
 
9e5a092
 
23be0f8
2a2dd2f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import streamlit as st
import pdfplumber
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import pipeline

# Load Extractive QA Model (Like ChatPDF).
# NOTE: downloads model weights on first run and loads them once at import time;
# every Streamlit rerun re-executes this module, so keep these at top level.
model_name = "deepset/roberta-base-squad2"
qa_pipeline = pipeline("question-answering", model=model_name)

# Load Sentence Embeddings Model used for chunk/question embeddings below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to Extract & Clean PDF Text
def extract_clean_text(pdf_path):
    """Extract the text of every page in a PDF and return it cleaned.

    Args:
        pdf_path: Filesystem path to the PDF document.

    Returns:
        All page texts joined by newlines with surrounding whitespace
        stripped; an empty string if no page yields extractable text
        (e.g. a scanned/image-only PDF).
    """
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_text = page.extract_text()
            if extracted_text:  # Skip pages with no extractable text
                pages.append(extracted_text)
    # Join once instead of repeated += concatenation (quadratic on big PDFs);
    # strip() removes the leading/trailing whitespace exactly as before.
    return "\n".join(pages).strip()

# Function to Split Text into Chunks
def split_text(text, chunk_size=500):
    """Break *text* into overlapping chunks sized for embedding.

    A 50-character overlap between neighboring chunks preserves context
    across chunk boundaries.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=50,
    )
    return splitter.split_text(text)

# Function to Create FAISS Vector Database
def create_faiss_index(chunks):
    """Build an exact L2 FAISS index over embeddings of *chunks*.

    Args:
        chunks: List of text chunks to embed.

    Returns:
        Tuple ``(index, chunks, embeddings)`` on success, or
        ``(None, None, None)`` when *chunks* is empty (text extraction
        failed), so callers can detect the failure.
    """
    if not chunks:
        return None, None, None  # Avoid errors if text extraction fails
    # Encode all chunks in one batched call instead of a per-chunk Python
    # loop — sentence-transformers batches internally, which is much faster
    # and produces the same embeddings.
    embeddings = np.asarray(embedding_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks, embeddings

# Function to Find the Best Matching Chunk
def find_best_chunk(question, index, chunks, embeddings):
    """Return the chunk whose embedding is nearest (L2) to the question's.

    Args:
        question: User question to embed and search with.
        index: FAISS index from ``create_faiss_index``, or ``None``.
        chunks: The chunk list the index was built from.
        embeddings: Unused; kept for backward-compatible call signature.

    Returns:
        The single best-matching chunk, or an error string when the
        index is missing (empty/unreadable PDF).
    """
    if index is None:
        return "No valid text found in the PDF."
    question_embedding = embedding_model.encode(question).reshape(1, -1).astype(np.float32)
    # Already a float32 ndarray of shape (1, dim) — the extra np.array()
    # copy the original made before searching is unnecessary.
    _, closest_idx = index.search(question_embedding, 1)
    return chunks[closest_idx[0][0]]

# Function to Extract the Best Answer
def get_answer(question, context):
    """Run extractive QA over *context* and return the answer span
    (ChatPDF-like behavior)."""
    result = qa_pipeline(question=question, context=context)
    return result["answer"]

# Streamlit UI
st.title("Chat with AWS Restart PDF")

# Load & Process PDF at module level.
# NOTE(review): this re-extracts, re-chunks, and re-indexes the PDF on every
# Streamlit rerun (each widget interaction) — consider caching; confirm
# whether that is acceptable for this document size.
pdf_path = "AWS restart program information.docx.pdf"
pdf_text = extract_clean_text(pdf_path)
chunks = split_text(pdf_text)
# index is None (and chunks/embeddings None) when no text was extracted.
index, chunks, embeddings = create_faiss_index(chunks)

if pdf_text:
    st.write("✅ PDF Loaded Successfully!")
else:
    st.write("⚠ No valid text found in the PDF. Please check the document format.")

# User Input
question = st.text_input("Ask a question about AWS Restart program:")

if st.button("Get Answer") and question:
    # Retrieve the nearest chunk, then extract an answer span from it.
    # NOTE(review): if the index is None, find_best_chunk returns an error
    # string which is then fed to the QA model as context — confirm intended.
    relevant_chunk = find_best_chunk(question, index, chunks, embeddings)
    response = get_answer(question, relevant_chunk)
    st.write("Answer:", response)