File size: 2,381 Bytes
8255e91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import os
import streamlit as st
from dotenv import load_dotenv

from embedding import get_embedding
from vector import VectorStore
from parse import PDFTextExtractor
from chunk import SimpleTextChunker
from llm import ask_llm

# Load environment variables (python-dotenv reads them from a .env file).
load_dotenv()

# Streamlit re-executes this script on every interaction, so keep a single
# VectorStore per session in st.session_state instead of rebuilding it.
if "store" not in st.session_state:
    st.session_state["store"] = VectorStore()


# The emoji was previously stored as mis-decoded UTF-8 bytes and rendered as
# garbage characters in the page heading.
st.title("📚 RAG Note Assistant - Upload & Ask")

# Directory where uploaded PDFs are written before text extraction.
PDF_FOLDER = "pdf_folder"
os.makedirs(PDF_FOLDER, exist_ok=True)

# Upload PDF files.
uploaded_files = st.file_uploader(
    "Upload new PDF documents", accept_multiple_files=True, type=["pdf"]
)

# Streamlit re-executes the whole script on every widget interaction, and the
# uploader keeps returning the same files until the user removes them.
# Remember which uploads were already ingested so a later rerun (for example,
# pressing "Submit") does not re-embed them and add duplicate chunks.
if "processed_files" not in st.session_state:
    st.session_state["processed_files"] = set()

if uploaded_files:
    # Neither helper depends on the individual file, so build them once.
    extractor = PDFTextExtractor(PDF_FOLDER)
    chunker = SimpleTextChunker(chunk_size=500, chunk_overlap=100)

    for file in uploaded_files:
        # Name plus size is used as the identity of an upload across reruns.
        file_key = (file.name, file.size)
        if file_key in st.session_state["processed_files"]:
            continue

        # Keep only the final path component so an upload name containing
        # directory parts cannot be written outside PDF_FOLDER.
        file_path = os.path.join(PDF_FOLDER, os.path.basename(file.name))
        with open(file_path, "wb") as f:
            f.write(file.getbuffer())

        # Extract text from the saved PDF, then split it into chunks.
        document = extractor.extract_text_from_pdf(file_path)
        chunks = chunker.process_document(document)

        if chunks:
            # Embed each chunk's "content" and add the vectors together with
            # their chunks to the session's vector store.
            embeddings = [get_embedding(chunk["content"]) for chunk in chunks]
            st.session_state["store"].add(embeddings, chunks)

            st.success(f" '{file.name}' has been successfully added to the knowledge base!")
        else:
            # Nothing to index; avoid calling the store with empty input and
            # reporting a misleading success.
            st.warning(f"No text could be extracted from '{file.name}'.")

        # Mark as handled only after ingestion finished without raising, so a
        # failed upload is retried on the next rerun.
        st.session_state["processed_files"].add(file_key)

# Ask a question against the uploaded documents.
question = st.text_input("Enter your question")

if st.button("Submit"):
    if not question.strip():
        st.warning(" Please enter a valid question.")
    else:
        # Embed the question and run a similarity search in the session store.
        query_embedding = get_embedding(question)
        relevant_chunks = st.session_state["store"].search(query_embedding)

        if not relevant_chunks:
            st.warning(" No relevant content found in the knowledge base. Please upload related documents first.")
        else:
            # Combine the retrieved chunks into one newline-separated context.
            # NOTE(review): ingestion reads chunk["content"] but search results
            # are read via chunk["text"]; confirm VectorStore.search returns
            # items with a "text" key.
            context = "\n".join(chunk["text"] for chunk in relevant_chunks)

            # Ask the LLM for the answer, showing a spinner while it runs.
            with st.spinner('AI is thinking...'):
                answer = ask_llm(question, context)

            # Heading emoji were previously stored as mis-decoded UTF-8 bytes
            # and rendered as garbage characters.
            st.markdown("### 🤖 AI Answer")
            st.write(answer)

            st.markdown("### 📖 Reference Chunks")
            st.write(context)