File size: 2,561 Bytes
4b5b4ff
 
 
 
 
 
 
 
6a1e254
4b5b4ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a1e254
 
 
5f8977c
 
4b5b4ff
 
 
 
 
5f8977c
4b5b4ff
 
 
 
6a1e254
4b5b4ff
 
 
 
5f8977c
 
4b5b4ff
6a1e254
4b5b4ff
 
5f8977c
4b5b4ff
5f8977c
4b5b4ff
 
 
 
 
 
 
 
 
 
5f8977c
4b5b4ff
 
 
 
 
0f7414c
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq

# --- Runtime setup -------------------------------------------------------

# Download the punkt tokenizer data at runtime (no-op if it is already
# present from the build step); quiet=True suppresses the download log.
nltk.download('punkt', quiet=True)

# SECURITY: a live API key must never be committed to source control.
# Read it from the environment instead; the hard-coded literal is kept only
# as a fallback so existing deployments keep working — rotate that key and
# delete the fallback as soon as possible.
GROQ_API_KEY = os.environ.get(
    "GROQ_API_KEY",
    "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",
)

# Initialize Groq client used by query_groq() below.
client = Groq(api_key=GROQ_API_KEY)

# NOTE: the previous import-time "test query" was removed — Streamlit
# re-executes this entire script on every widget interaction, so that call
# fired (and was billed) on every rerun, not just once at startup.

# Sentence-embedding model; all-MiniLM-L6-v2 produces 384-dim vectors.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Flat L2 FAISS index sized to the model's embedding dimension.
dimension = 384  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)

# Function to Extract Text from PDF
# Function to Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in *pdf_file*.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader`` (e.g. a Streamlit ``UploadedFile``).

    Returns:
        A single string with all extractable page text joined together.
        Pages with no extractable text (e.g. scanned images) contribute
        an empty string instead of crashing.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # extract_text() may return None for image-only pages; the original
    # ``text += page.extract_text()`` raised TypeError in that case.
    # ``or ""`` substitutes an empty string, and str.join avoids the
    # quadratic cost of repeated string concatenation.
    return "".join(page.extract_text() or "" for page in pdf_reader.pages)

# Function to Chunk and Tokenize Text
def chunk_and_tokenize(text):
    sentences = sent_tokenize(text)
    chunks = [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]
    return chunks

# Function to Create Embeddings
def create_embeddings(chunks):
    embeddings = model.encode(chunks)
    return embeddings

# Function to Query Groq
def query_groq(prompt):
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content

# --- Streamlit frontend --------------------------------------------------
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # preview only: first 500 characters

    chunks = chunk_and_tokenize(text)
    st.write(f"Text divided into {len(chunks)} chunks.")

    embeddings = create_embeddings(chunks)
    # NOTE(review): Streamlit re-runs this script on every widget
    # interaction, so the same vectors are re-added to the module-level
    # index on each rerun; consider st.session_state / st.cache_resource
    # to build the index exactly once per upload — confirm desired UX.
    index.add(embeddings)
    st.write("Embeddings created and stored in FAISS database.")

    query = st.text_input("Enter your query:")
    if query:
        # Embed the query and retrieve the single nearest chunk (L2).
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, 1)
        relevant_chunk = chunks[indices[0][0]]

        # BUG FIX: the original sent only the retrieved chunk to the LLM
        # and silently dropped the user's question. Combine the chunk as
        # context with the question so the model can actually answer it.
        prompt = (
            "Use the following context to answer the question.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {query}"
        )
        response = query_groq(prompt)
        st.write("Response from Groq:")
        st.write(response)