import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq
# Download the punkt resource at runtime (in case it wasn't downloaded during build)
nltk.download('punkt')

# SECURITY FIX: read the Groq API key from the environment instead of
# hard-coding it. A key committed to source control must be treated as
# leaked and revoked.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")

# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)

# Smoke-test the client once at startup so a bad/missing key fails fast.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "Test query to verify Groq API"}],
    model="llama3-8b-8192",
)
print(response.choices[0].message.content)

# Load Sentence Transformer model used for both chunk and query embeddings.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS index over L2 distance.
dimension = 384  # Dimension of the embeddings produced by all-MiniLM-L6-v2
index = faiss.IndexFlatL2(dimension)
# Function to Extract Text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract and concatenate the text of every page of a PDF.

    Args:
        pdf_file: A path or binary file-like object readable by PyPDF2.

    Returns:
        str: Concatenated page text; pages with no extractable text
        contribute an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # BUG FIX: extract_text() may return None for image-only pages,
        # which would make `text += None` raise TypeError.
        text += page.extract_text() or ""
    return text
# Function to Chunk and Tokenize Text
def chunk_and_tokenize(text, chunk_size=5):
    """Split text into chunks of consecutive sentences.

    Args:
        text: Raw document text.
        chunk_size: Number of sentences per chunk. Defaults to 5, the
            value previously hard-coded, so existing callers are unchanged.

    Returns:
        list[str]: Sentence groups joined by single spaces; empty list
        when *text* contains no sentences.
    """
    sentences = sent_tokenize(text)
    return [
        ' '.join(sentences[i:i + chunk_size])
        for i in range(0, len(sentences), chunk_size)
    ]
# Function to Create Embeddings
def create_embeddings(chunks):
    """Encode text chunks into dense vectors with the module-level model.

    Args:
        chunks: Sequence of strings to embed.

    Returns:
        The embedding matrix returned by ``model.encode`` — one row per
        chunk (384-dimensional for all-MiniLM-L6-v2, matching the FAISS
        index set up above).
    """
    return model.encode(chunks)
# Function to Query Groq
def query_groq(prompt):
    """Send a single-turn user prompt to the Groq chat API.

    Args:
        prompt: The user message content.

    Returns:
        str: The assistant's reply text.
    """
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
    )
    return completion.choices[0].message.content
# Streamlit Frontend
# --- Streamlit frontend ---
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # Display first 500 characters

    chunks = chunk_and_tokenize(text)
    st.write(f"Text divided into {len(chunks)} chunks.")

    embeddings = create_embeddings(chunks)
    index.add(embeddings)
    st.write("Embeddings created and stored in FAISS database.")

    query = st.text_input("Enter your query:")
    if query:
        # Retrieve the chunk nearest to the query in embedding space.
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, 1)
        relevant_chunk = chunks[indices[0][0]]
        # BUG FIX: the original passed only the retrieved chunk to the
        # LLM and dropped the user's question entirely, so the model
        # never saw what was being asked. Combine context + question.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {query}"
        )
        response = query_groq(prompt)
        st.write("Response from Groq:")
        st.write(response)
|