File size: 2,223 Bytes
09cb6d9
 
 
 
 
9db8b94
09cb6d9
 
 
 
c51c32f
09cb6d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ece1a2
baa254c
09cb6d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import os
from groq import Groq
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from PyPDF2 import PdfReader
import streamlit as st
from tempfile import NamedTemporaryFile

# Module-level Groq client used by query_vector_db. The API key is looked up in
# the "groq_api_key" environment variable; None is passed when it is unset.
client = Groq(api_key=os.environ.get("groq_api_key"))

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file_path):
  """Return the text of every page of a PDF, concatenated in page order.

  Args:
    pdf_file_path: Value handed directly to ``PdfReader`` (the app passes a
      filesystem path to a temporary ``.pdf`` file).

  Returns:
    A single string with each page's extracted text joined without a
    separator; an empty string when no page yields any text.
  """
  pdf_reader = PdfReader(pdf_file_path)
  # Join once instead of growing a string with += for every page.
  # `or ''` is defensive: should a page yield None instead of a string
  # (presumably possible for image-only pages in some extractor versions —
  # TODO confirm), the join still succeeds.
  return ''.join(page.extract_text() or '' for page in pdf_reader.pages)

#Function to split text into chunks
def chunk_text(text, chunk_size=500, chunk_overlap=50):
  """Split *text* into overlapping chunks.

  Args:
    text: The string to split.
    chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
      ``chunk_size`` (default 500).
    chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
      ``chunk_overlap`` (default 50).

  Returns:
    Whatever ``split_text`` returns for *text* (the chunks produced by the
    splitter).
  """
  splitter_options = {'chunk_size': chunk_size, 'chunk_overlap': chunk_overlap}
  return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)

def create_embeddings_and_store(chunks):
  """Embed *chunks* and index them in a new FAISS vector store.

  A ``HuggingFaceEmbeddings`` instance for the
  ``sentence-transformers/all-MiniLM-L6-v2`` model is created on every call
  and used as the embedding function for ``FAISS.from_texts``.

  Args:
    chunks: The texts to embed and index.

  Returns:
    The vector store built by ``FAISS.from_texts``.
  """
  embedder = HuggingFaceEmbeddings(
      model_name="sentence-transformers/all-MiniLM-L6-v2"
  )
  return FAISS.from_texts(chunks, embedding=embedder)

def query_vector_db(query, vector_db, k=3, model='llama3-8b-8192'):
  """Answer *query* with the LLM, grounded on the most similar stored chunks.

  The ``k`` nearest documents are retrieved from *vector_db*, their
  ``page_content`` is joined with newlines, and that context is sent as the
  system message of a chat completion made through the module-level Groq
  ``client``.

  Args:
    query: The user's question; used both for the similarity search and as
      the user message.
    vector_db: Vector store exposing ``similarity_search(query, k=...)``.
    k: Number of documents to retrieve (default 3, the previous fixed value).
    model: Model id passed to the chat completion (default is the previously
      hard-coded id). NOTE(review): confirm this id is still served by the
      provider.

  Returns:
    The ``content`` of the first choice's message in the completion.
  """
  docs = vector_db.similarity_search(query, k=k)
  context = '\n'.join(doc.page_content for doc in docs)

  chat_completion = client.chat.completions.create(
      messages=[
          # Prompt typo fixed ("contect" -> "context") so the instruction
          # given to the model is unambiguous.
          {'role': 'system', 'content': f"use the following context : \n{context}"},
          {'role': 'user', 'content': query},
      ],
      model=model,
  )
  return chat_completion.choices[0].message.content

#Streamlit APP
# Flow: upload a PDF -> extract text -> chunk -> embed into FAISS -> answer
# user queries against the index.
st.title("Rag Based Application")

upload_file = st.file_uploader("Upload a PDF Document", type =['pdf'])

if upload_file:
  # Streamlit re-runs this script from the top on every widget interaction
  # (including submitting a query), so the index is kept in session state and
  # rebuilt only when a different file is uploaded.
  file_key = (upload_file.name, upload_file.size)

  if st.session_state.get("indexed_file_key") != file_key:
    # The PDF is written to a temporary file so it can be opened by path.
    # getvalue() returns the whole buffer regardless of the current read
    # position, unlike read().
    with NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
      temp_file.write(upload_file.getvalue())
      pdf_path = temp_file.name

    try:
      text = extract_text_from_pdf(pdf_path)
    finally:
      # delete=False leaves the file on disk; remove it once it has been read
      # so temp files do not accumulate.
      os.remove(pdf_path)
    st.write("PDF Text Extracted Successful")

    chunks = chunk_text(text)
    if not chunks:
      # Nothing to embed (e.g. no extractable text); stop before building an
      # index from an empty list.
      st.error("No extractable text was found in this PDF.")
      st.stop()
    st.write("Text Chunked Successfully")

    st.session_state["vector_db"] = create_embeddings_and_store(chunks)
    st.session_state["indexed_file_key"] = file_key
    st.write("Embeddings Generate and Store Successfully")

  vector_db = st.session_state["vector_db"]

  user_query = st.text_input("Enter your query : ")

  if user_query:
    response = query_vector_db(user_query, vector_db)
    st.write("Response from LLM : ")
    st.write(response)