# Zeeshan24's picture
# Create app.py
# 15da411 verified
import os
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from groq import Groq
import requests
from io import BytesIO
# API key is read from the environment instead of being hard-coded, so the
# secret is not committed to source control. (The previously committed key
# should be treated as leaked and revoked.)
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
# Function to download PDF from a URL
def download_pdf_from_url(url):
    """Download the PDF at *url* and return it as an in-memory file object.

    Returns a BytesIO wrapping the response body on success, or None after
    displaying a Streamlit error on any network/HTTP failure.
    """
    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the Streamlit script forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        st.error(f"Failed to download PDF: {e}")
        return None
    return BytesIO(response.content)
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file*.

    *pdf_file* is anything PdfReader accepts (path or binary file object).
    extract_text() returns None for pages with no extractable text (e.g.
    scanned/image-only pages); those are skipped instead of raising
    TypeError on string concatenation. str.join avoids the quadratic
    `text += ...` pattern.
    """
    reader = PdfReader(pdf_file)
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to split text into chunks
def create_chunks(text, chunk_size=500):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    tokens = text.split()
    pieces = []
    start = 0
    while start < len(tokens):
        pieces.append(" ".join(tokens[start:start + chunk_size]))
        start += chunk_size
    return pieces
# Function to create embeddings
def create_embeddings(chunks, model_name='all-MiniLM-L6-v2'):
    """Encode *chunks* (a list of strings) into a 2-D array of embeddings.

    The SentenceTransformer model is loaded once per model name and cached
    on the function object: this function is called again for every user
    query, and reloading the model on each call is very slow.
    """
    if not hasattr(create_embeddings, "_models"):
        create_embeddings._models = {}  # model_name -> loaded model
    models = create_embeddings._models
    if model_name not in models:
        models[model_name] = SentenceTransformer(model_name)
    return models[model_name].encode(chunks)
# Function to store embeddings in FAISS
def store_embeddings_in_faiss(embeddings):
    """Build and return a flat L2 (Euclidean) FAISS index holding *embeddings*."""
    vectors = np.asarray(embeddings)
    # Index dimensionality comes from the embedding width.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index
# Function to query FAISS index
def query_faiss(index, query_embedding, k=5):
    """Return the ids of the *k* nearest stored vectors to *query_embedding*.

    The distances FAISS returns are discarded; only the id array
    (shape: n_queries x k) is passed back to the caller.
    """
    _, neighbor_ids = index.search(query_embedding, k)
    return neighbor_ids
# Function to interact with Groq API
def send_query_to_groq(query):
    """Send *query* as a single user message to Groq's llama3-8b-8192 model
    and return the assistant's reply text."""
    client = Groq(api_key=GROQ_API_KEY)
    completion = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": query}],
    )
    return completion.choices[0].message.content
# Preload and process PDF links
def preload_pdfs(pdf_links):
    """Download each PDF in *pdf_links*, extract its text and return the
    combined list of word chunks. PDFs that fail to download are skipped
    (download_pdf_from_url already reported the error)."""
    st.write("Downloading and processing PDFs...")
    collected = []
    for link in pdf_links:
        pdf_stream = download_pdf_from_url(link)
        if not pdf_stream:
            continue
        collected.extend(create_chunks(extract_text_from_pdf(pdf_stream)))
    return collected
# Streamlit UI
def main():
    """Streamlit entry point: build the RAG index once, then answer queries.

    Fixes two defects of the original flow:
    1. The prompt sent to Groq previously contained only the retrieved
       context — the user's question was never included. The prompt now
       carries both.
    2. Streamlit reruns the whole script on every interaction, so the
       PDF download / embedding / index build is cached in
       st.session_state instead of being repeated on each rerun.
    """
    st.title("RAG-based Application")
    # Predefined PDF links
    pdf_links = [
        "https://drive.google.com/uc?id=1hF6exN7tYScy-mxQAP5X9R_200X-ukMB",  # Add your links here
        # Add more links as needed
    ]
    # Build the index only once per session.
    if "rag_index" not in st.session_state:
        chunks = preload_pdfs(pdf_links)
        if not chunks:
            # All downloads failed or the PDFs held no text; embedding an
            # empty list would crash downstream.
            st.error("No text could be extracted from the configured PDFs.")
            return
        embeddings = create_embeddings(chunks)
        st.session_state["rag_chunks"] = chunks
        st.session_state["rag_index"] = store_embeddings_in_faiss(embeddings)
        st.success("All PDFs processed successfully! You can now ask questions.")
    chunks = st.session_state["rag_chunks"]
    index = st.session_state["rag_index"]
    # Input for user query
    query = st.text_input("Ask your question:")
    if query:
        st.write("Fetching relevant chunks...")
        query_embedding = create_embeddings([query])
        relevant_indices = query_faiss(index, query_embedding)
        context = " ".join(chunks[i] for i in relevant_indices[0])
        st.write("Sending query to Groq API...")
        # Include BOTH the retrieved context and the actual question.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{context}\n\n"
            f"Question: {query}"
        )
        response = send_query_to_groq(prompt)
        st.write("Response:", response)
# Run the app when this file is executed directly (Streamlit executes the
# script top-to-bottom on each rerun).
if __name__ == "__main__":
    main()