# First_Aid_Kit / app.py
# Streamlit RAG demo: downloads a Google Drive PDF, chunks its text,
# embeds the chunks into a FAISS index, and answers questions about it
# using a Groq-hosted model. (Header reconstructed from HF viewer residue.)
import io
import os

import faiss
import nltk
import PyPDF2
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer
# Ensure the NLTK "punkt" sentence-tokenizer data is present before
# chunk_text() calls nltk.sent_tokenize; download it on first run only.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')
# Module-level Groq client; the API key is read from the GROQ_API_KEY
# environment variable (None if unset — Groq calls will then fail at use).
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Parameters
    ----------
    pdf_url : str
        Shareable link of the form
        ``https://drive.google.com/file/d/<ID>/view?usp=sharing``.

    Returns
    -------
    str
        Concatenated text of all pages; pages with no extractable text
        (e.g. scanned images) contribute nothing.

    Raises
    ------
    requests.HTTPError
        If the download does not return a 2xx status.
    """
    # Rewrite .../file/d/<ID>/view?usp=sharing -> .../uc?id=<ID>
    # (the direct-download form of a Drive share link).
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    # Fail loudly instead of feeding an HTML error page to the PDF parser.
    response.raise_for_status()

    # Parse in memory: no temp.pdf on disk to leak if parsing raises,
    # and no race between concurrent Streamlit sessions sharing one file.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # extract_text() may return None for image-only pages; coalesce to "".
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, chunk_size=300):
    """Split *text* into sentence-aligned chunks of at most ~chunk_size words.

    Sentences are never split; a chunk is closed as soon as adding the
    next sentence would push its word count past ``chunk_size``. A single
    sentence longer than ``chunk_size`` becomes its own chunk.

    Parameters
    ----------
    text : str
        Raw document text.
    chunk_size : int, optional
        Approximate maximum number of whitespace-separated words per chunk.

    Returns
    -------
    list[str]
        Non-empty chunks in document order (empty list for empty input).
    """
    sentences = nltk.sent_tokenize(text)
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in sentences:
        sentence_length = len(sentence.split())
        # Close the running chunk only if it is non-empty; the original
        # appended "" when the very first sentence already exceeded the
        # budget, producing spurious empty chunks.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def create_faiss_index(chunks, model=None):
    """Embed text chunks and build an exact-L2 FAISS index over them.

    Parameters
    ----------
    chunks : list[str]
        Text chunks to embed.
    model : SentenceTransformer, optional
        Embedding model to reuse. When omitted, "all-MiniLM-L6-v2" is
        loaded fresh (backward-compatible with existing call sites,
        which otherwise load the same model a second time for querying).

    Returns
    -------
    tuple
        ``(index, embeddings)`` — a ``faiss.IndexFlatL2`` already
        populated with the embeddings, and the embedding matrix itself.
    """
    if model is None:
        model = SentenceTransformer("all-MiniLM-L6-v2")
    embeddings = model.encode(chunks)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings
def query_faiss(index, query, chunks, model, k=3):
    """Return the ``k`` chunks most similar to *query*.

    Parameters
    ----------
    index : faiss.Index
        Index built over the embeddings of ``chunks``.
    query : str
        Natural-language question.
    chunks : list[str]
        Texts in the same order as the indexed embeddings.
    model : SentenceTransformer
        The same embedding model used to build the index.
    k : int, optional
        Number of neighbours to retrieve (default 3, as before).

    Returns
    -------
    list[str]
        Up to ``k`` matching chunks, nearest first.
    """
    query_vector = model.encode([query])
    distances, indices = index.search(query_vector, k)
    # FAISS pads the result with -1 when fewer than k vectors are indexed;
    # the original's chunks[i] then silently returned chunks[-1].
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
# Main Streamlit App
def main():
st.title("RAG-based Application")
st.write("Interact with your document using Groq-powered model.")
# Pre-defined document link
doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
# Extract Document Content
if "document_text" not in st.session_state:
st.write("Extracting document content...")
text = extract_text_from_pdf(doc_link)
st.session_state['document_text'] = text
st.success("Document content extracted!")
# Process Document and Create FAISS Index
if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
st.write("Processing document...")
chunks = chunk_text(st.session_state['document_text'])
index, embeddings = create_faiss_index(chunks)
st.session_state['faiss_index'] = index
st.session_state['chunks'] = chunks
st.session_state['model'] = SentenceTransformer("all-MiniLM-L6-v2")
st.success(f"Document processed into {len(chunks)} chunks!")
# Query the Document
if 'faiss_index' in st.session_state:
st.header("Ask Questions")
query = st.text_input("Enter your question here")
if st.button("Query Document"):
results = query_faiss(st.session_state['faiss_index'],