# First_Aid_Kit / app.py
# (Hugging Face Space metadata: uploaded by NHZ, commit e7ac282 "Update app.py")
import io
import os

import PyPDF2
import requests
import streamlit as st
from groq import Groq
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from sentence_transformers import SentenceTransformer
# Initialize Groq client
# NOTE(review): requires the GROQ_API_KEY environment variable; if it is
# unset, os.getenv returns None and the first API call will fail with an
# auth error rather than at startup.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Function to extract text from a PDF
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Args:
        pdf_url: A Google Drive shareable URL of the form
            ``.../file/d/<ID>/view?usp=sharing``.

    Returns:
        The concatenated extracted text of every page. Pages with no
        extractable text (e.g. scanned images) contribute an empty string.

    Raises:
        requests.HTTPError: If the download does not return a 2xx status.
    """
    # Convert the shareable link to a direct-download link:
    # .../file/d/<ID>/view?usp=sharing  ->  .../uc?id=<ID>
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    # Fail loudly on a bad download instead of feeding an HTML error page
    # to the PDF parser (the original skipped this check).
    response.raise_for_status()
    # Parse in memory. The original wrote a "temp.pdf" to disk and deleted
    # it afterwards, which leaked the file on any intermediate exception
    # and raced with concurrent sessions sharing the working directory.
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))
    # page.extract_text() may return None for image-only pages; the
    # original's `text += page.extract_text()` raised TypeError there.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to chunk text manually
def chunk_text(text, chunk_size=300):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words.

    Args:
        text: The text to split. Any run of whitespace separates words.
        chunk_size: Maximum number of words per chunk (default 300).

    Returns:
        A list of strings, each containing up to ``chunk_size`` words
        joined by single spaces. Empty (or all-whitespace) input yields [].
    """
    words = text.split()
    # Each item from str.split() contains no whitespace, so the original
    # loop's `len(word.split())` was always 1 and the accumulator simply
    # filled fixed-size groups — equivalent to slicing the word list.
    return [
        " ".join(words[start:start + chunk_size])
        for start in range(0, len(words), chunk_size)
    ]
# Function to create embeddings and store them in FAISS using Langchain
def create_faiss_index(chunks):
    """Embed text chunks and index them in an in-memory FAISS store.

    Args:
        chunks: List of text strings to embed and index.

    Returns:
        A LangChain FAISS vector store supporting similarity_search().
    """
    # HuggingFaceEmbeddings loads the sentence-transformers model itself;
    # the original additionally instantiated SentenceTransformer(...) and
    # never used it, loading the same model into memory twice.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    # Build the FAISS vector store directly from the raw texts.
    return FAISS.from_texts(chunks, embeddings)
# Function to query FAISS and retrieve relevant document chunks
def query_faiss(doc_search, query):
    """Retrieve the text of the three chunks most similar to *query*.

    Args:
        doc_search: A vector store exposing ``similarity_search(query, k)``.
        query: The natural-language question to search for.

    Returns:
        A list of the matched chunks' ``page_content`` strings.
    """
    matched_texts = []
    for hit in doc_search.similarity_search(query, k=3):
        matched_texts.append(hit.page_content)
    return matched_texts
# Main Streamlit App
def main():
    """Streamlit entry point: download the PDF, index it, and answer questions.

    The flow is gated on st.session_state so the expensive steps (download,
    embedding/indexing) run once per session instead of on every rerun.
    Session keys written: 'document_text', 'faiss_index', 'chunks'.
    """
    st.title("RAG-based Application")
    st.write("Interact with your document using Groq-powered model.")
    # Pre-defined document link (hard-coded Google Drive share URL).
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
    # Step 1: extract document content — only on the first run of a session.
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state['document_text'] = text
        st.success("Document content extracted!")
    # Step 2: chunk the text and build the FAISS index, also once per session.
    if 'document_text' in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state['document_text'])
        doc_search = create_faiss_index(chunks)
        st.session_state['faiss_index'] = doc_search
        st.session_state['chunks'] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")
    # Step 3: query UI — available once the index exists.
    if 'faiss_index' in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document"):
            results = query_faiss(st.session_state['faiss_index'], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i+1}:** {result}")
                # Combine retrieved chunks into a single context block
                # for the LLM prompt.
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )
                # Ask Groq's hosted Llama model to answer using the
                # retrieved context (module-level `client`).
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)
# Run the app only when executed as a script (Streamlit executes this module).
if __name__ == "__main__":
    main()