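# First_Aid_Kit: a Streamlit RAG app that downloads a first-aid PDF from
# Google Drive, indexes it with FAISS, and answers questions with a Groq LLM.
#
# Assumed setup (package versions not pinned): pip install streamlit requests
# PyPDF2 langchain langchain-groq sentence-transformers faiss-cpu
# Run with the GROQ_API_KEY environment variable set:  streamlit run app.py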
import os

import requests
import streamlit as st
from PyPDF2 import PdfReader
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
# langchain ships no GroqLLM; the Groq integration lives in the separate
# langchain-groq package.
from langchain_groq import ChatGroq

# Initialize the Groq chat model (the model name here is an example;
# substitute any model enabled for your Groq account)
llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=os.getenv("GROQ_API_KEY"))
# Extract text from a publicly shared Google Drive PDF link.
# Cached so the PDF is not re-downloaded on every Streamlit rerun.
@st.cache_data
def extract_pdf_content(drive_url):
    file_id = drive_url.split("/d/")[1].split("/view")[0]
    download_url = f"https://drive.google.com/uc?export=download&id={file_id}"
    response = requests.get(download_url)
    if response.status_code != 200:
        return None
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    reader = PdfReader("document.pdf")
    text = ""
    for page in reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text
# Build a FAISS vector store over the document, split into sentences.
# Cached so the embedding model loads and the index builds only once.
@st.cache_resource
def create_vector_store(text):
    sentences = [s for s in text.split(". ") if s.strip()]
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    vector_store = FAISS.from_texts(sentences, embedding=embeddings)
    return vector_store, sentences
# Streamlit app
st.title("RAG-based Application with Focused Context")
# Predefined Google Drive link
drive_url = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"
# Extract document content
st.write("Extracting content from the document...")
text = extract_pdf_content(drive_url)
if text:
    st.write("Document extracted successfully!")
    st.write("Creating vector store...")
    vector_store, sentences = create_vector_store(text)
    st.write("Vector store created successfully!")

    query = st.text_input("Enter your query:")
    if query:
        st.write("Retrieving relevant context from the document...")
        retriever = vector_store.as_retriever(search_kwargs={"k": 3})  # retrieve top 3 matches
        # Prompt template that keeps the LLM answering from the retrieved context
        prompt_template = PromptTemplate(
            template="""
Use the following context to answer the question:
{context}

Question: {question}
Answer:""",
            input_variables=["context", "question"],
        )

        # RetrievalQA has no plain constructor taking llm/retriever/prompt;
        # build it with from_chain_type and pass the prompt to the "stuff" chain
        qa_chain = RetrievalQA.from_chain_type(
            llm=llm,
            retriever=retriever,
            chain_type="stuff",
            chain_type_kwargs={"prompt": prompt_template},
        )

        # Run the query through the QA chain
        result = qa_chain.run(query)
        st.write("Answer:", result)
else:
    st.error("Failed to extract content from the document.")