Spaces:
Sleeping
Sleeping
File size: 3,972 Bytes
9a44baf 6d7222a 0632146 2ca9eec 6518b55 6d7222a 2ca9eec e1410c3 6d7222a 2ca9eec 6d7222a 9a44baf 2ca9eec e8b6274 2ca9eec e8b6274 c39358b e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec e8b6274 2ca9eec 6d7222a b4e84e9 6d7222a 2ca9eec 6d7222a 2ca9eec 6d7222a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import os
import streamlit as st
import requests
from PyPDF2 import PdfReader
from langchain_community.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq
# Hardcoded Google Drive link
GOOGLE_DRIVE_LINK = "https://drive.google.com/file/d/1wv5gbGP0SA15BzoNUxprXhYx0jHhPgHl/view?usp=sharing"
# Function to download the PDF from Google Drive
def download_pdf():
    """Download the hardcoded Google Drive PDF to ./document.pdf.

    Returns:
        str: Path of the downloaded file ("document.pdf").

    Raises:
        requests.HTTPError: If the download request returns an error status.
    """
    # Share links look like .../d/<file_id>/view?usp=sharing — pull out the id.
    file_id = GOOGLE_DRIVE_LINK.split("/d/")[1].split("/view")[0]
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # Timeout prevents an indefinite hang; raise_for_status() fails loudly
    # instead of silently writing an HTML error page to disk as a "PDF".
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    return "document.pdf"
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Concatenate the text of every page in *pdf_file*.

    Args:
        pdf_file: Path (or file-like object) of the PDF to read.

    Returns:
        str: All extracted page text joined together ("" for image-only PDFs).
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None for pages with no text layer (e.g.
    # scanned images); "or ''" avoids a TypeError during concatenation.
    # str.join is also linear instead of the quadratic += loop.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to create FAISS vector database
def create_vector_db(text):
    """Chunk *text* and index the chunks in a FAISS vector store.

    Args:
        text: Full document text to index.

    Returns:
        FAISS: Vector store built from 500-char chunks (50-char overlap),
        embedded with the "all-MiniLM-L6-v2" sentence-transformer model.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    # Hugging Face sentence-transformer embeddings for each chunk.
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embeddings)
# Function to query Groq API
def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Ask the Groq chat-completions API a question grounded in *context*.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer in.
        model: Groq model identifier (default "llama-3.3-70b-versatile").

    Returns:
        str: The model's answer, "No response." if the response payload is
        missing the expected fields, or "Error: ..." on any request failure.
    """
    # SECURITY: this API key was hardcoded in the source. Prefer the
    # GROQ_API_KEY environment variable and keep the old key only as a
    # backward-compatible fallback — this key is exposed and should be
    # rotated immediately.
    GROQ_API_KEY = os.environ.get(
        "GROQ_API_KEY",
        "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",
    )
    os.environ["GROQ_API_KEY"] = GROQ_API_KEY
    # OpenAI-compatible chat-completions endpoint.
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}",  # Retrieve from environment
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"}
        ],
    }
    try:
        # Timeout prevents the Streamlit app from hanging forever on a
        # stalled connection; raise_for_status surfaces HTTP errors.
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()
        result = response.json()
        # Defensive extraction: missing keys degrade to "No response."
        return result.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except requests.exceptions.RequestException as e:
        # Return the error as text so the UI can display it instead of crashing.
        return f"Error: {e}"
# Streamlit App
st.title("PDF Book Query and Response")  # fixed "Querry" typo in the page title

# Persist the vector database across Streamlit reruns (every widget
# interaction re-executes this script from the top).
if "vector_db" not in st.session_state:
    st.session_state.vector_db = None

# Download and index the hardcoded PDF on demand.
if st.button("Process PDF"):
    st.info("Downloading and processing the PDF...")
    pdf_file = download_pdf()
    pdf_text = extract_text_from_pdf(pdf_file)
    st.success("PDF processed successfully!")
    # Create FAISS vector database
    st.info("Creating vector database...")
    st.session_state.vector_db = create_vector_db(pdf_text)
    st.success("Vector database created!")

# Query UI only appears once the vector database has been built.
if st.session_state.vector_db:
    user_query = st.text_input("Ask a question about the document:")
    if st.button("Submit Query"):
        # Guard against submitting an empty question.
        if not user_query.strip():
            st.warning("Please enter a question first.")
        else:
            with st.spinner("Processing your query..."):
                # Retrieve the 3 most similar chunks as grounding context.
                similar_docs = st.session_state.vector_db.similarity_search(user_query, k=3)
                context = " ".join(doc.page_content for doc in similar_docs)
                # Send query with context to Groq API
                response = query_groq_api(user_query, context)
                st.write("**Answer:**", response)