File size: 3,972 Bytes
9a44baf
 
6d7222a
 
 
 
 
0632146
 
2ca9eec
6518b55
6d7222a
 
 
2ca9eec
 
 
 
 
 
e1410c3
6d7222a
 
2ca9eec
 
 
 
 
6d7222a
 
 
 
 
 
 
 
 
 
 
9a44baf
2ca9eec
 
e8b6274
2ca9eec
e8b6274
 
c39358b
e8b6274
 
 
 
 
2ca9eec
 
e8b6274
2ca9eec
e8b6274
 
2ca9eec
 
 
 
 
 
 
 
 
e8b6274
2ca9eec
 
e8b6274
 
2ca9eec
e8b6274
 
2ca9eec
e8b6274
2ca9eec
e8b6274
2ca9eec
 
6d7222a
b4e84e9
6d7222a
 
 
 
 
 
 
 
 
2ca9eec
 
6d7222a
2ca9eec
 
 
 
6d7222a
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import os
import streamlit as st
import requests
from PyPDF2 import PdfReader
from langchain_community.vectorstores import FAISS
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from groq import Groq

# Hardcoded Google Drive link (used as the default source)
GOOGLE_DRIVE_LINK = "https://drive.google.com/file/d/1wv5gbGP0SA15BzoNUxprXhYx0jHhPgHl/view?usp=sharing"

# Function to download the PDF from Google Drive
def download_pdf(drive_link=GOOGLE_DRIVE_LINK, output_path="document.pdf"):
    """Download a PDF shared via a Google Drive link to a local file.

    Args:
        drive_link: A Google Drive ``.../file/d/<id>/view`` share URL.
            Defaults to the module-level ``GOOGLE_DRIVE_LINK``.
        output_path: Local path the PDF is written to.

    Returns:
        ``output_path``, the path of the downloaded file.

    Raises:
        requests.HTTPError: If the download request returns an error status.
        IndexError: If the link does not contain a ``/d/<id>`` segment.
    """
    # Extract the file id from the ".../file/d/<id>/view..." share URL.
    file_id = drive_link.split("/d/")[1].split("/view")[0]
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # Stream so large PDFs are not buffered entirely in memory; the timeout
    # keeps the Streamlit app from hanging forever on a stalled request.
    # NOTE(review): very large Drive files require a virus-scan confirm
    # token that this direct-download URL does not handle — verify the
    # target file is small enough for a direct download.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()  # previously error pages were saved as the "PDF"
    with open(output_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    return output_path

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the concatenated text of every page of a PDF.

    Args:
        pdf_file: A path or file-like object accepted by ``PyPDF2.PdfReader``.

    Returns:
        All extractable page text joined into one string. Pages with no
        extractable text (e.g. scanned/image-only pages) contribute nothing.
    """
    reader = PdfReader(pdf_file)
    # extract_text() may return None for image-only pages, which would raise
    # TypeError under string concatenation — coerce to "". join() also avoids
    # the quadratic cost of repeated `+=` on large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)

# Function to create FAISS vector database
def create_vector_db(text, chunk_size=500, chunk_overlap=50,
                     model_name="all-MiniLM-L6-v2"):
    """Split *text* into chunks and index them in a FAISS vector store.

    Args:
        text: The document text to index.
        chunk_size: Maximum characters per chunk (default 500).
        chunk_overlap: Characters of overlap between adjacent chunks (default 50).
        model_name: Hugging Face sentence-embedding model to use.

    Returns:
        A ``FAISS`` vector store built from the text chunks.

    Raises:
        ValueError: If *text* yields no chunks (empty/whitespace document) —
            FAISS cannot build an index from zero texts.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_text(text)
    if not chunks:
        raise ValueError("No text chunks produced; the document appears to be empty.")

    # Use Hugging Face Embeddings
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_db = FAISS.from_texts(chunks, embeddings)
    return vector_db

# Function to query Groq API
def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Answer *query* using *context* via the Groq chat-completions API.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer in.
        model: Groq model identifier (default "llama-3.3-70b-versatile").

    Returns:
        The assistant's answer string, or a human-readable "Error: ..."
        string if the key is missing or the request fails (the Streamlit
        caller displays whatever is returned).
    """
    # SECURITY: the key is read from the environment instead of being
    # hardcoded in source. The previously committed key is exposed in the
    # repository history and must be revoked/rotated.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "Error: GROQ_API_KEY environment variable is not set."

    # Groq exposes an OpenAI-compatible chat-completions endpoint.
    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # Data to send to the API
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"}
        ],
    }

    try:
        # Timeout keeps the UI from hanging indefinitely on a stalled request.
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses

        result = response.json()

        # Defensively walk the response shape; fall back to a readable message.
        return result.get("choices", [{}])[0].get("message", {}).get("content", "No response.")

    except requests.exceptions.RequestException as e:
        # Surface the failure to the UI rather than crashing the app.
        return f"Error: {e}"

# Streamlit App: download/index the PDF once, then answer questions against it.
st.title("PDF Book Query and Response")  # fixed typo: "Querry"

# Persistent state to store vector database across Streamlit reruns
if "vector_db" not in st.session_state:
    st.session_state.vector_db = None

# Process the hardcoded PDF link
if st.button("Process PDF"):
    st.info("Downloading and processing the PDF...")
    pdf_file = download_pdf()
    pdf_text = extract_text_from_pdf(pdf_file)
    st.success("PDF processed successfully!")

    # Create FAISS vector database
    st.info("Creating vector database...")
    st.session_state.vector_db = create_vector_db(pdf_text)
    st.success("Vector database created!")

# Query the document (only once the vector DB exists)
if st.session_state.vector_db:
    user_query = st.text_input("Ask a question about the document:")
    if st.button("Submit Query"):
        if not user_query.strip():
            # Guard: don't run retrieval/LLM on an empty question.
            st.warning("Please enter a question first.")
        else:
            with st.spinner("Processing your query..."):
                # Retrieve the 3 most similar text chunks as context
                similar_docs = st.session_state.vector_db.similarity_search(user_query, k=3)
                context = " ".join([doc.page_content for doc in similar_docs])

                # Send query with context to Groq API
                response = query_groq_api(user_query, context)
                st.write("**Answer:**", response)