Spaces:

makhdoomnaeem
/

Questions_Answers

Sleeping

App Files Files Community

makhdoomnaeem committed on Jan 5, 2025

Commit

6d7222a

verified ·

1 Parent(s): a15bbb6

Update app.py

Browse files

Files changed (1) hide show

app.py +85 -128

app.py CHANGED Viewed

@@ -1,141 +1,98 @@
 import os
-import pickle
-import re
 import streamlit as st
-from googleapiclient.discovery import build
-from google_auth_oauthlib.flow import InstalledAppFlow
-from sentence_transformers import SentenceTransformer
-import faiss
-from groq import Groq
-# Constants
-SCOPES = ['https://www.googleapis.com/auth/drive.readonly']
 # Initialize Groq Client
 GROQ_API_KEY = "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"  # Replace with your Groq API key
 os.environ["GROQ_API_KEY"] = GROQ_API_KEY
 client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
-# Hardcoded Google Drive share link
-SHARE_LINK = "https://drive.google.com/drive/folders/1gVdnV1za1thrVnH4LALbDzPtcvKr0z4u?usp=drive_link"
-# Function to extract folder ID from share link
-def extract_folder_id(share_link):
-    match = re.search(r"(?<=folders/)[^/?]+", share_link)
-    if match:
-        return match.group(0)
-    else:
-        st.error("Invalid Google Drive folder share link.")
-        return None
-# Function to authenticate Google Drive
-def authenticate_drive():
-    creds = None
-    if os.path.exists("token.pickle"):
-        with open("token.pickle", "rb") as token:
-            creds = pickle.load(token)
-    if not creds or not creds.valid:
-        if creds and creds.expired and creds.refresh_token:
-            creds.refresh(Request())
-        else:
-            flow = InstalledAppFlow.from_client_secrets_file("client_secrets.json", SCOPES)
-            creds = flow.run_local_server(port=0)
-        with open("token.pickle", "wb") as token:
-            pickle.dump(creds, token)
-    return build("drive", "v3", credentials=creds)
-# Function to load documents from Google Drive
-def load_documents(service, folder_id):
-    documents = []
-    results = service.files().list(
-        q=f"'{folder_id}' in parents and trashed=false",
-        fields="files(id, name, mimeType)"
-    ).execute()
-    files = results.get("files", [])
-    for file in files:
-        if file["mimeType"] == "application/pdf":
-            request = service.files().get_media(fileId=file["id"])
-            file_content = request.execute().decode("utf-8")  # Assuming plain text PDF for simplicity
-            documents.append({"id": file["id"], "name": file["name"], "text": file_content})
-    return documents
-# Function to build FAISS index
-def build_faiss_index(documents, embedder):
-    document_texts = [doc["text"] for doc in documents]
-    embeddings = embedder.encode(document_texts, convert_to_tensor=False)
-    dimension = embeddings[0].shape[0]
-    index = faiss.IndexFlatL2(dimension)
-    index.add(embeddings)
-    return index, document_texts
 # Function to query Groq API
-def query_groq(query, context):
-    chat_completion = client.chat.completions.create(
-        messages=[
-            {
-                "role": "user",
-                "content": f"Answer the following question based on these documents: {context} \n\nQuestion: {query}",
-            }
         ],
-        model="llama-3.3-70b-versatile",
-        stream=False,
-    )
-    return chat_completion.choices[0].message.content
-# Streamlit UI
-st.title("Document Querying with RAG and Groq")
-st.write("Processing documents from a predefined Google Drive folder and answering your queries.")
-# Google Drive authentication
-service = authenticate_drive()
-# Extract folder ID from share link
-folder_id = extract_folder_id(SHARE_LINK)
-documents = []
-if folder_id:
-    st.write("Fetching documents from Google Drive...")
     try:
-        documents = load_documents(service, folder_id)
-        st.success(f"Loaded {len(documents)} documents!")
-    except Exception as e:
-        st.error(f"Error fetching documents: {e}")
-# Build FAISS Index
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-index, document_texts = None, None
-if documents:
-    st.write("Building FAISS index...")
-    try:
-        index, document_texts = build_faiss_index(documents, embedder)
-        st.success("Index built successfully!")
-    except Exception as e:
-        st.error(f"Error building index: {e}")
-# Query the Documents
-query = st.text_input("Enter your question:")
-if query and index:
-    st.write("Searching for relevant documents...")
-    try:
-        # Retrieve top-k relevant documents
-        query_embedding = embedder.encode(query, convert_to_tensor=False)
-        top_k = 3
-        distances, indices = index.search([query_embedding], top_k)
-        relevant_docs = [document_texts[idx] for idx in indices[0]]
-        context = " ".join(relevant_docs)[:100000]  # Truncate context for API compatibility
-        # Display relevant document names
-        st.write("Top relevant documents:")
-        for idx in indices[0]:
-            st.write(f"- {documents[idx]['name']}")
-        # Query Groq API
-        st.write("Querying Groq AI for the answer...")
-        try:
-            answer = query_groq(query, context)
-            st.success(f"Answer: {answer}")
-        except Exception as e:
-            st.error(f"Error querying Groq API: {e}")
-    except Exception as e:
-        st.error(f"Error during query: {e}")

 import os
 import streamlit as st
+import requests
+from PyPDF2 import PdfReader
+from langchain_community.vectorstores import FAISS
+from langchain.embeddings.huggingface import HuggingFaceEmbeddings
+from langchain.text_splitter import RecursiveCharacterTextSplitter
 # Groq credentials
 # The key is read from the environment (for example a Space secret) instead
 # of being committed in source; a key that has been pushed to a repository
 # should be treated as leaked and rotated.
 GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
 # No Groq SDK client is built here: this revision does not import `groq`,
 # so `Groq(...)` raised NameError at start-up, and the app calls the REST
 # endpoint through `requests` (see query_groq_api) rather than the SDK.
+# Hardcoded Google Drive link
+# NOTE(review): this is a folder link (".../drive/folders/<id>"), not a link
+# to a single file. download_pdf() needs a ".../file/d/<id>/view" style link;
+# confirm the intended PDF and update this constant.
+GOOGLE_DRIVE_LINK = "https://drive.google.com/drive/folders/1gVdnV1za1thrVnH4LALbDzPtcvKr0z4u?usp=drive_link"
+# Function to download the PDF from Google Drive
+def download_pdf():
+    """Download the PDF behind GOOGLE_DRIVE_LINK and save it locally.
+
+    Returns:
+        The path of the saved file, "document.pdf".
+
+    Raises:
+        ValueError: if the link has no "/d/<file id>" segment (for example
+            a folder link), or if the downloaded payload is not a PDF.
+        requests.exceptions.RequestException: on network or HTTP errors.
+    """
+    # partition() instead of split("/d/")[1]: a link without "/d/" used to
+    # fail with a bare IndexError that said nothing about the cause.
+    _, marker, tail = GOOGLE_DRIVE_LINK.partition("/d/")
+    # The id ends at the next "/" ("/view", "/edit", ...) or at the query.
+    file_id = tail.split("/", 1)[0].split("?", 1)[0]
+    if not marker or not file_id:
+        raise ValueError(
+            "GOOGLE_DRIVE_LINK must point to a single file "
+            "(https://drive.google.com/file/d/<id>/view), not a folder."
+        )
+    url = f"https://drive.google.com/uc?id={file_id}&export=download"
+    # Without a timeout a stalled connection would hang the app indefinitely.
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    # Refuse to save a non-PDF body (e.g. an HTML error page) as document.pdf.
+    if b"%PDF" not in response.content[:1024]:
+        raise ValueError("Google Drive did not return a PDF; check the link and its sharing settings.")
+    with open("document.pdf", "wb") as f:
+        f.write(response.content)
+    return "document.pdf"
+# Function to extract text from PDF
+def extract_text_from_pdf(pdf_file):
+    """Return the text of every page of *pdf_file*, concatenated in order.
+
+    Args:
+        pdf_file: a path or file object accepted by PdfReader.
+
+    Returns:
+        One string holding the extracted text of all pages; empty when no
+        page yields any text.
+    """
+    reader = PdfReader(pdf_file)
+    # `or ""` keeps a page that yields no text (None or "") from breaking
+    # the concatenation; join builds the result in a single pass.
+    return "".join(page.extract_text() or "" for page in reader.pages)
+# Function to create FAISS vector database
+def create_vector_db(text):
+    """Split *text* into overlapping chunks and index them in a FAISS store.
+
+    Args:
+        text: the full document text.
+
+    Returns:
+        The vector store built by FAISS.from_texts over the chunks.
+
+    Raises:
+        ValueError: if *text* produces no chunks (empty or missing text).
+    """
+    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
+    chunks = text_splitter.split_text(text)
+    # Fail with a clear message here; an empty chunk list gives the index
+    # builder nothing to size the index from (presumably an opaque error).
+    if not chunks:
+        raise ValueError("No text could be extracted from the document; cannot build the vector database.")
+    # Use Hugging Face Embeddings
+    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
+    return FAISS.from_texts(chunks, embeddings)
 # Function to query Groq API
+def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
+    """Ask a Groq chat model to answer *query* using *context*.
+
+    Args:
+        query: the user's question.
+        context: text placed ahead of the question in the user message.
+        model: the model identifier sent in the request body.
+
+    Returns:
+        The answer text; "No response." when the reply carries no message
+        content; "Error: <details>" when the request or decoding fails.
+    """
+    url = "https://api.groq.com/openai/v1/chat/completions"
+    headers = {
+        "Content-Type": "application/json",
+        "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}",
+    }
+    data = {
+        "model": model,
+        "messages": [
+            {"role": "system", "content": "You are an intelligent assistant."},
+            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"}
         ],
+    }
     try:
+        # A timeout keeps a stalled connection from hanging the UI forever.
+        response = requests.post(url, headers=headers, json=data, timeout=60)
+        response.raise_for_status()  # Raise an error for bad responses
+        result = response.json()
+    except (requests.exceptions.RequestException, ValueError) as e:
+        # ValueError covers a non-JSON body on requests versions whose
+        # JSON decode error is not a RequestException subclass.
+        return f"Error: {e}"
+    # `or` fallbacks cover a missing key as well as an empty list / null
+    # value, which previously raised IndexError or AttributeError.
+    choices = result.get("choices") or [{}]
+    message = choices[0].get("message") or {}
+    return message.get("content") or "No response."
+# Streamlit App
+st.title("PDF Q&A with Groq API")
+# Persistent state to store vector database
+# (Streamlit re-runs the script on every interaction; session_state keeps
+# the index alive between those runs.)
+if "vector_db" not in st.session_state:
+    st.session_state.vector_db = None
+# Process the hardcoded PDF link
+if st.button("Process PDF"):
+    st.info("Downloading and processing the PDF...")
+    try:
+        pdf_file = download_pdf()
+        pdf_text = extract_text_from_pdf(pdf_file)
+        st.success("PDF processed successfully!")
+        # Create FAISS vector database
+        st.info("Creating vector database...")
+        st.session_state.vector_db = create_vector_db(pdf_text)
+        st.success("Vector database created!")
+    except Exception as e:
+        # Top-level UI boundary: show the failure instead of a raw traceback.
+        st.error(f"Could not process the PDF: {e}")
+# Query the document
+if st.session_state.vector_db is not None:
+    user_query = st.text_input("Ask a question about the document:")
+    if st.button("Submit Query"):
+        if not user_query.strip():
+            # Do not spend an API call on an empty question.
+            st.warning("Please enter a question first.")
+        else:
+            with st.spinner("Processing your query..."):
+                # Retrieve similar text chunks
+                similar_docs = st.session_state.vector_db.similarity_search(user_query, k=3)
+                context = " ".join([doc.page_content for doc in similar_docs])
+                # Send query with context to Groq API
+                response = query_groq_api(user_query, context)
+                st.write("**Answer:**", response)