Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| import requests | |
| from PyPDF2 import PdfReader | |
| from langchain_community.vectorstores import FAISS | |
| from langchain.embeddings.huggingface import HuggingFaceEmbeddings | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from groq import Groq | |
# Hardcoded Google Drive share link to the source PDF.
# The file id is parsed out of the ".../d/<id>/view" segment by download_pdf().
GOOGLE_DRIVE_LINK = "https://drive.google.com/file/d/1wv5gbGP0SA15BzoNUxprXhYx0jHhPgHl/view?usp=sharing"
# Function to download the PDF from Google Drive
def download_pdf():
    """Download the hardcoded Google Drive PDF to ./document.pdf.

    Returns:
        str: Path of the downloaded file ("document.pdf").

    Raises:
        requests.HTTPError: If Drive answers with an error status.
    """
    # Extract the file id from the share URL (".../d/<id>/view...").
    file_id = GOOGLE_DRIVE_LINK.split("/d/")[1].split("/view")[0]
    url = f"https://drive.google.com/uc?id={file_id}&export=download"
    # Timeout prevents a hung connection from freezing the Streamlit app;
    # stream=True avoids loading the whole PDF into memory at once.
    response = requests.get(url, timeout=60, stream=True)
    # Fail loudly instead of silently writing an HTML error page to disk
    # (the original saved response.content unconditionally).
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)
    return "document.pdf"
# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_file: Path or binary file-like object accepted by PyPDF2.PdfReader.

    Returns:
        str: Concatenated text of all pages (empty string for image-only pages).
    """
    reader = PdfReader(pdf_file)
    # extract_text() returns None for pages with no extractable text
    # (e.g. scanned images) — coalesce to "" so concatenation never raises.
    # "".join avoids the quadratic cost of += on large documents.
    return "".join(page.extract_text() or "" for page in reader.pages)
# Function to create FAISS vector database
def create_vector_db(text):
    """Chunk *text* and index the chunks in an in-memory FAISS store.

    Args:
        text: Full document text to index.

    Returns:
        FAISS: Vector store over 500-character chunks (50-char overlap).
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = splitter.split_text(text)
    # Embed each chunk with a small sentence-transformer model.
    embedder = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    return FAISS.from_texts(chunks, embedder)
# Function to query Groq API
def query_groq_api(query, context, model="llama-3.3-70b-versatile"):
    """Ask the Groq chat-completions API *query*, grounded in *context*.

    Args:
        query: The user's question.
        context: Retrieved document text to ground the answer in.
        model: Groq model identifier.

    Returns:
        str: The model's answer, or an "Error: ..." string on failure.
    """
    # SECURITY: the original hard-coded a live API key here, committing the
    # credential to source control. The key must come from the environment
    # (or st.secrets) — the leaked key should be revoked and rotated.
    api_key = os.getenv("GROQ_API_KEY")
    if not api_key:
        return "Error: GROQ_API_KEY environment variable is not set."
    url = "https://api.groq.com/openai/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    data = {
        "model": model,
        "messages": [
            {"role": "system", "content": "You are an intelligent assistant."},
            {"role": "user", "content": f"Context: {context}\nQuestion: {query}"},
        ],
    }
    try:
        # Timeout keeps the Streamlit UI from hanging on a stalled request.
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses
        result = response.json()
        # Defensive extraction: missing keys degrade to "No response."
        return result.get("choices", [{}])[0].get("message", {}).get("content", "No response.")
    except requests.exceptions.RequestException as e:
        # Surface the failure to the UI as a readable string.
        return f"Error: {e}"
# --- Streamlit App ---
st.title("PDF Book Query and Response")  # fixed user-facing typo: "Querry"

# Persistent state so the vector database survives Streamlit reruns.
if "vector_db" not in st.session_state:
    st.session_state.vector_db = None

# Download and index the hardcoded PDF on demand.
if st.button("Process PDF"):
    st.info("Downloading and processing the PDF...")
    pdf_file = download_pdf()
    pdf_text = extract_text_from_pdf(pdf_file)
    st.success("PDF processed successfully!")
    # Create FAISS vector database
    st.info("Creating vector database...")
    st.session_state.vector_db = create_vector_db(pdf_text)
    st.success("Vector database created!")

# Query the document once an index exists.
if st.session_state.vector_db:
    user_query = st.text_input("Ask a question about the document:")
    if st.button("Submit Query"):
        # Guard: a blank query would still hit the embedder and the API.
        if not user_query.strip():
            st.warning("Please enter a question first.")
        else:
            with st.spinner("Processing your query..."):
                # Retrieve the 3 most similar chunks as grounding context.
                similar_docs = st.session_state.vector_db.similarity_search(user_query, k=3)
                context = " ".join(doc.page_content for doc in similar_docs)
                # Send query with context to Groq API
                response = query_groq_api(user_query, context)
            st.write("**Answer:**", response)