# Hugging Face Spaces app — Streamlit RAG chatbot (page-status banner removed)
# app.py
import json
import os

import faiss
import numpy as np
import PyPDF2
import requests
import streamlit as st
from groq import Groq

# Constants
PDF_URL = "https://drive.google.com/uc?export=download&id=1YWX-RYxgtcKO1QETnz1N3rboZUhRZwcH"
VECTOR_DIM = 768   # dimensionality of the (placeholder) embedding vectors
CHUNK_SIZE = 512   # words per text chunk
# Function to download and extract text from the PDF
def extract_text_from_pdf(url):
    """Download a PDF from *url* and return its concatenated page text.

    The file is saved locally as ``document.pdf`` and parsed with PyPDF2.

    Raises:
        requests.HTTPError: if the download returns a non-2xx status.
    """
    response = requests.get(url, timeout=60)
    # Fail fast on a bad download instead of feeding an HTML error page to PyPDF2.
    response.raise_for_status()
    with open("document.pdf", "wb") as f:
        f.write(response.content)
    with open("document.pdf", "rb") as f:
        reader = PyPDF2.PdfReader(f)
        # extract_text() can return None for pages without a text layer;
        # coalesce to "" so the join never fails.
        text = "\n".join(page.extract_text() or "" for page in reader.pages)
    return text
# Function to split text into chunks
def create_chunks(text, chunk_size):
    """Split *text* into chunks of at most *chunk_size* whitespace-separated words."""
    words = text.split()
    chunks = []
    for start in range(0, len(words), chunk_size):
        chunks.append(" ".join(words[start:start + chunk_size]))
    return chunks
# Function to create FAISS vector store
def create_faiss_index(chunks, vector_dim, embeddings=None):
    """Build a FAISS L2 index holding one embedding per chunk.

    Args:
        chunks: list of text chunks; only ``len(chunks)`` is used when
            *embeddings* is None.
        vector_dim: dimensionality of the embedding vectors.
        embeddings: optional pre-computed ``(len(chunks), vector_dim)``
            array of real embeddings. When omitted, random placeholder
            vectors are generated (original behavior) — retrieval results
            are meaningless until real embeddings are supplied.

    Returns:
        (index, embeddings) — the populated FAISS index and the float32
        embedding matrix that was added to it.
    """
    # Check if GPU is available and use it
    if faiss.get_num_gpus() > 0:
        st.write("Using GPU for FAISS indexing.")
        resource = faiss.StandardGpuResources()  # Initialize GPU resources
        index_flat = faiss.IndexFlatL2(vector_dim)
        index = faiss.index_cpu_to_gpu(resource, 0, index_flat)
    else:
        st.write("Using CPU for FAISS indexing.")
        index = faiss.IndexFlatL2(vector_dim)
    if embeddings is None:
        # Placeholder vectors; swap in a real embedding model for true RAG.
        embeddings = np.random.rand(len(chunks), vector_dim).astype("float32")
    else:
        # FAISS requires contiguous float32 input.
        embeddings = np.ascontiguousarray(embeddings, dtype="float32")
    index.add(embeddings)
    return index, embeddings
# Initialize Groq API client
def get_groq_client():
    """Return a Groq client authenticated via the GROQ_API_KEY env var."""
    api_key = os.environ.get("GROQ_API_KEY")
    return Groq(api_key=api_key)
# Query Groq model
def query_model(client, question, context=None):
    """Send *question* to the Groq chat model and return the reply text.

    Args:
        client: an initialized Groq API client.
        question: the user's question.
        context: optional retrieved document text to ground the answer.
            When given, it is prepended to the prompt; when omitted the
            question is sent alone (original behavior).

    Returns:
        The model's reply as a string.
    """
    if context:
        content = (
            "Use the following context to answer the question.\n\n"
            f"Context:\n{context}\n\nQuestion: {question}"
        )
    else:
        content = question
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": content}],
        model="llama-3.3-70b-versatile",
    )
    return chat_completion.choices[0].message.content
# Streamlit app
def main():
    """Streamlit UI: extract PDF text, chunk it, index it, then answer questions."""
    st.title("RAG-Based ChatBot")

    # Step 1: Extract text from the document
    st.header("Step 1: Extract Text")
    if st.button("Extract Text from PDF"):
        st.session_state["text"] = extract_text_from_pdf(PDF_URL)
        st.success("Text extracted successfully!")

    # Step 2: Chunk the text
    st.header("Step 2: Create Chunks")
    if "text" in st.session_state and st.button("Create Chunks"):
        st.session_state["chunks"] = create_chunks(st.session_state["text"], CHUNK_SIZE)
        st.success(f"Created {len(st.session_state['chunks'])} chunks.")

    # Step 3: Create FAISS index
    st.header("Step 3: Create Vector Database")
    if "chunks" in st.session_state and st.button("Create Vector Database"):
        index, embeddings = create_faiss_index(st.session_state["chunks"], VECTOR_DIM)
        st.session_state["index"] = index
        st.success("FAISS vector database created.")

    # Step 4: Ask a question
    st.header("Step 4: Query the Model")
    question = st.text_input("Ask a question about the document:")
    if question and "index" in st.session_state:
        client = get_groq_client()
        st.write("Answer:", query_model(client, question))


if __name__ == "__main__":
    main()