Spaces:
Sleeping
Sleeping
import io
import os

import faiss
import nltk
import PyPDF2
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer
# Ensure the NLTK 'punkt' sentence tokenizer is available before
# chunk_text() calls nltk.sent_tokenize(); download it once if missing.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Initialize the Groq API client used for model calls.
# NOTE(review): os.getenv returns None when GROQ_API_KEY is unset, and the
# client is then constructed with api_key=None — verify the env var is set
# in the deployment environment.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
def extract_text_from_pdf(pdf_url):
    """Download a PDF from a Google Drive share link and return its text.

    Args:
        pdf_url: A Google Drive shareable URL of the form
            ``https://drive.google.com/file/d/<ID>/view?usp=sharing``.

    Returns:
        The concatenated text of every page. May be an empty string for
        scanned/image-only PDFs, where extraction yields nothing.

    Raises:
        requests.HTTPError: if the download does not return a 2xx status.
    """
    # Rewrite the shareable link into a direct-download link
    # (https://drive.google.com/uc?id=<ID>).
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")

    response = requests.get(direct_url)
    # Fail loudly on a bad response instead of feeding an HTML error page
    # to the PDF parser (the original silently wrote whatever came back).
    response.raise_for_status()

    # Parse entirely in memory: avoids the shared on-disk "temp.pdf"
    # (which raced between concurrent Streamlit sessions and leaked if
    # parsing raised before the os.remove call).
    reader = PyPDF2.PdfReader(io.BytesIO(response.content))

    # extract_text() can return None for pages with no extractable text;
    # the original would crash with a TypeError on `text +=` in that case.
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, chunk_size=300):
    """Split *text* into sentence-aligned chunks of at most ~chunk_size words.

    Sentences are never split: a chunk is flushed as soon as adding the next
    sentence would push its word count past ``chunk_size``. A single sentence
    longer than ``chunk_size`` becomes a chunk by itself.

    Args:
        text: The full document text.
        chunk_size: Soft maximum number of whitespace-separated words per chunk.

    Returns:
        A list of non-empty chunk strings (empty list for empty input).
    """
    chunks = []
    current_chunk = []
    current_length = 0
    for sentence in nltk.sent_tokenize(text):
        sentence_length = len(sentence.split())
        # Flush before adding if this sentence would overflow the chunk.
        # Guarding on `current_chunk` fixes the original bug where a first
        # sentence longer than chunk_size appended an empty-string chunk
        # (" ".join([])), which would later be embedded and indexed.
        if current_chunk and current_length + sentence_length > chunk_size:
            chunks.append(" ".join(current_chunk))
            current_chunk = []
            current_length = 0
        current_chunk.append(sentence)
        current_length += sentence_length
    # Flush the trailing partial chunk, if any.
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
def create_faiss_index(chunks):
    """Embed the text chunks and store them in a flat L2 FAISS index.

    Args:
        chunks: List of text strings to embed.

    Returns:
        A ``(index, embeddings)`` pair: the populated ``faiss.IndexFlatL2``
        and the embedding matrix produced by the sentence-transformer.
    """
    encoder = SentenceTransformer("all-MiniLM-L6-v2")
    vectors = encoder.encode(chunks)
    # IndexFlatL2 performs exact (brute-force) L2 nearest-neighbor search.
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, vectors
def query_faiss(index, query, chunks, model):
    """Return the 3 chunks whose embeddings are closest to *query*.

    Args:
        index: A populated FAISS index over the chunk embeddings.
        query: The user's question as a plain string.
        chunks: The chunk texts, in the same order they were indexed.
        model: The sentence-transformer used to embed the query (must be
            the same model that produced the indexed embeddings).

    Returns:
        A list of the top-3 matching chunk strings, nearest first.
    """
    embedded_query = model.encode([query])
    _, neighbor_ids = index.search(embedded_query, k=3)
    # search() returns one row of ids per query vector; we sent exactly one.
    return [chunks[idx] for idx in neighbor_ids[0]]
| # Main Streamlit App | |
| def main(): | |
| st.title("RAG-based Application") | |
| st.write("Interact with your document using Groq-powered model.") | |
| # Pre-defined document link | |
| doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing" | |
| # Extract Document Content | |
| if "document_text" not in st.session_state: | |
| st.write("Extracting document content...") | |
| text = extract_text_from_pdf(doc_link) | |
| st.session_state['document_text'] = text | |
| st.success("Document content extracted!") | |
| # Process Document and Create FAISS Index | |
| if 'document_text' in st.session_state and "faiss_index" not in st.session_state: | |
| st.write("Processing document...") | |
| chunks = chunk_text(st.session_state['document_text']) | |
| index, embeddings = create_faiss_index(chunks) | |
| st.session_state['faiss_index'] = index | |
| st.session_state['chunks'] = chunks | |
| st.session_state['model'] = SentenceTransformer("all-MiniLM-L6-v2") | |
| st.success(f"Document processed into {len(chunks)} chunks!") | |
| # Query the Document | |
| if 'faiss_index' in st.session_state: | |
| st.header("Ask Questions") | |
| query = st.text_input("Enter your question here") | |
| if st.button("Query Document"): | |
| results = query_faiss(st.session_state['faiss_index'], | |