import os

import streamlit as st
import requests
import PyPDF2
from groq import Groq
# Note: on newer LangChain releases these classes live under langchain_community
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize the Groq client from the GROQ_API_KEY environment variable
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Extract text from a PDF hosted on Google Drive
def extract_text_from_pdf(pdf_url):
    # Convert the Google Drive shareable link to a direct-download link
    direct_url = pdf_url.replace("/view?usp=sharing", "").replace("file/d/", "uc?id=")
    response = requests.get(direct_url)
    response.raise_for_status()

    # Save the download to a temporary file
    with open("temp.pdf", "wb") as f:
        f.write(response.content)

    # Read the PDF content page by page
    with open("temp.pdf", "rb") as f:
        reader = PyPDF2.PdfReader(f)
        text = ""
        for page in reader.pages:
            # extract_text() can return None for pages without extractable text
            text += page.extract_text() or ""

    os.remove("temp.pdf")
    return text
# Split text into chunks of roughly chunk_size words
def chunk_text(text, chunk_size=300):
    words = text.split()
    chunks = []
    current_chunk = []
    for word in words:
        # Accumulate words until the current chunk reaches chunk_size words
        if len(current_chunk) < chunk_size:
            current_chunk.append(word)
        else:
            chunks.append(" ".join(current_chunk))
            current_chunk = [word]
    if current_chunk:
        chunks.append(" ".join(current_chunk))
    return chunks
# Embed the chunks and store them in a FAISS vector store via LangChain
def create_faiss_index(chunks):
    # HuggingFaceEmbeddings loads the sentence-transformers model internally,
    # so a separate SentenceTransformer instance is not needed
    embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    doc_search = FAISS.from_texts(chunks, embeddings)
    return doc_search
# Query the FAISS index and return the most relevant document chunks
def query_faiss(doc_search, query):
    results = doc_search.similarity_search(query, k=3)
    return [result.page_content for result in results]
# Main Streamlit app
def main():
    st.title("RAG-based Application")
    st.write("Interact with your document using a Groq-powered model.")

    # Pre-defined document link
    doc_link = "https://drive.google.com/file/d/1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0/view?usp=sharing"

    # Extract the document content once per session
    if "document_text" not in st.session_state:
        st.write("Extracting document content...")
        text = extract_text_from_pdf(doc_link)
        st.session_state["document_text"] = text
        st.success("Document content extracted!")

    # Chunk the document and build the FAISS index once per session
    if "document_text" in st.session_state and "faiss_index" not in st.session_state:
        st.write("Processing document...")
        chunks = chunk_text(st.session_state["document_text"])
        doc_search = create_faiss_index(chunks)
        st.session_state["faiss_index"] = doc_search
        st.session_state["chunks"] = chunks
        st.success(f"Document processed into {len(chunks)} chunks!")
    # Query the document
    if "faiss_index" in st.session_state:
        st.header("Ask Questions")
        query = st.text_input("Enter your question here")
        if st.button("Query Document") and query:
            results = query_faiss(st.session_state["faiss_index"], query)
            if not results:
                st.warning("No relevant context found in the document.")
            else:
                st.write("### Results from Document:")
                for i, result in enumerate(results):
                    st.write(f"**Result {i+1}:** {result}")

                # Combine the retrieved chunks into a single context block
                context = "\n".join(results)
                st.write("### Insights based on Document Context:")
                prompt = (
                    f"The following context is from the document:\n\n"
                    f"{context}\n\n"
                    f"Based on this context, answer the question:\n"
                    f"{query}"
                )
                # Ask the Groq-hosted model to answer using the retrieved context
                chat_completion = client.chat.completions.create(
                    messages=[{"role": "user", "content": prompt}],
                    model="llama-3.3-70b-versatile",
                )
                st.write(chat_completion.choices[0].message.content)


if __name__ == "__main__":
    main()