import os
import PyPDF2
import nltk
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
import faiss
import streamlit as st
from groq import Groq
# Download the punkt resource at runtime (in case it wasn't downloaded during
# build); newer NLTK releases ship the same tokenizer data as 'punkt_tab'
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
# Read the Groq API key from the environment instead of hardcoding it in source
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Initialize Groq client
client = Groq(api_key=GROQ_API_KEY)
# One-off sanity check; commented out so it doesn't spend an API call on
# every Streamlit rerun:
# response = client.chat.completions.create(
#     messages=[{"role": "user", "content": "Test query to verify Groq API"}],
#     model="llama3-8b-8192",
# )
# print(response.choices[0].message.content)
# Load Sentence Transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Initialize FAISS index
dimension = 384  # Dimension of the embeddings
index = faiss.IndexFlatL2(dimension)
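# IndexFlatL2 stores raw vectors and does exact (brute-force) L2 search, which
# is fine for the handful of chunks a single PDF yields; 384 matches the
# output width of all-MiniLM-L6-v2. A minimal sketch of the add/search
# contract, using throwaway vectors (illustration only, not run here):
#
#   import numpy as np
#   toy = np.zeros((2, dimension), dtype="float32")  # shape (n_vectors, 384)
#   index.add(toy)                                   # vectors receive ids 0, 1
#   distances, ids = index.search(toy[:1], 1)        # both results shaped (1, k)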
# Extract text from a PDF file
def extract_text_from_pdf(pdf_file):
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text
# Split text into chunks of five sentences each
def chunk_and_tokenize(text):
    sentences = sent_tokenize(text)
    chunks = [' '.join(sentences[i:i+5]) for i in range(0, len(sentences), 5)]
    return chunks
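# Example: a 12-sentence document becomes three chunks, sentences 0-4, 5-9,
# and the 2-sentence remainder 10-11. Five sentences per chunk is an arbitrary
# window; larger chunks carry more context per retrieval hit, smaller ones
# localize matches more precisely.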
# Encode chunks into dense vectors
def create_embeddings(chunks):
    embeddings = model.encode(chunks)
    return embeddings
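# For a list input, model.encode returns a float32 numpy array of shape
# (len(chunks), 384), which faiss.IndexFlatL2.add accepts directly.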
# Send a prompt to Groq and return the completion text
def query_groq(prompt):
    response = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
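# Usage sketch (illustrative prompt, not from the original app):
#   answer = query_groq("Summarize retrieval-augmented generation in one line.")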
# Streamlit frontend
st.title("RAG-based PDF Query App")

uploaded_file = st.file_uploader("Upload a PDF file", type="pdf")
if uploaded_file:
    text = extract_text_from_pdf(uploaded_file)
    st.write("Extracted Text:")
    st.write(text[:500])  # Preview the first 500 characters

    chunks = chunk_and_tokenize(text)
    st.write(f"Text divided into {len(chunks)} chunks.")

    embeddings = create_embeddings(chunks)
    # Streamlit reruns the whole script on each interaction; clear the index
    # first so the same chunks aren't added again on every rerun
    index.reset()
    index.add(embeddings)
    st.write("Embeddings created and stored in the FAISS index.")
    query = st.text_input("Enter your query:")
    if query:
        # Embed the query and retrieve the single most similar chunk
        query_embedding = model.encode([query])
        _, indices = index.search(query_embedding, 1)
        relevant_chunk = chunks[indices[0][0]]
        # Pass both the retrieved context and the question to Groq; sending
        # the chunk alone would never tell the model what was asked
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context: {relevant_chunk}\n\n"
            f"Question: {query}"
        )
        response = query_groq(prompt)
        st.write("Response from Groq:")
        st.write(response)
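        # Possible extension (not in the original app): retrieve the top-k
        # chunks and join them for richer context, e.g.
        #   _, ids = index.search(query_embedding, 3)
        #   context = "\n\n".join(chunks[i] for i in ids[0])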