# RAG-style Streamlit app: downloads a PDF from Google Drive, chunks and
# embeds its text, indexes the chunks with FAISS, and answers questions
# via the Groq chat-completions API.
import os

import faiss
import numpy as np
import PyPDF2
import requests
import streamlit as st
from groq import Groq
# Initialize Groq client using the secret environment variable.
# NOTE(review): os.getenv returns None when GROQ_API_KEY is unset; the client
# would then fail on the first request — confirm the deployment sets the secret.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))
# Function to download and read PDF content
def extract_text_from_google_drive():
    """Download the source PDF from Google Drive and return its full text.

    The file is saved locally as ``document.pdf`` and then parsed with PyPDF2.

    Returns:
        str: the text of all pages, joined with single spaces.

    Raises:
        requests.HTTPError: if the download does not succeed (e.g. the link
            returns an error page instead of the PDF).
    """
    link = "https://drive.google.com/uc?id=1XvqA1OIssRs2gbmOtKFKj-02yQ5X2yg0"
    # Fail fast on HTTP errors; otherwise an HTML error page would be written
    # to document.pdf and PyPDF2 would fail later with a confusing message.
    response = requests.get(link, timeout=60)
    response.raise_for_status()
    with open("document.pdf", "wb") as file:
        file.write(response.content)
    with open("document.pdf", "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # extract_text() may return None for image-only pages; substitute "".
        text = " ".join(page.extract_text() or "" for page in reader.pages)
    return text
# Function to chunk text
def chunk_text(text, max_length=500):
    """Split *text* into chunks of at most ~max_length characters.

    Splitting happens on ". " sentence boundaries; each sentence keeps its
    trailing period. A sentence longer than max_length becomes its own chunk.

    Args:
        text: the document text to split.
        max_length: soft upper bound on chunk length in characters.

    Returns:
        list[str]: non-empty chunks; [] for empty input.
    """
    sentences = text.split(". ")
    chunks = []
    chunk = ""
    for sentence in sentences:
        # Skip empty fragments (e.g. from a trailing ". ") that would
        # otherwise inject stray ". " separators into the chunks.
        if not sentence:
            continue
        if len(chunk) + len(sentence) <= max_length:
            chunk += sentence + ". "
        else:
            # Guard against appending an empty chunk when the very first
            # sentence already exceeds max_length.
            if chunk:
                chunks.append(chunk.strip())
            chunk = sentence + ". "
    if chunk:
        chunks.append(chunk.strip())
    return chunks
# Function to compute simple embeddings
def compute_embeddings(chunks):
    """Embed each chunk as a fixed 300-dim vector of character code points.

    The first 300 characters are mapped through ord(); shorter texts are
    zero-padded so every vector has the same dimensionality.

    Args:
        chunks: iterable of strings to embed.

    Returns:
        list[list[int]]: one 300-element vector per input string.
    """
    dim = 300

    def _embed(text):
        codes = list(map(ord, text[:dim]))
        return codes + [0] * (dim - len(codes))

    return [_embed(text) for text in chunks]
# Function to create FAISS index
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over the given embedding vectors.

    Args:
        embeddings: non-empty list of equal-length numeric vectors.

    Returns:
        faiss.IndexFlatL2: index with all vectors added, ready to search.
    """
    dimension = len(embeddings[0])
    index = faiss.IndexFlatL2(dimension)
    # Bug fix: faiss has no FloatVectorArray — Index.add expects a contiguous
    # float32 numpy matrix of shape (n, dimension).
    index.add(np.array(embeddings, dtype="float32"))
    return index
# Function to query Groq API
def query_groq(question, model_name="llama-3.3-70b-versatile"):
    """Send a single-turn user prompt to the Groq chat API.

    Args:
        question: the prompt text to send as the user message.
        model_name: Groq model identifier to use.

    Returns:
        str: the model's reply text.
    """
    messages = [{"role": "user", "content": question}]
    response = client.chat.completions.create(
        messages=messages,
        model=model_name,
    )
    return response.choices[0].message.content
# Streamlit app
def main():
    """Streamlit entry point: build the FAISS index, then answer questions.

    Downloads the document, chunks and embeds it, and for each user question
    retrieves the single nearest chunk and asks Groq to answer from it.
    """
    st.title("RAG-based Application with Groq API")
    st.subheader("Query the document stored on Google Drive")
    st.write("Extracting text from the document...")
    text = extract_text_from_google_drive()
    st.write("Document text extracted successfully!")
    st.write("Chunking and embedding text...")
    chunks = chunk_text(text)
    embeddings = compute_embeddings(chunks)
    index = create_faiss_index(embeddings)
    st.write(f"Created FAISS index with {len(chunks)} chunks.")
    # Query input
    question = st.text_input("Ask a question based on the document:")
    if question:
        st.write("Searching for relevant chunks...")
        question_embedding = compute_embeddings([question])
        # Bug fix: faiss.FloatVectorArray does not exist — search expects a
        # float32 numpy matrix of shape (1, dim).
        _, indices = index.search(np.array(question_embedding, dtype="float32"), k=1)
        relevant_chunk = chunks[indices[0][0]]
        st.write("Generating answer using Groq API...")
        # Bug fix: previously only the retrieved chunk was sent to the model
        # and the user's question was discarded. Send both as a RAG prompt.
        prompt = (
            "Answer the question using only the context below.\n\n"
            f"Context:\n{relevant_chunk}\n\n"
            f"Question: {question}"
        )
        answer = query_groq(prompt)
        st.write("### Answer:")
        st.write(answer)


if __name__ == "__main__":
    main()