import os
from io import BytesIO
from urllib.parse import urlparse, parse_qs

import faiss
import numpy as np
import pdfplumber
import requests
import streamlit as st
from groq import Groq
from sentence_transformers import SentenceTransformer


# show_spinner=False keeps this cached call from emitting a UI element at
# import time, i.e. before main() gets to call st.set_page_config().
@st.cache_resource(show_spinner=False)
def _load_embed_model():
    """Instantiate the sentence-embedding model once and reuse it.

    Streamlit re-executes the whole script on every user interaction, so a
    plain module-level constructor call would rebuild the model on each rerun.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Shared embedding model used for both document chunks and user questions.
embed_model = _load_embed_model()


# The Groq API key is read from the GROQ_API_KEY environment variable so that
# no credential lives in source control.
# NOTE(review): any key that was ever committed to this file must be treated
# as compromised and revoked in the Groq console.
API_KEY = os.environ.get("GROQ_API_KEY")

# Client used by get_answer_from_groq() for chat completions.
client = Groq(api_key=API_KEY)


# Google Drive share links of the PDF documents that main() downloads,
# extracts text from, and indexes.
STORED_LINKS = [
    "https://drive.google.com/file/d/1zHtEpoEZv_3BhEDhQKkf1D1vya2jzyAd/view?usp=sharing",
    "https://drive.google.com/file/d/1xnRgDFGGV723Bgddf8KE9quwzpllgxyD/view?usp=sharing",
]


def extract_drive_file_id(url):
    """Return the Google Drive file ID embedded in *url*, or ``None``.

    Two link shapes are recognised:

    * query-string links carrying ``id=<file_id>`` (``/open?id=...``,
      ``/uc?id=...``);
    * path links of the form ``/file/d/<file_id>/...``.

    Args:
        url: The link to inspect.

    Returns:
        The file ID string, or ``None`` when *url* is empty, is not a
        ``drive.google.com`` link, or contains no recognisable file ID.
    """
    if not url:
        return None

    parsed_url = urlparse(url)
    if 'drive.google.com' not in parsed_url.netloc:
        return None

    query_id = parse_qs(parsed_url.query).get('id', [None])[0]
    if query_id:
        return query_id

    # Find the ID by its "d" marker rather than by a fixed position: short
    # paths then yield None instead of raising IndexError, and non-file links
    # such as /drive/folders/<id> are not mistaken for files.
    segments = [part for part in parsed_url.path.split('/') if part]
    for marker, candidate in zip(segments, segments[1:]):
        if marker == 'd':
            return candidate
    return None


def download_pdf_from_drive(file_id, timeout=30):
    """Download a Google Drive file and return its content as an in-memory stream.

    Args:
        file_id: Drive file ID to fetch through the ``/uc`` download endpoint.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BytesIO`` holding the raw response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(
        "https://drive.google.com/uc",
        # Let requests build the query string so the ID is URL-encoded.
        params={"id": file_id, "export": "download"},
        timeout=timeout,
    )
    response.raise_for_status()
    # NOTE(review): the body is not checked to be a PDF; confirm Drive never
    # serves an HTML interstitial page for these files.
    return BytesIO(response.content)


def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, joined by single spaces.

    Args:
        pdf_file: Anything ``pdfplumber.open`` accepts; the caller in this
            file passes an in-memory byte stream.

    Returns:
        The concatenated page texts. Pages for which ``extract_text()``
        returns nothing are skipped.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # Extract each page exactly once; the join consumes the generator
        # while the PDF is still open.
        page_texts = (page.extract_text() for page in pdf.pages)
        return ' '.join(text for text in page_texts if text)


def create_embeddings(text, chunk_size=500):
    """Split *text* into fixed-size chunks, embed them, and index them with FAISS.

    Args:
        text: The full corpus text.
        chunk_size: Maximum number of characters per chunk. Chunks are
            consecutive, non-overlapping slices of *text*.

    Returns:
        A ``(chunks, embeddings, index)`` tuple: the list of chunk strings,
        their embedding matrix, and a ``faiss.IndexFlatL2`` containing those
        embeddings in the same order as *chunks*.

    Raises:
        ValueError: If *text* is empty or *chunk_size* is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not text:
        # Fail early with a clear message rather than with a shape error
        # when sizing the index below.
        raise ValueError("cannot build an index from empty text")

    chunks = [text[start:start + chunk_size] for start in range(0, len(text), chunk_size)]
    # Same float32 cast that get_relevant_chunk() applies to the query vector.
    embeddings = np.asarray(embed_model.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return chunks, embeddings, index


def get_relevant_chunk(question, embeddings, index, chunks, top_k=1):
    """Return the indexed chunk(s) nearest to *question* in embedding space.

    Args:
        question: The user's question.
        embeddings: Unused; kept so existing callers keep working.
        index: FAISS index whose rows correspond to *chunks*.
        chunks: Chunk strings, in the order they were added to *index*.
        top_k: Number of nearest chunks to retrieve. The default of 1 returns
            just the single best match.

    Returns:
        The best-matching chunk; for ``top_k > 1`` the retrieved chunks joined
        by newlines in ranking order. An empty string when the index yields
        no usable hit.

    Raises:
        ValueError: If *top_k* is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    question_embedding = np.asarray(embed_model.encode([question]), dtype=np.float32)
    _distances, neighbor_ids = index.search(question_embedding, top_k)
    # FAISS reports missing neighbours as -1; skip those, because chunks[-1]
    # would silently hand back the last chunk instead.
    hits = [chunks[i] for i in neighbor_ids[0] if 0 <= i < len(chunks)]
    return "\n".join(hits)


def get_answer_from_groq(question, context, model="llama3-8b-8192"):
    """Ask a Groq chat model to answer *question* using *context*.

    Args:
        question: The user's question.
        context: Retrieved document text the answer should be based on.
        model: Groq model identifier to query.
            NOTE(review): confirm the default model ID is still served;
            hosted model IDs get retired over time.

    Returns:
        The message content of the first completion choice.
    """
    prompt = (
        "Answer the following question based on the context:\n"
        f"Context: {context}\nQuestion: {question}"
    )
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model=model,
    )
    return chat_completion.choices[0].message.content


@st.cache_data(show_spinner=False)
def _fetch_pdf_text(file_id):
    """Download one Drive PDF and return its text; cached so reruns skip the download."""
    return extract_text_from_pdf(download_pdf_from_drive(file_id))


@st.cache_resource(show_spinner=False)
def _build_index(text):
    """Chunk, embed and index *text*; cached so reruns with the same corpus reuse the index."""
    return create_embeddings(text)


def main():
    """Render the app: load the stored documents, index them, and answer questions.

    Steps:
        1. Download every link in ``STORED_LINKS`` and extract its text,
           reporting each failure in the UI without aborting the others.
        2. Build the FAISS index over the combined text.
        3. Read a question, retrieve the nearest chunk, and show the model's
           answer.

    Streamlit re-executes the script on every widget interaction, so the
    download and indexing steps go through cached helpers.
    """
    # Icons are written as escapes so the file stays pure ASCII and cannot be
    # corrupted by an encoding round-trip again.
    # NOTE(review): the page, inbox, refresh and search glyphs were
    # reconstructed from mis-decoded bytes; confirm the intended emoji.
    icon_page = "\U0001F4C4"
    icon_inbox = "\U0001F4E5"
    icon_warning = "\u26A0\uFE0F"
    icon_error = "\u274C"
    icon_ok = "\u2705"
    icon_refresh = "\U0001F504"
    icon_search = "\U0001F50D"

    st.set_page_config(page_title="Google Drive RAG App", page_icon=icon_page, layout="centered")
    st.markdown("<h1 style='text-align: center;'>Google Drive RAG Application</h1>", unsafe_allow_html=True)

    st.write("Processing predefined document links from Google Drive to generate embeddings stored in a FAISS index.")

    documents = []
    had_failure = False
    for link in STORED_LINKS:
        # Broad catch on purpose: this is the UI boundary, and one bad
        # document must not prevent the remaining ones from loading.
        try:
            file_id = extract_drive_file_id(link)
            if not file_id:
                had_failure = True
                st.warning(f"{icon_warning} Invalid link: {link}")
                continue
            st.write(f"{icon_inbox} Processing document: {link}")
            documents.append(_fetch_pdf_text(file_id))
        except Exception as e:
            had_failure = True
            st.error(f"{icon_error} Failed to process link: {link}. Error: {e}")

    # Join with a space so the last word of one document does not fuse with
    # the first word of the next.
    all_text = ' '.join(doc for doc in documents if doc)
    if not all_text:
        st.warning(f"{icon_warning} No text could be extracted from the configured documents.")
        return

    if had_failure:
        st.warning(f"{icon_warning} Some documents could not be processed; continuing with the rest.")
    else:
        st.success(f"{icon_ok} All documents processed successfully!")

    st.write(f"{icon_refresh} Creating embeddings...")
    chunks, embeddings, index = _build_index(all_text)
    st.success(f"{icon_ok} Embeddings created and stored in FAISS index!")

    question = st.text_input("Ask a question based on the uploaded documents:")
    if question:
        relevant_chunk = get_relevant_chunk(question, embeddings, index, chunks)
        st.write(f"{icon_search} Retrieving the answer...")
        answer = get_answer_from_groq(question, relevant_chunk)
        st.subheader("Answer:")
        st.write(answer)


# Start the app only when this file is executed as a script (e.g. through
# `streamlit run`), not when it is imported.
if __name__ == "__main__":
    main()