# Hugging Face Spaces page residue ("Spaces: Sleeping") — not part of the program.
import os
import io
import re
import requests
import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq

# ============ CONFIG ============ #
# The Groq API key must come from the environment; abort the app if it is missing.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("β GROQ_API_KEY environment variable not found.")
    st.stop()

client = Groq(api_key=GROQ_API_KEY)
# Sentence embedder used for both document chunks and user queries.
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Google Drive file links (shared by you)
GDRIVE_LINKS = [
    "https://drive.google.com/file/d/1aBFrAktgTIFwYxNDiY75Gj-4gwqoUJbm/view?usp=sharing",
    "https://drive.google.com/file/d/1boqYWdtFqYagnVk7oeh6hRZb5Um2W9zC/view?usp=sharing",
]
# ============ UTILS ============ #
def gdrive_to_direct(link):
    """Convert a shared Google Drive link into a direct-download URL.

    Supports both the ``.../file/d/<id>/...`` share format and the older
    ``...?id=<id>`` style (``open?id=``, ``uc?id=``).

    Args:
        link: A Google Drive share URL.

    Returns:
        The ``uc?export=download`` direct URL, or ``None`` when no file id
        can be extracted.
    """
    # Exclude '?' and '#' so a link without a trailing slash still yields
    # just the id, not the query string.
    match = re.search(r"drive\.google\.com/file/d/([^/?#]+)", link)
    if not match:
        # Fall back to "open?id=<id>" / "uc?id=<id>" style links.
        match = re.search(r"drive\.google\.com/[^?#]*\?(?:.*&)?id=([^&#]+)", link)
    if match:
        return f"https://drive.google.com/uc?export=download&id={match.group(1)}"
    return None
def fetch_pdf(url, timeout=30):
    """Download a PDF and return its raw bytes.

    Args:
        url: Direct-download URL.
        timeout: Seconds to wait for the server (default 30, matching the
            previously hard-coded value).

    Returns:
        The response body as ``bytes``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.RequestException: On network failure or timeout.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.content
def read_pdf_bytes(data):
    """Extract all text from an in-memory PDF.

    Pages whose extraction yields ``None`` or "" contribute nothing; the
    remaining page texts are concatenated with no separator, matching the
    original behavior.

    Args:
        data: Raw PDF file contents as ``bytes``.

    Returns:
        The extracted text as a single string ("" when nothing extracts).
    """
    reader = PdfReader(io.BytesIO(data))
    # join() avoids the quadratic cost of repeated string concatenation.
    return "".join(page.extract_text() or "" for page in reader.pages)
def chunk_text(text, max_length=500):
    """Split *text* into whitespace-delimited chunks of at most
    *max_length* words each.

    Returns an empty list for empty/whitespace-only input.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + max_length]))
        start += max_length
    return chunks
def create_faiss_index(chunks):
    """Embed the text chunks and build an exact (flat L2) FAISS index.

    Returns:
        A ``(index, chunks)`` pair so callers keep the chunk texts aligned
        with the index rows.
    """
    vectors = np.array(embedder.encode(chunks))
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index, chunks
def search_index(index, query, chunks, top_k=3):
    """Return the *top_k* chunks most similar to *query*.

    Args:
        index: FAISS index built over ``chunks``.
        query: Natural-language question.
        chunks: Chunk texts aligned with the index rows.
        top_k: Maximum number of chunks to return.

    Returns:
        The nearest chunks, best match first. May be shorter than
        ``top_k`` when the index holds fewer vectors.
    """
    query_embedding = embedder.encode([query])
    _, indices = index.search(np.array(query_embedding), top_k)
    # FAISS pads the result with -1 when top_k exceeds the index size;
    # chunks[-1] would silently return the last chunk, so drop those slots.
    return [chunks[i] for i in indices[0] if i >= 0]
# ============ STREAMLIT UI ============ #
st.set_page_config(page_title="π§ RAG Chat from Cloud PDFs", layout="wide")
st.title("π Chat with 2 Google Drive PDFs (Auto-loaded)")

# Download both Drive PDFs at startup, extract their text, and build the
# vector index before accepting any questions.
with st.spinner("π₯ Downloading and processing PDF documents..."):
    combined_text = ""
    for link in GDRIVE_LINKS:
        direct_url = gdrive_to_direct(link)
        # Guard clause: a link we cannot parse is fatal for the app.
        if not direct_url:
            st.error(f"β Invalid Google Drive link format: {link}")
            st.stop()
        try:
            pdf_bytes = fetch_pdf(direct_url)
            combined_text += read_pdf_bytes(pdf_bytes)
        except Exception as e:
            st.error(f"β Error fetching PDF from: {link}\n\n{e}")
            st.stop()
    chunks = chunk_text(combined_text)
    index, stored_chunks = create_faiss_index(chunks)

st.success("β PDFs loaded and indexed. Ask your questions below!")
# Input box for queries
query = st.text_input("Ask a question based on the documents:")

if query:
    with st.spinner("π Searching and generating response..."):
        # Retrieve the most relevant chunks and prepend them as context
        # for the LLM, followed by the user's question.
        retrieved = search_index(index, query, stored_chunks)
        prompt = "\n".join(retrieved) + f"\n\nQuestion: {query}"
        completion = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
        )
        answer = completion.choices[0].message.content.strip()
        st.markdown(f"**Answer:** {answer}")