Spaces:
Sleeping
Sleeping
File size: 3,356 Bytes
50e1cd1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 |
import os
import io
import re
import requests
import faiss
import numpy as np
import streamlit as st
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
from groq import Groq
# ============ CONFIG ============ #
# The Groq API key is read from the environment; without it no completion
# request can be made, so the app reports the problem and halts this run.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    st.error("❌ GROQ_API_KEY environment variable not found.")
    st.stop()

client = Groq(api_key=GROQ_API_KEY)


@st.cache_resource(show_spinner=False)
def _load_embedder():
    """Return the sentence-embedding model, constructing it only once.

    Streamlit re-executes the whole script on every widget interaction;
    caching the model as a resource avoids re-instantiating it on each rerun.
    The spinner is disabled so that no UI element is emitted before
    ``st.set_page_config`` runs later in the script.
    """
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


embedder = _load_embedder()

# Google Drive share links of the PDFs to download and index.
GDRIVE_LINKS = [
    "https://drive.google.com/file/d/1aBFrAktgTIFwYxNDiY75Gj-4gwqoUJbm/view?usp=sharing",
    "https://drive.google.com/file/d/1boqYWdtFqYagnVk7oeh6hRZb5Um2W9zC/view?usp=sharing",
]
# ============ UTILS ============ #
def gdrive_to_direct(link):
    """Convert a Google Drive ``/file/d/<id>`` share link to a download URL.

    Args:
        link: A share link such as
            ``https://drive.google.com/file/d/<id>/view?usp=sharing``.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<id>``, or ``None``
        when *link* does not contain a ``drive.google.com/file/d/<id>`` part.
    """
    # The id ends at the next "/", "?" or "#". Stopping only at "/" would
    # swallow the query string of links that lack a trailing "/view" segment
    # (".../file/d/<id>?usp=sharing") and yield an unusable download URL.
    match = re.search(r"drive\.google\.com/file/d/([^/?#]+)", link)
    if match is None:
        return None
    file_id = match.group(1)
    return f"https://drive.google.com/uc?export=download&id={file_id}"
def fetch_pdf(url):
    """Download *url* and return the response body, verified to be a PDF.

    Args:
        url: Direct-download URL of the PDF.

    Returns:
        The raw bytes of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection failures or when the
            30-second timeout is exceeded.
        ValueError: If the body does not carry a PDF header.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.content
    # A successful status does not guarantee a PDF: a server may answer 200
    # with an HTML page instead (presumably what Google Drive does for files
    # that need sign-in or a download confirmation - verify). Reject such a
    # body here with a clear message instead of failing later in the parser.
    # The "%PDF-" marker is searched in the first 1024 bytes because readers
    # tolerate leading bytes before the header.
    if b"%PDF-" not in data[:1024]:
        content_type = response.headers.get("Content-Type", "unknown")
        raise ValueError(
            f"Response is not a PDF document (Content-Type: {content_type})"
        )
    return data
def read_pdf_bytes(data):
    """Extract the text of every page of an in-memory PDF.

    Args:
        data: Raw bytes of a PDF document.

    Returns:
        The extracted text of all pages joined by newlines. Pages for which
        ``extract_text()`` returns nothing are skipped; the result is an
        empty string when no page yields text.
    """
    reader = PdfReader(io.BytesIO(data))
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page cannot fuse with the
    # first word of the next, which would corrupt whitespace-based chunking.
    return "\n".join(text for text in page_texts if text)
def chunk_text(text, max_length=500, *, overlap=0):
    """Split *text* into chunks of at most *max_length* whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed to single
            spaces in the output.
        max_length: Maximum number of words per chunk; must be >= 1.
        overlap: Number of trailing words of each chunk that are repeated at
            the start of the next one. Must satisfy
            ``0 <= overlap < max_length``. The default of 0 produces
            disjoint, back-to-back chunks.

    Returns:
        A list of chunk strings in document order; empty when *text*
        contains no words.

    Raises:
        ValueError: If *max_length* or *overlap* is out of range.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")

    words = text.split()
    step = max_length - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_length]))
        # Stop once a chunk reaches the end of the text; with a non-zero
        # overlap the next window would only repeat words already emitted.
        if start + max_length >= len(words):
            break
    return chunks
def create_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed with the module-level
            ``embedder``.

    Returns:
        A tuple ``(index, chunks)``: the populated ``faiss.IndexFlatL2`` and
        the same *chunks* list that was passed in. Vectors are added in list
        order, so index row ``i`` corresponds to ``chunks[i]``.

    Raises:
        ValueError: If *chunks* is empty.
    """
    # With no chunks there is no embedding dimension to size the index with;
    # fail with a clear message instead of an IndexError on ``shape[1]``.
    if len(chunks) == 0:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")

    # FAISS operates on contiguous float32 matrices; convert explicitly
    # rather than relying on the encoder's output dtype and layout.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks
def search_index(index, query, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        index: FAISS index whose rows correspond to *chunks* by position.
        query: The question text; embedded with the module-level ``embedder``.
        chunks: The text chunks that were added to *index*, in the same order.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunks in the order FAISS reports them. Fewer are
        returned when the index holds fewer vectors; the list is empty when
        *chunks* is empty or *top_k* is not positive.
    """
    k = min(top_k, len(chunks))
    if k <= 0:
        return []

    query_embedding = np.ascontiguousarray(
        embedder.encode([query]), dtype=np.float32
    )
    _distances, neighbours = index.search(query_embedding, k)
    # FAISS pads the label row with -1 when it finds fewer than k neighbours.
    # Used as a list index, -1 would silently return the LAST chunk as a
    # bogus hit, so only labels that address a real chunk are kept.
    return [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]
# ============ STREAMLIT UI ============ #
st.set_page_config(page_title="🧠 RAG Chat from Cloud PDFs", layout="wide")
st.title("📄 Chat with 2 Google Drive PDFs (Auto-loaded)")


class _PdfLoadError(Exception):
    """Carries a ready-to-display message about documents that failed to load."""


@st.cache_resource(show_spinner=False)
def _load_corpus(links):
    """Download the PDFs behind *links*, chunk their text and index the chunks.

    Streamlit re-executes this script on every widget interaction, so the
    result is cached as a resource: the download, text extraction and
    embedding run once per distinct *links* value instead of once per query.
    A raised exception is not cached, so a failed load is retried on the
    next run.

    Args:
        links: Tuple of Google Drive share links (a tuple so it is hashable
            for the cache key).

    Returns:
        The ``(index, chunks)`` pair produced by ``create_faiss_index``.

    Raises:
        _PdfLoadError: If a link is malformed, a document cannot be fetched
            or parsed, or no text could be extracted from any document.
    """
    texts = []
    for link in links:
        direct_url = gdrive_to_direct(link)
        if not direct_url:
            raise _PdfLoadError(f"❌ Invalid Google Drive link format: {link}")
        try:
            texts.append(read_pdf_bytes(fetch_pdf(direct_url)))
        except Exception as e:
            raise _PdfLoadError(f"❌ Error fetching PDF from: {link}\n\n{e}") from e

    # Separate documents with a newline so the last word of one file does not
    # fuse with the first word of the next before whitespace-based chunking.
    chunks = chunk_text("\n".join(texts))
    if not chunks:
        # E.g. image-only PDFs: nothing to embed, so indexing cannot proceed.
        raise _PdfLoadError("❌ No extractable text was found in the PDF documents.")
    return create_faiss_index(chunks)


with st.spinner("📥 Downloading and processing PDF documents..."):
    try:
        index, stored_chunks = _load_corpus(tuple(GDRIVE_LINKS))
    except _PdfLoadError as err:
        st.error(str(err))
        st.stop()

st.success("✅ PDFs loaded and indexed. Ask your questions below!")

# Input box for queries
query = st.text_input("Ask a question based on the documents:")
if query:
    with st.spinner("🔍 Searching and generating response..."):
        # Retrieved chunks form the context, followed by the user's question.
        context = search_index(index, query, stored_chunks)
        prompt = "\n".join(context) + f"\n\nQuestion: {query}"
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
        )
        # The message content may be None; fall back to an empty string so
        # ``strip()`` cannot raise.
        answer = (response.choices[0].message.content or "").strip()
    st.markdown(f"**Answer:** {answer}")
|