Talha812 committed on
Commit
50e1cd1
·
verified ·
1 Parent(s): 148ab75

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +100 -0
app.py CHANGED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import re
4
+ import requests
5
+ import faiss
6
+ import numpy as np
7
+ import streamlit as st
8
+ from PyPDF2 import PdfReader
9
+ from sentence_transformers import SentenceTransformer
10
+ from groq import Groq
11
+
12
# ============ CONFIG ============ #
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
if not GROQ_API_KEY:
    # Without a key the Groq client cannot be used, so halt the script run
    # after telling the user what is missing.
    st.error("❌ GROQ_API_KEY environment variable not found.")
    st.stop()

EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"


@st.cache_resource(show_spinner=False)
def _load_embedder(model_name):
    """Return the SentenceTransformer for *model_name*, built once per process.

    Streamlit re-executes this script on every widget interaction; caching
    the model avoids re-instantiating it on each rerun. The spinner is
    disabled so that no UI element is emitted by this call.
    """
    return SentenceTransformer(model_name)


client = Groq(api_key=GROQ_API_KEY)
embedder = _load_embedder(EMBEDDING_MODEL)

# Google Drive file links (shared by you)
GDRIVE_LINKS = [
    "https://drive.google.com/file/d/1aBFrAktgTIFwYxNDiY75Gj-4gwqoUJbm/view?usp=sharing",
    "https://drive.google.com/file/d/1boqYWdtFqYagnVk7oeh6hRZb5Um2W9zC/view?usp=sharing",
]
26
+
27
+ # ============ UTILS ============ #
28
# Patterns that capture the file id from the supported Google Drive URL shapes.
# The id character class is restricted so that a trailing query string
# (e.g. ".../file/d/<id>?usp=sharing") is never swallowed into the id.
_GDRIVE_ID_PATTERNS = (
    re.compile(r"drive\.google\.com/file/d/([A-Za-z0-9_-]+)"),
    re.compile(r"drive\.google\.com/(?:open|uc)\?(?:[^#]*&)?id=([A-Za-z0-9_-]+)"),
)


def gdrive_to_direct(link):
    """Convert a Google Drive sharing link into a direct-download URL.

    Args:
        link: A Drive URL of the form ``.../file/d/<id>/...``,
            ``.../open?id=<id>`` or ``.../uc?...id=<id>``.

    Returns:
        ``https://drive.google.com/uc?export=download&id=<id>``, or ``None``
        when *link* is empty or no file id can be found in it.
    """
    if not link:
        return None
    for pattern in _GDRIVE_ID_PATTERNS:
        match = pattern.search(link)
        if match:
            file_id = match.group(1)
            return f"https://drive.google.com/uc?export=download&id={file_id}"
    return None
34
+
35
def fetch_pdf(url):
    """Download *url* and return the response body as PDF bytes.

    Args:
        url: Direct-download URL of the PDF.

    Returns:
        The raw bytes of the response body.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (via ``raise_for_status``).
        ValueError: If the body does not look like a PDF. A successful
            response can still carry something other than the file (for
            example an HTML page), and failing here gives a clearer message
            than a parser error later on.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    content = response.content
    # The PDF header marker is expected near the start of the file; allow
    # some leading bytes before it instead of requiring offset 0.
    if b"%PDF-" not in content[:1024]:
        content_type = response.headers.get("Content-Type", "unknown")
        raise ValueError(
            f"Response from {url} is not a PDF (Content-Type: {content_type})."
        )
    return content
39
+
40
def read_pdf_bytes(data):
    """Extract the text of every page of an in-memory PDF.

    Args:
        data: Raw PDF file content as bytes.

    Returns:
        The extracted text of all pages joined by newlines. Pages for which
        ``extract_text()`` yields nothing are skipped, so the result is an
        empty string when no page has extractable text.
    """
    reader = PdfReader(io.BytesIO(data))
    page_texts = (page.extract_text() for page in reader.pages)
    # Join with a newline so the last word of one page does not fuse with the
    # first word of the next page.
    return "\n".join(text for text in page_texts if text)
48
+
49
def chunk_text(text, max_length=500, overlap=0):
    """Split *text* into chunks of at most *max_length* whitespace-separated words.

    Args:
        text: The text to split. ``None`` or an empty string yields ``[]``.
        max_length: Maximum number of words per chunk; must be >= 1.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < max_length``. The default of 0 produces
            disjoint chunks.

    Returns:
        A list of chunk strings, each with its words joined by single spaces.

    Raises:
        ValueError: If *max_length* or *overlap* is out of range.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive integer")
    if not 0 <= overlap < max_length:
        raise ValueError("overlap must satisfy 0 <= overlap < max_length")
    if not text:
        return []

    words = text.split()
    step = max_length - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(" ".join(words[start:start + max_length]))
        # Stop once a chunk reaches the end of the text; with overlap > 0 the
        # next window would only repeat words already emitted.
        if start + max_length >= len(words):
            break
    return chunks
52
+
53
def create_faiss_index(chunks):
    """Embed *chunks* and store the vectors in a flat L2 FAISS index.

    Args:
        chunks: Non-empty list of text chunks to embed.

    Returns:
        A tuple ``(index, chunks)``: the populated ``faiss.IndexFlatL2`` and
        the same chunk list that was passed in, so that positions in the
        index line up with positions in the list.

    Raises:
        ValueError: If *chunks* is empty; there would be no embedding
            dimension to size the index with.
    """
    if not chunks:
        raise ValueError("Cannot build a FAISS index from an empty list of chunks.")
    # FAISS expects C-contiguous float32 input; make that explicit rather
    # than relying on whatever dtype the encoder returns.
    embeddings = np.ascontiguousarray(embedder.encode(chunks), dtype=np.float32)
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, chunks
59
+
60
def search_index(index, query, chunks, top_k=3):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        index: FAISS index whose vector positions correspond to *chunks*.
        query: The query string to embed and search for.
        chunks: The chunk list the index was built from.
        top_k: Maximum number of chunks to return.

    Returns:
        Up to *top_k* chunks in the order FAISS returns them. Fewer are
        returned when fewer chunks exist; an empty list when there are no
        chunks or *top_k* is not positive.
    """
    # Never request more neighbours than there are chunks: FAISS pads missing
    # results with -1, and chunks[-1] would silently return the last chunk.
    k = min(top_k, len(chunks))
    if k <= 0:
        return []
    query_embedding = np.ascontiguousarray(embedder.encode([query]), dtype=np.float32)
    _, neighbours = index.search(query_embedding, k)
    # Drop any out-of-range label in case the index holds a different number
    # of vectors than the chunk list.
    return [chunks[i] for i in neighbours[0] if 0 <= i < len(chunks)]
64
+
65
# ============ STREAMLIT UI ============ #
st.set_page_config(page_title="🧠 RAG Chat from Cloud PDFs", layout="wide")
st.title("📄 Chat with 2 Google Drive PDFs (Auto-loaded)")


@st.cache_resource(show_spinner=False)
def _load_index(links):
    """Download, parse, chunk and index the PDFs behind *links*.

    The result is cached so that the PDFs are not downloaded and embedded
    again each time Streamlit reruns the script (every query triggers one).

    Args:
        links: Tuple of Google Drive sharing links (a tuple so that it can
            serve as a cache key).

    Returns:
        The ``(index, chunks)`` pair produced by ``create_faiss_index``.

    Raises:
        ValueError: If a link has an unsupported format or no text could be
            extracted from any document.
        RuntimeError: If downloading or parsing a PDF fails.
    """
    texts = []
    for link in links:
        direct_url = gdrive_to_direct(link)
        if not direct_url:
            raise ValueError(f"❌ Invalid Google Drive link format: {link}")
        try:
            texts.append(read_pdf_bytes(fetch_pdf(direct_url)))
        except Exception as e:
            raise RuntimeError(f"❌ Error fetching PDF from: {link}\n\n{e}") from e

    # Separate documents with a newline so words at a document boundary do
    # not merge into one token.
    chunks = chunk_text("\n".join(texts))
    if not chunks:
        raise ValueError("❌ No extractable text found in the PDF documents.")
    return create_faiss_index(chunks)


with st.spinner("📥 Downloading and processing PDF documents..."):
    try:
        index, stored_chunks = _load_index(tuple(GDRIVE_LINKS))
    except (ValueError, RuntimeError) as e:
        st.error(str(e))
        st.stop()

st.success("✅ PDFs loaded and indexed. Ask your questions below!")

# Input box for queries
query = st.text_input("Ask a question based on the documents:")
if query and query.strip():
    with st.spinner("🔍 Searching and generating response..."):
        context = search_index(index, query, stored_chunks)
        prompt = "\n".join(context) + f"\n\nQuestion: {query}"
        try:
            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
            )
        except Exception as e:
            # UI boundary: show the failure instead of a raw traceback.
            st.error(f"❌ Error generating response: {e}")
            st.stop()
        # Guard against a missing message body before stripping.
        answer = (response.choices[0].message.content or "").strip()
    st.markdown(f"**Answer:** {answer}")