Alamgirapi committed on
Commit
e2d9338
·
verified Β·
1 Parent(s): fdcfb8f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +254 -0
app.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import os
3
+ import faiss
4
+ import numpy as np
5
+ from sentence_transformers import SentenceTransformer
6
+ from langchain.text_splitter import CharacterTextSplitter
7
+ from PyPDF2 import PdfReader
8
+ import docx
9
+ from groq import Groq
10
+ import tempfile
11
+ import io
12
+
13
# Page configuration
# NOTE: st.set_page_config must be the first Streamlit call in the script.
st.set_page_config(
    page_title="Document Q&A Assistant",
    page_icon="📚",
    layout="wide"
)

# Initialize session state
# Streamlit reruns the whole script on every interaction, so anything that
# must survive across reruns (embedder, FAISS index, chunks, answer cache)
# lives in st.session_state.
if 'knowledge_base_ready' not in st.session_state:
    st.session_state.knowledge_base_ready = False
if 'cache' not in st.session_state:
    # Maps query string -> {"embedding": ..., "answer": ...}
    st.session_state.cache = {}
if 'embedder' not in st.session_state:
    st.session_state.embedder = None
if 'index' not in st.session_state:
    st.session_state.index = None
if 'chunks' not in st.session_state:
    st.session_state.chunks = []

# App title and description
st.title("📚 Document Q&A Assistant")
st.markdown("Upload your PDF documents and ask questions about their content!")

# Sidebar for configuration
st.sidebar.header("⚙️ Configuration")
groq_api_key = st.sidebar.text_input(
    "Enter your Groq API Key:",
    type="password",
    help="Get your API key from https://console.groq.com/"
)

# Document upload section
st.header("📄 Upload Documents")
uploaded_files = st.file_uploader(
    "Choose PDF files",
    type=['pdf'],
    accept_multiple_files=True,
    help="Upload one or more PDF documents to create your knowledge base"
)
53
# Document loaders
@st.cache_data
def load_pdf_from_bytes(file_bytes):
    """Extract all text from a PDF given its raw bytes.

    Args:
        file_bytes: The PDF file content as bytes.

    Returns:
        The concatenated page texts, one newline after each non-empty
        page, or "" (after showing a Streamlit error) if parsing fails.
    """
    try:
        reader = PdfReader(io.BytesIO(file_bytes))
        pages = []
        for page in reader.pages:
            # Fix: call extract_text() once per page — the original called
            # it twice (once for the check, once for the append), doubling
            # the most expensive step. Also use join instead of quadratic +=.
            page_text = page.extract_text()
            if page_text:
                pages.append(page_text + "\n")
        return "".join(pages)
    except Exception as e:
        st.error(f"Error reading PDF: {str(e)}")
        return ""
67
+
68
@st.cache_resource
def load_embedder():
    """Load and cache the sentence-transformer embedding model.

    Cached with st.cache_resource so the model is loaded once per
    process rather than on every script rerun.
    """
    model_name = "all-MiniLM-L6-v2"
    return SentenceTransformer(model_name)
72
+
73
def process_documents(files, embedder):
    """Build a searchable knowledge base from uploaded PDF files.

    Extracts text from every file, splits it into overlapping chunks,
    embeds the chunks, and indexes the embeddings in FAISS, reporting
    progress in the Streamlit UI along the way.

    Args:
        files: Uploaded file objects (Streamlit UploadedFile).
        embedder: SentenceTransformer-style model with .encode().

    Returns:
        (index, chunks) on success, or (None, None) on any failure.
    """
    # One progress step per file, plus one for splitting, one for embedding.
    total_steps = len(files) + 2
    progress = st.progress(0)
    status = st.empty()

    # Step 1: extract text from each uploaded PDF.
    texts = []
    for position, uploaded in enumerate(files, start=1):
        status.text(f"Processing {uploaded.name}...")
        extracted = load_pdf_from_bytes(uploaded.read())
        if extracted.strip():
            texts.append(extracted)
        progress.progress(position / total_steps)

    if not texts:
        st.error("No valid documents found!")
        return None, None

    # Step 2: split documents into overlapping chunks.
    status.text("Splitting documents into chunks...")
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    chunks = [piece for doc in texts for piece in splitter.split_text(doc)]
    progress.progress((len(files) + 1) / total_steps)

    if not chunks:
        st.error("No chunks created from documents!")
        return None, None

    # Step 3: embed the chunks and build an L2 FAISS index over them.
    status.text("Creating embeddings and search index...")
    try:
        embeddings = embedder.encode(chunks, show_progress_bar=False)
        index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(np.array(embeddings))
        progress.progress(1.0)
        status.text("✅ Knowledge base created successfully!")
        return index, chunks
    except Exception as e:
        st.error(f"Error creating embeddings: {str(e)}")
        return None, None
119
+
120
def retriever(query, embedder, index, chunks, k=3):
    """Return the k chunks most similar to `query` from the FAISS index.

    Args:
        query: Natural-language question.
        embedder: SentenceTransformer-style model with .encode().
        index: FAISS index built over the chunk embeddings.
        chunks: Chunk texts, positionally aligned with the index vectors.
        k: Maximum number of chunks to return.

    Returns:
        A list of up to k chunk strings (may be shorter than k).
    """
    if not chunks:
        return []
    # Fix: never ask FAISS for more neighbours than there are indexed
    # vectors — when k > ntotal it pads the result with -1, and chunks[-1]
    # would silently return the *last* chunk instead of failing.
    k = min(k, len(chunks))
    q_emb = embedder.encode([query])
    distances, indices = index.search(np.array(q_emb), k)
    # Defensively drop any invalid ids before mapping back to text.
    return [chunks[i] for i in indices[0] if 0 <= i < len(chunks)]
125
+
126
def generator(query, docs, groq_client):
    """Generate an answer to `query` grounded in the retrieved `docs`.

    Args:
        query: The user's question.
        docs: Retrieved text chunks used as grounding context.
        groq_client: An initialized Groq client.

    Returns:
        The model's reply text, or an error-description string if the
        API call fails.
    """
    # Collapse the retrieved chunks into a single context passage.
    context = " ".join(docs)
    prompt = f"""
    You are an AI assistant. Use the following context to answer the question.

    Context:
    {context}

    Question: {query}
    Answer clearly and concisely:
    """
    messages = [{"role": "user", "content": prompt}]
    try:
        completion = groq_client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=messages,
            temperature=0.2,
            max_tokens=512
        )
        return completion.choices[0].message.content
    except Exception as e:
        return f"Error generating answer: {str(e)}"
148
+
149
def cache_rag(query, embedder, index, chunks, groq_client, cache, threshold=0.85):
    """Answer `query` via RAG, reusing cached answers for similar queries.

    Compares the query embedding against every cached query embedding by
    cosine similarity; a match above `threshold` short-circuits to the
    cached answer. Otherwise retrieves chunks, generates a fresh answer,
    and stores it in `cache`.

    Args:
        query: The user's question.
        embedder: SentenceTransformer-style model with .encode().
        index: FAISS index over the chunk embeddings.
        chunks: Chunk texts aligned with the index.
        groq_client: Initialized Groq client for generation.
        cache: Mutable dict mapping query -> {"embedding", "answer"}.
        threshold: Cosine-similarity cutoff for a cache hit.

    Returns:
        Tuple (answer, cache_hit_flag).
    """
    q_emb = embedder.encode(query)
    q_norm = np.linalg.norm(q_emb)

    # Check cache for a semantically similar previous query.
    for cached_q, entry in cache.items():
        c_emb = entry["embedding"]
        denom = q_norm * np.linalg.norm(c_emb)
        # Fix: guard against zero-norm embeddings — the original divided
        # unconditionally, risking ZeroDivisionError/NaN similarities.
        if denom == 0:
            continue
        sim = np.dot(q_emb, c_emb) / denom
        if sim > threshold:
            return entry["answer"], True  # Cache hit

    # Cache miss - retrieve and generate
    docs = retriever(query, embedder, index, chunks)
    ans = generator(query, docs, groq_client)
    cache[query] = {"embedding": q_emb, "answer": ans}
    return ans, False  # Cache miss
165
+
166
# Process documents when uploaded
# Building the knowledge base requires both uploaded files and an API key.
if uploaded_files and groq_api_key:
    if st.button("🔄 Process Documents", type="primary"):
        with st.spinner("Processing documents..."):
            # Load embedder lazily — only the first time it is needed in
            # this session; subsequent reruns reuse the session_state copy.
            if st.session_state.embedder is None:
                st.session_state.embedder = load_embedder()

            # Process documents into a FAISS index plus aligned chunk texts.
            index, chunks = process_documents(uploaded_files, st.session_state.embedder)

            if index is not None and chunks:
                # Persist results so later reruns can answer questions.
                st.session_state.index = index
                st.session_state.chunks = chunks
                st.session_state.knowledge_base_ready = True
                st.success(f"✅ Successfully processed {len(uploaded_files)} documents with {len(chunks)} chunks!")
            else:
                st.session_state.knowledge_base_ready = False

elif uploaded_files and not groq_api_key:
    st.warning("⚠️ Please enter your Groq API key to process documents.")
elif not uploaded_files:
    st.info("📤 Please upload PDF documents to get started.")

# Q&A Section — shown only once the knowledge base exists and a key is set.
if st.session_state.knowledge_base_ready and groq_api_key:
    st.header("❓ Ask Questions")

    # Initialize Groq client; abort the rerun entirely if it fails.
    try:
        groq_client = Groq(api_key=groq_api_key)
    except Exception as e:
        st.error(f"Error initializing Groq client: {str(e)}")
        st.stop()

    # Question input
    query = st.text_input(
        "Enter your question:",
        placeholder="What is the main topic of the document?",
        help="Ask any question about the content of your uploaded documents"
    )

    if query and st.button("🔍 Get Answer", type="primary"):
        with st.spinner("Searching for answer..."):
            try:
                # cache_rag returns (answer, True) on a semantic cache hit,
                # (answer, False) when a fresh answer was generated.
                answer, is_cached = cache_rag(
                    query,
                    st.session_state.embedder,
                    st.session_state.index,
                    st.session_state.chunks,
                    groq_client,
                    st.session_state.cache
                )

                # Display answer
                st.subheader("💡 Answer")
                st.write(answer)

                # Show cache status
                if is_cached:
                    st.success("✅ Answer retrieved from cache")
                else:
                    st.info("🔍 New answer generated")

            except Exception as e:
                st.error(f"Error generating answer: {str(e)}")

# Display cache statistics in the sidebar (only when the cache is non-empty).
if st.session_state.cache:
    st.sidebar.subheader("📊 Cache Statistics")
    st.sidebar.write(f"Cached queries: {len(st.session_state.cache)}")

    if st.sidebar.button("🗑️ Clear Cache"):
        st.session_state.cache = {}
        st.sidebar.success("Cache cleared!")

# Footer
st.markdown("---")
st.markdown(
    """
    **Instructions:**
    1. Enter your Groq API key in the sidebar
    2. Upload one or more PDF documents
    3. Click 'Process Documents' to build the knowledge base
    4. Ask questions about your documents

    The app uses caching to speed up similar queries!
    """
)