ZunairaHawwar commited on
Commit
8faa875
Β·
verified Β·
1 Parent(s): 1f12f76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +349 -102
app.py CHANGED
@@ -1,127 +1,374 @@
1
- import nest_asyncio
2
  import streamlit as st
3
  import os
4
  import json
5
- from groq import Groq
6
- from sentence_transformers import SentenceTransformer
7
- import chromadb
8
- from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
9
- from chromadb.config import Settings
10
- from langchain.document_loaders import JSONLoader
 
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
 
 
 
 
 
 
12
 
13
- # Apply asyncio patch (Streamlit fix)
14
  nest_asyncio.apply()
15
 
16
  # --- CONFIGURATION ---
17
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
18
- GROQ_MODEL = "llama3-8b-8192"
19
-
20
- # Initialize Groq client
21
- groq_client = Groq(api_key=GROQ_API_KEY)
22
 
23
- # Initialize Chroma Embedding Function
24
- embedding_function = SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
 
 
25
 
26
- # Initialize ChromaDB Persistent Client
27
- chroma_client = chromadb.PersistentClient(path="./chroma_db", settings=Settings(anonymized_telemetry=False))
28
- collection = chroma_client.get_or_create_collection(
29
- name="icodeguru_knowledge",
30
- embedding_function=embedding_function
31
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
 
33
- # --- Clear Collection ---
34
- def clear_collection():
35
- all_items = collection.get()
36
- ids = all_items.get("ids", [])
37
- if ids:
38
- collection.delete(ids=ids)
39
- st.info(f"Cleared {len(ids)} existing documents from ChromaDB.")
40
- else:
41
- st.info("No documents to clear from ChromaDB.")
42
 
43
- # --- Ingest JSON Files from /docs/ ---
44
- def ingest_docs_to_chroma():
45
- folder_path = "./docs"
46
- all_docs = []
47
- for filename in os.listdir(folder_path):
48
- if filename.endswith(".json"):
49
- file_path = os.path.join(folder_path, filename)
50
- loader = JSONLoader(file_path=file_path, jq_schema='.[]', text_content=False)
51
- docs = loader.load()
52
- all_docs.extend(docs)
53
- st.write(f"Loaded {len(docs)} documents from {filename}")
54
 
55
- # Chunk Documents
56
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
57
- chunks = text_splitter.split_documents(all_docs)
58
- st.write(f"Total chunks created: {len(chunks)}")
59
 
60
- # Clear existing vectors to avoid duplication
61
- clear_collection()
62
 
63
- # Add Chunks to ChromaDB
64
- for chunk in chunks:
65
- if isinstance(chunk.page_content, list):
66
- content = " ".join(str(item) for item in chunk.page_content).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  else:
68
- content = str(chunk.page_content).strip()
 
 
 
 
69
 
70
- doc_id = str(hash(content))
71
- collection.add(documents=[content], ids=[doc_id])
72
-
73
- st.success("βœ… Knowledge Base Updated Successfully!")
74
-
75
- # --- Search embedded knowledge ---
76
- def search_vector_data(query):
77
- try:
78
- results = collection.query(query_texts=[query], n_results=3)
79
- if results and results["documents"]:
80
- return "\n\n".join(results["documents"][0])
81
- except Exception as e:
82
- st.error(f"Vector search error: {e}")
83
- return None
84
-
85
- # --- Ask Groq LLM ---
86
- def ask_groq(context, question):
87
- messages = [
88
- {"role": "system", "content": "You are a helpful assistant. Always provide relevant video and website links if possible."},
89
- {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {question}\nAnswer (include links):"}
90
- ]
91
- try:
92
- response = groq_client.chat.completions.create(
93
- model=GROQ_MODEL,
94
- messages=messages
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  )
96
- return response.choices[0].message.content.strip()
97
- except Exception as e:
98
- st.error(f"Groq API error: {e}")
99
- return "⚠️ Failed to get response from Groq API."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- # --- Streamlit UI ---
102
  def main():
103
- st.set_page_config(page_title="EduBot for iCodeGuru", layout="wide")
 
 
 
 
 
 
 
 
104
  st.title("πŸŽ“ EduBot for @icodeguru0")
105
- st.markdown("Ask anything based on pre-loaded iCodeGuru knowledge.")
106
-
107
- if st.button("πŸ”„ Refresh Knowledge Base"):
108
- ingest_docs_to_chroma()
109
-
110
- st.markdown("---")
111
-
112
- user_question = st.text_input("πŸ’¬ Ask your question:")
113
-
114
- if user_question:
115
- vector_context = search_vector_data(user_question)
116
- if vector_context:
117
- with st.spinner("🧐 Answering from knowledge base..."):
118
- answer = ask_groq(vector_context, user_question)
119
- st.success(answer)
120
- else:
121
- st.warning("⚠️ No relevant answer found in the embedded knowledge.")
122
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  st.markdown("---")
124
- st.caption("Powered by ChromaDB 🧠 and Groq ⚑")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  if __name__ == "__main__":
127
- main()
 
 
1
  import streamlit as st
2
  import os
3
  import json
4
+ from typing import List, Optional
5
+ import nest_asyncio
6
+
7
+ # LangChain imports
8
+ from langchain.vectorstores import Chroma
9
+ from langchain.embeddings import HuggingFaceEmbeddings
10
+ from langchain.document_loaders import JSONLoader, DirectoryLoader
11
  from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain.llms import Groq
13
+ from langchain.chains import RetrievalQA
14
+ from langchain.prompts import PromptTemplate
15
+ from langchain.schema import Document
16
+ from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
17
+ from langchain.memory import ConversationBufferMemory
18
+ from langchain.chains import ConversationalRetrievalChain
19
 
20
+ # Apply asyncio patch for Streamlit compatibility
21
  nest_asyncio.apply()
22
 
23
  # --- CONFIGURATION ---
24
  GROQ_API_KEY = os.environ.get("GROQ_API_KEY")
25
+ if not GROQ_API_KEY:
26
+ st.error("⚠️ GROQ_API_KEY environment variable is not set!")
27
+ st.stop()
 
28
 
29
+ GROQ_MODEL = "llama3-8b-8192"
30
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
31
+ CHROMA_PERSIST_DIR = "./chroma_db"
32
+ DOCS_DIR = "./docs"
33
 
34
+ class LangChainRAGSystem:
35
+ def __init__(self):
36
+ """Initialize the LangChain RAG system components."""
37
+ self.embeddings = None
38
+ self.vectorstore = None
39
+ self.llm = None
40
+ self.retrieval_chain = None
41
+ self.memory = ConversationBufferMemory(
42
+ memory_key="chat_history",
43
+ return_messages=True,
44
+ output_key="answer"
45
+ )
46
+ self.setup_components()
47
+
48
+ def setup_components(self):
49
+ """Setup all LangChain components."""
50
+ # Initialize embeddings
51
+ self.embeddings = HuggingFaceEmbeddings(
52
+ model_name=EMBEDDING_MODEL,
53
+ model_kwargs={'device': 'cpu'},
54
+ encode_kwargs={'normalize_embeddings': True}
55
+ )
56
+
57
+ # Initialize LLM
58
+ self.llm = Groq(
59
+ groq_api_key=GROQ_API_KEY,
60
+ model_name=GROQ_MODEL,
61
+ temperature=0.1,
62
+ max_tokens=1024
63
+ )
64
+
65
+ # Load or create vectorstore
66
+ self.load_vectorstore()
67
+
68
+ # Setup retrieval chain
69
+ self.setup_retrieval_chain()
70
+
71
+ def load_vectorstore(self):
72
+ """Load existing vectorstore or create empty one."""
73
+ try:
74
+ self.vectorstore = Chroma(
75
+ persist_directory=CHROMA_PERSIST_DIR,
76
+ embedding_function=self.embeddings,
77
+ collection_name="icodeguru_knowledge"
78
+ )
79
+ st.info("βœ… Loaded existing knowledge base.")
80
+ except Exception as e:
81
+ st.warning(f"Creating new knowledge base: {e}")
82
+ self.vectorstore = Chroma(
83
+ persist_directory=CHROMA_PERSIST_DIR,
84
+ embedding_function=self.embeddings,
85
+ collection_name="icodeguru_knowledge"
86
+ )
87
+
88
+ def setup_retrieval_chain(self):
89
+ """Setup the conversational retrieval chain."""
90
+ # Custom prompt template
91
+ prompt_template = """You are an expert assistant for iCodeGuru, a programming education platform.
92
+ Use the following context to answer the user's question comprehensively and accurately.
93
+ Always provide relevant video links, website links, or resources when available in the context.
94
+ If you don't know the answer based on the context, say so clearly.
95
 
96
+ Context: {context}
 
 
 
 
 
 
 
 
97
 
98
+ Chat History: {chat_history}
 
 
 
 
 
 
 
 
 
 
99
 
100
+ Human: {question}
 
 
 
101
 
102
+ Assistant: I'll help you with that based on the iCodeGuru knowledge base.
 
103
 
104
+ """
105
+
106
+ PROMPT = PromptTemplate(
107
+ template=prompt_template,
108
+ input_variables=["context", "chat_history", "question"]
109
+ )
110
+
111
+ if self.vectorstore and self.vectorstore._collection.count() > 0:
112
+ # Create retriever
113
+ retriever = self.vectorstore.as_retriever(
114
+ search_type="similarity",
115
+ search_kwargs={"k": 4} # Retrieve top 4 most relevant chunks
116
+ )
117
+
118
+ # Create conversational retrieval chain
119
+ self.retrieval_chain = ConversationalRetrievalChain.from_llm(
120
+ llm=self.llm,
121
+ retriever=retriever,
122
+ memory=self.memory,
123
+ combine_docs_chain_kwargs={"prompt": PROMPT},
124
+ return_source_documents=True,
125
+ verbose=True
126
+ )
127
  else:
128
+ st.warning("⚠️ No documents in knowledge base. Please refresh the knowledge base first.")
129
+
130
+ def load_and_process_documents(self) -> List[Document]:
131
+ """Load and process JSON documents from the docs directory."""
132
+ documents = []
133
 
134
+ if not os.path.exists(DOCS_DIR):
135
+ st.error(f"❌ Documents directory '{DOCS_DIR}' not found!")
136
+ return documents
137
+
138
+ # Get all JSON files
139
+ json_files = [f for f in os.listdir(DOCS_DIR) if f.endswith('.json')]
140
+
141
+ if not json_files:
142
+ st.warning(f"⚠️ No JSON files found in '{DOCS_DIR}' directory!")
143
+ return documents
144
+
145
+ st.info(f"πŸ“‚ Found {len(json_files)} JSON files to process...")
146
+
147
+ for filename in json_files:
148
+ file_path = os.path.join(DOCS_DIR, filename)
149
+ try:
150
+ # Use JSONLoader with proper schema
151
+ loader = JSONLoader(
152
+ file_path=file_path,
153
+ jq_schema='.[]',
154
+ text_content=False
155
+ )
156
+ file_docs = loader.load()
157
+
158
+ # Add source metadata
159
+ for doc in file_docs:
160
+ doc.metadata['source_file'] = filename
161
+ doc.metadata['file_path'] = file_path
162
+
163
+ documents.extend(file_docs)
164
+ st.success(f"βœ… Loaded {len(file_docs)} documents from {filename}")
165
+
166
+ except Exception as e:
167
+ st.error(f"❌ Error loading {filename}: {str(e)}")
168
+ continue
169
+
170
+ return documents
171
+
172
+ def split_documents(self, documents: List[Document]) -> List[Document]:
173
+ """Split documents into smaller chunks."""
174
+ text_splitter = RecursiveCharacterTextSplitter(
175
+ chunk_size=800,
176
+ chunk_overlap=100,
177
+ length_function=len,
178
+ separators=["\n\n", "\n", " ", ""]
179
  )
180
+
181
+ chunks = text_splitter.split_documents(documents)
182
+ st.info(f"πŸ“„ Created {len(chunks)} document chunks")
183
+ return chunks
184
+
185
+ def clear_knowledge_base(self):
186
+ """Clear the existing knowledge base."""
187
+ try:
188
+ if self.vectorstore:
189
+ # Delete the collection
190
+ self.vectorstore.delete_collection()
191
+ st.success("πŸ—‘οΈ Cleared existing knowledge base")
192
+
193
+ # Recreate empty vectorstore
194
+ self.vectorstore = Chroma(
195
+ persist_directory=CHROMA_PERSIST_DIR,
196
+ embedding_function=self.embeddings,
197
+ collection_name="icodeguru_knowledge"
198
+ )
199
+ except Exception as e:
200
+ st.error(f"❌ Error clearing knowledge base: {str(e)}")
201
+
202
+ def ingest_documents(self):
203
+ """Complete document ingestion pipeline."""
204
+ with st.spinner("πŸ”„ Loading documents..."):
205
+ # Load documents
206
+ documents = self.load_and_process_documents()
207
+
208
+ if not documents:
209
+ st.error("❌ No documents loaded. Please check your docs folder.")
210
+ return False
211
+
212
+ with st.spinner("βœ‚οΈ Splitting documents into chunks..."):
213
+ # Split documents
214
+ chunks = self.split_documents(documents)
215
+
216
+ if not chunks:
217
+ st.error("❌ No document chunks created.")
218
+ return False
219
+
220
+ with st.spinner("🧠 Creating embeddings and storing in vector database..."):
221
+ try:
222
+ # Clear existing data
223
+ self.clear_knowledge_base()
224
+
225
+ # Add chunks to vectorstore
226
+ self.vectorstore.add_documents(chunks)
227
+
228
+ # Persist the vectorstore
229
+ self.vectorstore.persist()
230
+
231
+ st.success(f"βœ… Successfully ingested {len(chunks)} document chunks!")
232
+
233
+ # Recreate retrieval chain with new data
234
+ self.setup_retrieval_chain()
235
+
236
+ return True
237
+
238
+ except Exception as e:
239
+ st.error(f"❌ Error during ingestion: {str(e)}")
240
+ return False
241
+
242
+ def get_answer(self, question: str) -> dict:
243
+ """Get answer for a user question."""
244
+ if not self.retrieval_chain:
245
+ return {
246
+ "answer": "⚠️ Knowledge base is empty. Please refresh the knowledge base first.",
247
+ "source_documents": []
248
+ }
249
+
250
+ try:
251
+ # Get response from the chain
252
+ response = self.retrieval_chain({"question": question})
253
+ return response
254
+ except Exception as e:
255
+ return {
256
+ "answer": f"❌ Error getting answer: {str(e)}",
257
+ "source_documents": []
258
+ }
259
+
260
+ def reset_conversation(self):
261
+ """Reset the conversation memory."""
262
+ self.memory.clear()
263
+ st.success("πŸ”„ Conversation history cleared!")
264
+
265
+ # Initialize the RAG system
266
+ @st.cache_resource
267
+ def get_rag_system():
268
+ """Cache the RAG system to avoid reinitialization."""
269
+ return LangChainRAGSystem()
270
 
 
271
  def main():
272
+ """Main Streamlit application."""
273
+ st.set_page_config(
274
+ page_title="EduBot for iCodeGuru",
275
+ page_icon="πŸŽ“",
276
+ layout="wide",
277
+ initial_sidebar_state="expanded"
278
+ )
279
+
280
+ # Header
281
  st.title("πŸŽ“ EduBot for @icodeguru0")
282
+ st.markdown("**Powered by LangChain** | Ask anything based on pre-loaded iCodeGuru knowledge.")
283
+
284
+ # Initialize RAG system
285
+ rag_system = get_rag_system()
286
+
287
+ # Sidebar for admin functions
288
+ with st.sidebar:
289
+ st.header("βš™οΈ Admin Panel")
290
+
291
+ if st.button("πŸ”„ Refresh Knowledge Base", type="primary"):
292
+ success = rag_system.ingest_documents()
293
+ if success:
294
+ st.balloons()
295
+
296
+ if st.button("πŸ—‘οΈ Clear Conversation"):
297
+ rag_system.reset_conversation()
298
+
299
+ st.markdown("---")
300
+ st.subheader("πŸ“Š System Info")
301
+
302
+ # Show vectorstore stats
303
+ if rag_system.vectorstore:
304
+ try:
305
+ doc_count = rag_system.vectorstore._collection.count()
306
+ st.metric("Documents in KB", doc_count)
307
+ except:
308
+ st.metric("Documents in KB", "N/A")
309
+
310
+ st.markdown("---")
311
+ st.caption("🧠 **ChromaDB** for vector storage")
312
+ st.caption("⚑ **Groq LLM** for answers")
313
+ st.caption("πŸ”— **LangChain** for orchestration")
314
+
315
+ # Main chat interface
316
  st.markdown("---")
317
+
318
+ # Initialize session state for chat history
319
+ if "messages" not in st.session_state:
320
+ st.session_state.messages = []
321
+
322
+ # Display chat history
323
+ for message in st.session_state.messages:
324
+ with st.chat_message(message["role"]):
325
+ st.markdown(message["content"])
326
+ if "sources" in message and message["sources"]:
327
+ with st.expander("πŸ“š Sources"):
328
+ for i, source in enumerate(message["sources"], 1):
329
+ st.markdown(f"**Source {i}:** {source}")
330
+
331
+ # User input
332
+ if prompt := st.chat_input("πŸ’¬ Ask your question about iCodeGuru..."):
333
+ # Add user message to chat history
334
+ st.session_state.messages.append({"role": "user", "content": prompt})
335
+
336
+ # Display user message
337
+ with st.chat_message("user"):
338
+ st.markdown(prompt)
339
+
340
+ # Get assistant response
341
+ with st.chat_message("assistant"):
342
+ with st.spinner("πŸ€” Thinking..."):
343
+ response = rag_system.get_answer(prompt)
344
+ answer = response.get("answer", "No answer available.")
345
+ source_docs = response.get("source_documents", [])
346
+
347
+ st.markdown(answer)
348
+
349
+ # Show sources if available
350
+ if source_docs:
351
+ sources = []
352
+ for doc in source_docs[:3]: # Show top 3 sources
353
+ source = doc.metadata.get('source_file', 'Unknown source')
354
+ content_preview = doc.page_content[:100] + "..." if len(doc.page_content) > 100 else doc.page_content
355
+ sources.append(f"{source}: {content_preview}")
356
+
357
+ if sources:
358
+ with st.expander("πŸ“š Sources"):
359
+ for i, source in enumerate(sources, 1):
360
+ st.markdown(f"**Source {i}:** {source}")
361
+
362
+ # Add to session state with sources
363
+ st.session_state.messages.append({
364
+ "role": "assistant",
365
+ "content": answer,
366
+ "sources": sources
367
+ })
368
+ else:
369
+ st.session_state.messages.append({"role": "assistant", "content": answer})
370
+ else:
371
+ st.session_state.messages.append({"role": "assistant", "content": answer})
372
 
373
  if __name__ == "__main__":
374
+ main()