ahmadsanafarooq committed on
Commit
f0f7c9d
·
verified ·
1 Parent(s): 7503262

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +147 -109
app.py CHANGED
@@ -1,24 +1,27 @@
 
1
  import os
2
  import gradio as gr
3
- import tempfile
 
 
 
 
 
4
  from langchain.text_splitter import RecursiveCharacterTextSplitter
5
  from langchain_community.vectorstores import Chroma
6
  from langchain.chains import RetrievalQA
7
- from langchain_groq import ChatGroq
8
- from langchain_community.document_loaders import TextLoader, PyPDFLoader
9
  from langchain.schema import Document
10
- from pathlib import Path
11
- from typing import List
12
- import logging
13
  import numpy as np
14
  from sklearn.feature_extraction.text import TfidfVectorizer
15
- from dotenv import load_dotenv
16
 
17
- # Logging config
18
  logging.basicConfig(level=logging.INFO)
19
  logger = logging.getLogger(__name__)
20
 
21
- # TF-IDF fallback embedding class
22
  class SimpleEmbeddings:
23
  def __init__(self):
24
  self.vectorizer = TfidfVectorizer(max_features=384, stop_words='english')
@@ -28,42 +31,57 @@ class SimpleEmbeddings:
28
  if not self.fitted:
29
  self.vectorizer.fit(texts)
30
  self.fitted = True
31
- return self.vectorizer.transform(texts).toarray().tolist()
 
32
 
33
  def embed_query(self, text: str) -> List[float]:
34
  if not self.fitted:
35
  return [0.0] * 384
36
- return self.vectorizer.transform([text]).toarray()[0].tolist()
 
 
37
 
38
- # Core RAG Assistant class
39
  class RAGAssistant:
40
  def __init__(self, groq_api_key: str):
41
  self.groq_api_key = groq_api_key
42
  self.embeddings = self._init_embeddings()
43
- self.text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
 
 
 
 
 
 
44
  self.learning_vectorstore = None
45
  self.code_vectorstore = None
46
- self.llm = ChatGroq(groq_api_key=groq_api_key, model_name="llama3-70b-8192", temperature=0.1)
 
 
 
 
 
 
47
  self.learning_persist_dir = "./chroma_learning_db"
48
  self.code_persist_dir = "./chroma_code_db"
 
49
  self._init_vector_stores()
50
 
51
  def _init_embeddings(self):
52
  try:
53
  from langchain_huggingface import HuggingFaceEmbeddings
54
- for model_name in ["all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2", "all-mpnet-base-v2"]:
 
55
  try:
56
  embeddings = HuggingFaceEmbeddings(
57
  model_name=model_name,
58
  model_kwargs={'device': 'cpu'},
59
  encode_kwargs={'normalize_embeddings': False}
60
  )
61
- print(f"Loaded HuggingFace model: {model_name}")
62
  return embeddings
63
- except Exception as e:
64
- print(f"Failed to load {model_name}: {e}")
65
  except ImportError:
66
- print("HuggingFace not installed. Using TF-IDF fallback.")
67
  return SimpleEmbeddings()
68
 
69
  def _init_vector_stores(self):
@@ -82,15 +100,15 @@ class RAGAssistant:
82
  try:
83
  documents = []
84
  for file_path in files:
85
- try:
86
- loader = PyPDFLoader(file_path) if file_path.lower().endswith(".pdf") else TextLoader(file_path)
87
- docs = loader.load()
88
- documents.extend(docs)
89
- except Exception as e:
90
- print(f"Error loading {file_path}: {e}")
91
 
92
  if not documents:
93
- return "\u274C No documents could be loaded. Please check your file type or content."
94
 
95
  chunks = self.text_splitter.split_documents(documents)
96
  for chunk in chunks:
@@ -99,147 +117,167 @@ class RAGAssistant:
99
  if assistant_type == "learning":
100
  self.learning_vectorstore.add_documents(chunks)
101
  self.learning_vectorstore.persist()
102
- else:
103
  self.code_vectorstore.add_documents(chunks)
104
  self.code_vectorstore.persist()
105
 
106
- return f"\u2705 Loaded {len(chunks)} chunks from {len(documents)} documents."
107
  except Exception as e:
108
- return f"\u274C Failed: {str(e)}"
109
 
110
  def get_learning_tutor_response(self, question: str) -> str:
111
  if not self.learning_vectorstore:
112
- return "Please upload learning materials first."
113
-
114
  qa_chain = RetrievalQA.from_chain_type(
115
  llm=self.llm,
116
  chain_type="stuff",
117
  retriever=self.learning_vectorstore.as_retriever(search_kwargs={"k": 3}),
118
  return_source_documents=True
119
  )
120
-
121
- learning_prompt = f"""
122
- You are a helpful educational assistant. Use the following materials to answer:
123
-
124
- {question}
125
- """
126
- result = qa_chain({"query": learning_prompt})
127
  response = result['result']
128
  if result.get('source_documents'):
129
  response += "\n\n**Sources:**\n"
130
  for doc in result['source_documents'][:3]:
131
- response += f"- {Path(doc.metadata.get('source', 'unknown')).name}\n"
132
  return response
133
 
134
  def get_code_helper_response(self, question: str) -> str:
135
  if not self.code_vectorstore:
136
- return "Please upload code documents first."
137
-
138
  qa_chain = RetrievalQA.from_chain_type(
139
  llm=self.llm,
140
  chain_type="stuff",
141
  retriever=self.code_vectorstore.as_retriever(search_kwargs={"k": 3}),
142
  return_source_documents=True
143
  )
144
-
145
- code_prompt = f"""
146
- You are a technical assistant helping with code and APIs. Use the following context to respond:
147
-
148
- {question}
149
- """
150
- result = qa_chain({"query": code_prompt})
151
  response = result['result']
152
  if result.get('source_documents'):
153
- response += "\n\n**Sources:**\n"
154
  for doc in result['source_documents'][:3]:
155
- response += f"- {Path(doc.metadata.get('source', 'unknown')).name}\n"
156
  return response
157
 
158
- # Helper for Hugging Face uploads
159
-
160
- def save_uploaded_files(files):
161
- file_paths = []
162
- for f in files:
163
- if hasattr(f, "data") and hasattr(f, "name"):
164
- suffix = Path(f.name).suffix
165
- with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
166
- tmp.write(f.data)
167
- file_paths.append(tmp.name)
168
- elif hasattr(f, "path"):
169
- file_paths.append(f.path)
170
- return file_paths
171
-
172
- # Gradio interface setup
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
173
  def create_gradio_interface(assistant: RAGAssistant):
174
  def upload_learning_files(files):
175
- if not files:
176
- return "No files uploaded."
177
- file_paths = save_uploaded_files(files)
178
- return assistant.load_documents(file_paths, "learning")
179
 
180
  def upload_code_files(files):
181
- if not files:
182
- return "No files uploaded."
183
- file_paths = save_uploaded_files(files)
184
- return assistant.load_documents(file_paths, "code")
185
 
186
  def learning_chat(message, history):
187
- if not message.strip(): return history, ""
 
188
  response = assistant.get_learning_tutor_response(message)
189
  history.append((message, response))
190
  return history, ""
191
 
192
  def code_chat(message, history):
193
- if not message.strip(): return history, ""
 
194
  response = assistant.get_code_helper_response(message)
195
  history.append((message, response))
196
  return history, ""
197
 
198
- with gr.Blocks(title="RAG Assistant", theme=gr.themes.Soft()) as demo:
199
- gr.Markdown("# πŸŽ“ Learning & πŸ’» Code Assistant")
 
200
  with gr.Tabs():
201
- with gr.Tab("Learning Tutor"):
202
- with gr.Row():
203
- with gr.Column(scale=1):
204
- lf = gr.File(label="Upload Materials", file_types=['.pdf', '.txt'], file_count="multiple")
205
- lbtn = gr.Button("Upload")
206
- lstat = gr.Textbox(label="Upload Status", interactive=False)
207
- with gr.Column(scale=2):
208
- lchat = gr.Chatbot()
209
- lin = gr.Textbox(label="Ask your question")
210
- lsend = gr.Button("Ask")
211
- lbtn.click(upload_learning_files, inputs=[lf], outputs=[lstat])
212
- lsend.click(learning_chat, inputs=[lin, lchat], outputs=[lchat, lin])
213
- lin.submit(learning_chat, inputs=[lin, lchat], outputs=[lchat, lin])
214
-
215
- with gr.Tab("Code Helper"):
216
- with gr.Row():
217
- with gr.Column(scale=1):
218
- cf = gr.File(label="Upload Code Docs", file_types=['.pdf', '.txt', '.md', '.py', '.js'], file_count="multiple")
219
- cbtn = gr.Button("Upload")
220
- cstat = gr.Textbox(label="Upload Status", interactive=False)
221
- with gr.Column(scale=2):
222
- cchat = gr.Chatbot()
223
- cin = gr.Textbox(label="Ask a code question")
224
- csend = gr.Button("Ask")
225
- cbtn.click(upload_code_files, inputs=[cf], outputs=[cstat])
226
- csend.click(code_chat, inputs=[cin, cchat], outputs=[cchat, cin])
227
- cin.submit(code_chat, inputs=[cin, cchat], outputs=[cchat, cin])
228
-
229
- gr.Markdown("---\n*Powered by Groq, LangChain, ChromaDB*")
 
 
 
 
 
 
 
 
230
 
231
  return demo
232
 
 
233
  def main():
234
  load_dotenv()
235
  groq_api_key = os.getenv("GROQ_API_KEY")
236
  if not groq_api_key:
237
- print("Missing GROQ_API_KEY in .env file.")
238
  return
239
-
240
  assistant = RAGAssistant(groq_api_key)
241
  demo = create_gradio_interface(assistant)
242
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=True)
 
243
 
244
  if __name__ == "__main__":
245
  main()
 
1
+ # Full merged code including the evaluation logic and tab
2
  import os
3
  import gradio as gr
4
+ import json
5
+ import logging
6
+ from pathlib import Path
7
+ from typing import List
8
+ from dotenv import load_dotenv
9
+
10
  from langchain.text_splitter import RecursiveCharacterTextSplitter
11
  from langchain_community.vectorstores import Chroma
12
  from langchain.chains import RetrievalQA
 
 
13
  from langchain.schema import Document
14
+ from langchain_community.document_loaders import TextLoader, PyPDFLoader
15
+ from langchain_groq import ChatGroq
16
+
17
  import numpy as np
18
  from sklearn.feature_extraction.text import TfidfVectorizer
 
19
 
20
+ # Logging
21
  logging.basicConfig(level=logging.INFO)
22
  logger = logging.getLogger(__name__)
23
 
24
+
25
  class SimpleEmbeddings:
26
  def __init__(self):
27
  self.vectorizer = TfidfVectorizer(max_features=384, stop_words='english')
 
31
  if not self.fitted:
32
  self.vectorizer.fit(texts)
33
  self.fitted = True
34
+ embeddings = self.vectorizer.transform(texts)
35
+ return embeddings.toarray().tolist()
36
 
37
  def embed_query(self, text: str) -> List[float]:
38
  if not self.fitted:
39
  return [0.0] * 384
40
+ embedding = self.vectorizer.transform([text])
41
+ return embedding.toarray()[0].tolist()
42
+
43
 
 
44
  class RAGAssistant:
45
  def __init__(self, groq_api_key: str):
46
  self.groq_api_key = groq_api_key
47
  self.embeddings = self._init_embeddings()
48
+
49
+ self.text_splitter = RecursiveCharacterTextSplitter(
50
+ chunk_size=1000,
51
+ chunk_overlap=200,
52
+ length_function=len
53
+ )
54
+
55
  self.learning_vectorstore = None
56
  self.code_vectorstore = None
57
+
58
+ self.llm = ChatGroq(
59
+ groq_api_key=groq_api_key,
60
+ model_name="llama3-70b-8192",
61
+ temperature=0.1
62
+ )
63
+
64
  self.learning_persist_dir = "./chroma_learning_db"
65
  self.code_persist_dir = "./chroma_code_db"
66
+
67
  self._init_vector_stores()
68
 
69
  def _init_embeddings(self):
70
  try:
71
  from langchain_huggingface import HuggingFaceEmbeddings
72
+ models_to_try = ["all-MiniLM-L6-v2", "paraphrase-MiniLM-L3-v2", "all-mpnet-base-v2"]
73
+ for model_name in models_to_try:
74
  try:
75
  embeddings = HuggingFaceEmbeddings(
76
  model_name=model_name,
77
  model_kwargs={'device': 'cpu'},
78
  encode_kwargs={'normalize_embeddings': False}
79
  )
 
80
  return embeddings
81
+ except Exception:
82
+ continue
83
  except ImportError:
84
+ pass
85
  return SimpleEmbeddings()
86
 
87
  def _init_vector_stores(self):
 
100
  try:
101
  documents = []
102
  for file_path in files:
103
+ if file_path.endswith('.pdf'):
104
+ loader = PyPDFLoader(file_path)
105
+ else:
106
+ loader = TextLoader(file_path, encoding='utf-8')
107
+ docs = loader.load()
108
+ documents.extend(docs)
109
 
110
  if not documents:
111
+ return "No documents could be loaded."
112
 
113
  chunks = self.text_splitter.split_documents(documents)
114
  for chunk in chunks:
 
117
  if assistant_type == "learning":
118
  self.learning_vectorstore.add_documents(chunks)
119
  self.learning_vectorstore.persist()
120
+ elif assistant_type == "code":
121
  self.code_vectorstore.add_documents(chunks)
122
  self.code_vectorstore.persist()
123
 
124
+ return f"Loaded {len(chunks)} chunks from {len(documents)} documents."
125
  except Exception as e:
126
+ return f"Error loading documents: {str(e)}"
127
 
128
  def get_learning_tutor_response(self, question: str) -> str:
129
  if not self.learning_vectorstore:
130
+ return "Please upload some learning materials."
 
131
  qa_chain = RetrievalQA.from_chain_type(
132
  llm=self.llm,
133
  chain_type="stuff",
134
  retriever=self.learning_vectorstore.as_retriever(search_kwargs={"k": 3}),
135
  return_source_documents=True
136
  )
137
+ result = qa_chain({"query": question})
 
 
 
 
 
 
138
  response = result['result']
139
  if result.get('source_documents'):
140
  response += "\n\n**Sources:**\n"
141
  for doc in result['source_documents'][:3]:
142
+ response += f"- {Path(doc.metadata.get('source', 'Unknown')).name}\n"
143
  return response
144
 
145
  def get_code_helper_response(self, question: str) -> str:
146
  if not self.code_vectorstore:
147
+ return "Please upload some code documentation."
 
148
  qa_chain = RetrievalQA.from_chain_type(
149
  llm=self.llm,
150
  chain_type="stuff",
151
  retriever=self.code_vectorstore.as_retriever(search_kwargs={"k": 3}),
152
  return_source_documents=True
153
  )
154
+ result = qa_chain({"query": question})
 
 
 
 
 
 
155
  response = result['result']
156
  if result.get('source_documents'):
157
+ response += "\n\n**Documentation Sources:**\n"
158
  for doc in result['source_documents'][:3]:
159
+ response += f"- {Path(doc.metadata.get('source', 'Unknown')).name}\n"
160
  return response
161
 
162
+
163
# --- Evaluation Utilities ---
def evaluate_retrieval(assistant, assistant_type: str, eval_file: str, k=3):
    """Run a keyword-based evaluation over a JSON file of queries.

    The file must contain a list of objects with "question" and "keywords"
    keys.  A query counts as a hit when any expected keyword appears
    (case-insensitively) in the assistant's answer text.

    NOTE(review): true retrieval ranks are not observable from the answer
    text, so each hit is treated as rank 1 — MRR here equals the hit rate,
    and precision/recall are both reported as the keyword hit rate.

    Args:
        assistant: object exposing get_learning_tutor_response /
            get_code_helper_response (and an .embeddings attribute).
        assistant_type: "learning" routes to the tutor; anything else to
            the code helper.
        eval_file: path to the JSON evaluation set.
        k: retriever top-k, reported for reference only.

    Returns:
        A human-readable evaluation summary, or an error message string.
    """
    try:
        if not os.path.exists(eval_file):
            return f"Evaluation file {eval_file} not found."

        with open(eval_file, 'r', encoding='utf-8') as f:
            eval_data = json.load(f)

        total = len(eval_data)
        hits = 0
        mrr = 0.0

        for item in eval_data:
            question = item.get("question", "")
            keywords = item.get("keywords", [])
            if assistant_type == "learning":
                result = assistant.get_learning_tutor_response(question)
            else:
                result = assistant.get_code_helper_response(question)
            answer = result.lower()
            if any(kw.lower() in answer for kw in keywords):
                hits += 1
                # BUG FIX: the old code added 1/(idx+1) where idx was the
                # query's position in the eval FILE, so "MRR" depended on
                # file order.  Without real rank data, count each hit as
                # rank 1.
                mrr += 1.0

        precision = hits / total if total else 0
        recall = precision
        mean_rr = mrr / total if total else 0

        return f"""
📊 Evaluation Summary ({assistant_type.title()} Assistant):
- Total Queries: {total}
- Precision@{k}: {precision:.2f}
- Recall@{k}: {recall:.2f}
- MRR: {mean_rr:.2f}

⚙️ Config:
- Retriever Top-K: {k}
- Embedding Model: {getattr(assistant.embeddings, 'model_name', 'TF-IDF (fallback)')}
- Vector Store: ChromaDB
"""
    except Exception as e:
        return f"Evaluation error: {str(e)}"
203
+
204
+
205
def create_gradio_interface(assistant: RAGAssistant):
    """Build the Gradio Blocks UI wired to the given RAG assistant.

    Three tabs: learning tutor, code documentation helper, and an
    evaluation bench that runs evaluate_retrieval over a JSON eval set.

    Returns:
        The constructed gr.Blocks demo (not yet launched).
    """

    def upload_learning_files(files):
        # Gradio file objects expose their temp path via .name.
        if not files:
            return "No files uploaded."
        return assistant.load_documents([f.name for f in files], "learning")

    def upload_code_files(files):
        if not files:
            return "No files uploaded."
        return assistant.load_documents([f.name for f in files], "code")

    def learning_chat(message, history):
        # Ignore empty/whitespace-only submissions.
        if not message.strip():
            return history, ""
        response = assistant.get_learning_tutor_response(message)
        history.append((message, response))
        return history, ""

    def code_chat(message, history):
        if not message.strip():
            return history, ""
        response = assistant.get_code_helper_response(message)
        history.append((message, response))
        return history, ""

    with gr.Blocks(title="RAG-Based Learning & Code Assistant", theme=gr.themes.Soft()) as demo:
        gr.Markdown("# 🎓 RAG-Based Learning & Code Assistant")

        with gr.Tabs():
            with gr.TabItem("📚 Learning Tutor"):
                learning_files = gr.File(file_types=[".pdf", ".txt", ".md"], file_count="multiple")
                learning_upload_btn = gr.Button("Upload Materials")
                learning_status = gr.Textbox()
                learning_chatbot = gr.Chatbot(height=400)
                learning_input = gr.Textbox(placeholder="Ask about your learning materials")
                learning_submit = gr.Button("Ask")

                learning_upload_btn.click(upload_learning_files, [learning_files], [learning_status])
                learning_submit.click(learning_chat, [learning_input, learning_chatbot], [learning_chatbot, learning_input])
                learning_input.submit(learning_chat, [learning_input, learning_chatbot], [learning_chatbot, learning_input])

            with gr.TabItem("💻 Code Documentation Helper"):
                code_files = gr.File(file_types=[".pdf", ".txt", ".md", ".py", ".js", ".json"], file_count="multiple")
                code_upload_btn = gr.Button("Upload Documentation")
                code_status = gr.Textbox()
                code_chatbot = gr.Chatbot(height=400)
                code_input = gr.Textbox(placeholder="Ask about your codebase or APIs")
                code_submit = gr.Button("Ask")

                code_upload_btn.click(upload_code_files, [code_files], [code_status])
                code_submit.click(code_chat, [code_input, code_chatbot], [code_chatbot, code_input])
                code_input.submit(code_chat, [code_input, code_chatbot], [code_chatbot, code_input])

            with gr.TabItem("📈 Evaluation Bench"):
                # FIX: the old help text claimed .pdf/.txt/.md were supported,
                # but the file input below only accepts .json.
                gr.Markdown("Upload a JSON file containing evaluation questions and expected keywords.")
                eval_file_input = gr.File(file_types=[".json"], file_count="single")
                eval_assistant_choice = gr.Radio(["learning", "code"], label="Assistant", value="learning")
                eval_button = gr.Button("Run Evaluation")
                eval_output = gr.Textbox(lines=10)

                def run_eval(file, assistant_type):
                    if file is None:
                        return "Please upload a valid evaluation file."
                    return evaluate_retrieval(assistant, assistant_type, file.name)

                eval_button.click(run_eval, [eval_file_input, eval_assistant_choice], [eval_output])

    return demo
269
 
270
+
271
def main():
    """Load configuration from .env and launch the Gradio app."""
    load_dotenv()
    groq_api_key = os.getenv("GROQ_API_KEY")
    if not groq_api_key:
        # FIX: use the module logger instead of print, consistent with the
        # logging configuration at the top of the file.
        logger.error("Set your GROQ_API_KEY in .env file")
        return

    assistant = RAGAssistant(groq_api_key)
    demo = create_gradio_interface(assistant)
    # Bind to 0.0.0.0 so the app is reachable inside a container
    # (e.g. Hugging Face Spaces).
    demo.launch(server_name="0.0.0.0", server_port=7860, debug=True)


if __name__ == "__main__":
    main()