bshk57 committed on
Commit
7200c88
·
verified ·
1 Parent(s): a4b2d10

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +524 -0
  2. requirements.txt +13 -0
  3. training_data.xlsx +0 -0
app.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Install required packages
2
+ #!pip install langchain langchain-community chromadb sentence-transformers transformers gradio deep-translator openpyxl --quiet
3
+ #!pip install --upgrade protobuf==4.23.3
4
+
5
+ import os, json
6
+ from datetime import datetime
7
+ import pandas as pd
8
+ from collections import Counter
9
+ from langchain_core.documents import Document
10
+ from langchain_community.document_loaders import WebBaseLoader
11
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
12
+ from langchain_community.embeddings import HuggingFaceEmbeddings
13
+ from langchain_community.vectorstores import Chroma
14
+ from langchain.chains import RetrievalQA
15
+ from langchain.prompts import PromptTemplate
16
+ from transformers import pipeline
17
+ from langchain.llms import HuggingFacePipeline
18
+ from deep_translator import GoogleTranslator
19
+ import gradio as gr
20
+ import re
21
+
22
+ # ---------------------------
23
+ # 1️⃣ Configuration
24
+ # ---------------------------
25
# Pages scraped to build the retrieval corpus.
SASTRA_URLS = [
    "https://www.sastra.edu/about-us.html",
    "https://www.sastra.edu/academics/schools.html#school-of-computing",
    "https://www.sastra.edu/admissions/ug-pg.html",
    "https://www.sastra.edu/admissions/eligibility-criteria.html",
    "https://www.sastra.edu/admissions/fee-structure.html",
    "https://www.sastra.edu/admissions/hostel-fees.html",
    "https://www.sastra.edu/infrastructure/physical-facilities.html",
    "https://www.sastra.edu/about-us/mission-vision.html",
]

# File locations and model identifiers used throughout the app.
EXCEL_FILE = "training_data.xlsx"
VECTOR_DB_PATH = "sastra_local_db"
LOG_FILE = "query_logs.json"
ANALYTICS_FILE = "analytics_data.json"
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Read the admin password from the environment so the secret does not have to
# live in source control; the literal is kept only as a backward-compatible
# fallback and should be overridden in any real deployment.
ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD", "sastra_admin_2024")

# Global variables for dynamic retraining (populated by initialize_model).
vectordb = None
retriever = None
qa_chain = None
keyword_responses = []
48
+
49
+ # ---------------------------
50
+ # 2️⃣ Load keyword-response data from Excel
51
+ # ---------------------------
52
def load_keyword_responses(file_path):
    """Load keyword-response pairs from an Excel file.

    The sheet is expected to have a ``Keywords`` column (comma-separated
    keywords) and a ``Response`` column. Each keyword is stripped and
    lower-cased and paired with the row's response text.

    Args:
        file_path: Path to the Excel workbook.

    Returns:
        A list of ``(keyword, response)`` tuples. Returns an empty list if the
        file cannot be read or lacks the expected columns (the error is
        printed, not raised).
    """
    try:
        df = pd.read_excel(file_path)
        pairs = []
        for _, row in df.iterrows():
            if pd.isna(row['Keywords']):
                continue
            response = str(row['Response']) if pd.notna(row['Response']) else ""
            for raw_kw in str(row['Keywords']).split(','):
                kw = raw_kw.strip().lower()
                # Skip blanks produced by trailing/double commas: an empty
                # keyword is a substring of every query and would hijack
                # all answers during keyword matching.
                if kw:
                    pairs.append((kw, response))
        return pairs
    except Exception as e:
        # Best-effort loader: the app falls back to pure RAG without keywords.
        print(f"Error loading keyword responses: {e}")
        return []
66
+
67
+ # ---------------------------
68
+ # 3️⃣ Initialize model and vectorstore
69
+ # ---------------------------
70
def initialize_model(excel_path=EXCEL_FILE):
    """Initialize or reinitialize the model with new data.

    Loads keyword/response pairs from ``excel_path``, scrapes ``SASTRA_URLS``,
    chunks and de-duplicates the text, embeds it into a Chroma collection and
    builds a RetrievalQA chain on top of a local FLAN-T5 pipeline.

    Everything is built into local variables first and the module globals
    (``vectordb``, ``retriever``, ``qa_chain``, ``keyword_responses``) are only
    replaced once the whole build has succeeded, so a failed retrain leaves the
    previously working model untouched.

    Args:
        excel_path: Path to the Excel workbook with keyword/response pairs.

    Returns:
        A human-readable success message.

    Raises:
        ValueError: If no document chunks could be produced.
        Exception: Any error raised by the embedding, vector store or LLM
            libraries is propagated to the caller.
    """
    global vectordb, retriever, qa_chain, keyword_responses

    print("πŸ”„ Initializing model...")

    # Load keyword responses
    new_keyword_responses = load_keyword_responses(excel_path)
    print(f"βœ… Loaded {len(new_keyword_responses)} keyword-response pairs")

    # Load documents from URLs; a failing page is skipped, not fatal.
    docs = []
    for url in SASTRA_URLS:
        try:
            docs.extend(WebBaseLoader(url).load())
            print(f"βœ… Loaded: {url}")
        except Exception as e:
            print(f"⚠ Error loading {url}: {e}")

    # Add Excel data as additional documents
    docs.extend(
        Document(
            page_content=f"Keyword: {kw}\nResponse: {resp}",
            metadata={"source": "training_data"},
        )
        for kw, resp in new_keyword_responses
        if kw and resp
    )

    print(f"πŸ“„ Total documents loaded: {len(docs)}")

    # Split documents
    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)

    # Remove duplicate chunks (compared on stripped text, first one wins).
    seen_content = set()
    chunks = []
    for chunk in splitter.split_documents(docs):
        content = chunk.page_content.strip()
        if content not in seen_content:
            seen_content.add(content)
            chunks.append(chunk)

    print(f"πŸ“Š Created {len(chunks)} unique chunks")

    if not chunks:
        # Nothing to index (all pages failed and the Excel sheet was empty).
        raise ValueError("No documents could be loaded; cannot build the vector store.")

    # Create embeddings and vector store.
    # Each build goes into its own collection: re-using the default collection
    # in a persisted directory would append the same chunks again on every
    # restart/retrain, and the k=3 retriever would then return duplicates.
    # NOTE(review): collections left behind by earlier processes are not
    # cleaned up here; confirm whether VECTOR_DB_PATH needs periodic pruning.
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    collection_name = f"sastra_{datetime.now():%Y%m%d%H%M%S%f}"
    new_vectordb = Chroma.from_documents(
        chunks,
        embeddings,
        collection_name=collection_name,
        persist_directory=VECTOR_DB_PATH,
    )
    new_retriever = new_vectordb.as_retriever(search_kwargs={"k": 3})

    print("πŸ” Vector store created")

    # Initialize LLM with better parameters
    MODEL_ID = "google/flan-t5-base"
    generator = pipeline(
        "text2text-generation",
        model=MODEL_ID,
        tokenizer=MODEL_ID,
        max_new_tokens=200,
        temperature=0.1,
        top_p=0.85,
        do_sample=True,
        repetition_penalty=1.2
    )
    llm = HuggingFacePipeline(pipeline=generator)

    print("πŸ€– LLM initialized")

    # Create prompt template
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""You are a SASTRA University information assistant. Use the context below to answer the question.

Context:
{context}

Instructions:
- Give a direct, concise answer based ONLY on the context provided
- Do NOT start with "Answer:", "Response:", or any prefix
- Include URLs and emails exactly as they appear in the context
- Combine information from multiple contexts if they relate to the same topic
- If context is insufficient, respond with only: "INSUFFICIENT_DATA"

Question: {question}

Direct Answer:"""
    )

    # Create RAG chain
    new_qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=new_retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=False
    )

    # Everything built successfully: swap the live objects in one place.
    previous_vectordb = vectordb
    keyword_responses = new_keyword_responses
    vectordb = new_vectordb
    retriever = new_retriever
    qa_chain = new_qa_chain

    # Drop the collection of the model we just replaced (best effort).
    if previous_vectordb is not None:
        try:
            previous_vectordb.delete_collection()
        except Exception as e:
            print(f"⚠ Could not remove previous collection: {e}")

    print("βœ… Model initialization complete!")
    return "Model initialized successfully!"
171
+
172
+ # Initialize on startup
173
# Build the model once when the module is loaded. A failure is only reported,
# so the UI can still start and an admin can retry via the retrain panel.
try:
    initialize_model()
except Exception as startup_error:
    print(f"⚠ Initial model loading failed: {startup_error}")
177
+
178
+
179
+
180
+ # ---------------------------
181
+ # 4️⃣ Query logging with analytics
182
+ # ---------------------------
183
def log_query(query, answer, language="en", response_type="success"):
    """Append one query record to the JSON log file used for analytics.

    The log (``LOG_FILE``) is a JSON array of entries. Logging is best-effort:
    any failure is printed and never raised to the caller.

    Args:
        query: The user's original question.
        answer: The answer returned to the user.
        language: Language code of the conversation.
        response_type: How the answer was produced (e.g. ``"keyword_match"``).
    """
    entry = {
        "query": query,
        "answer": answer,
        "language": language,
        "response_type": response_type,
        "timestamp": datetime.now().isoformat()
    }

    try:
        logs = []
        if os.path.exists(LOG_FILE):
            try:
                with open(LOG_FILE, "r", encoding="utf-8") as f:
                    loaded = json.load(f)
                if not isinstance(loaded, list):
                    raise ValueError("log file does not contain a JSON list")
                logs = loaded
            except ValueError as e:
                # A corrupt log used to make every later call fail. Set the
                # bad file aside for inspection and start a fresh log instead.
                print(f"Logging error: unreadable log file, starting fresh: {e}")
                os.replace(LOG_FILE, f"{LOG_FILE}.corrupt")

        logs.append(entry)

        # Write to a temp file and rename so a crash mid-write cannot leave a
        # truncated (and therefore unparseable) log behind.
        tmp_path = f"{LOG_FILE}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(logs, f, ensure_ascii=False, indent=2)
        os.replace(tmp_path, LOG_FILE)
    except Exception as e:
        print(f"Logging error: {e}")
206
+
207
+ # ---------------------------
208
+ # 5️⃣ Keyword matching function
209
+ # ---------------------------
210
def match_keyword(query):
    """Return the predefined response for the first keyword found in *query*.

    Matching is a case-insensitive substring test against the module-level
    ``keyword_responses`` list, in list order.

    Args:
        query: The (English) user question; may be empty or ``None``.

    Returns:
        The response string of the first matching keyword, or ``None`` when
        nothing matches.
    """
    if not query:
        return None

    query_lower = query.lower()
    for kw, resp in keyword_responses:
        # An empty keyword is a substring of every string; ignore it so one
        # malformed spreadsheet cell cannot answer all questions.
        if kw and kw in query_lower:
            return resp
    return None
217
+
218
+ # ---------------------------
219
+ # 6️⃣ Format response with clickable links
220
+ # ---------------------------
221
def format_response(answer):
    """Format a response with clickable links and clean HTML.

    Removes known malformed markup fragments coming from the Excel data, then
    wraps bare URLs and e-mail addresses in ``<a>`` tags. Text that already
    contains anchors (``<a href=``) or ``mailto:`` links is not re-linked.

    Args:
        answer: Raw answer text (may already contain HTML).

    Returns:
        The answer with URLs and e-mail addresses turned into HTML links.
    """

    # Clean up malformed HTML from Excel data
    answer = re.sub(r'__.*?target="_blank">____', '', answer)
    # The pointing-hand emoji was mis-encoded in this pattern, so it could
    # never match the real "👉Click" fragments it is meant to remove.
    answer = re.sub(r"__.*?'>👉Click__", '', answer)
    answer = re.sub(r'__+', '', answer)

    def make_link(match):
        """Wrap one URL match in an anchor, keeping sentence punctuation outside."""
        url = match.group(0)
        trimmed = url.rstrip('.,;:!?')
        # A closing parenthesis with no opening one belongs to the sentence.
        if trimmed.endswith(')') and '(' not in trimmed:
            trimmed = trimmed.rstrip(')').rstrip('.,;:!?')
        trailing = url[len(trimmed):]
        return f'<a href="{trimmed}" target="_blank">{trimmed}</a>{trailing}'

    # Make URLs clickable (avoid already linked URLs)
    if '<a href=' not in answer:
        answer = re.sub(r'https?://[^\s<>"\']+', make_link, answer)

    # Make emails clickable (avoid already linked emails)
    if 'mailto:' not in answer:
        # TLD class is letters only; the previous "[A-Z|a-z]" also accepted "|".
        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
        answer = re.sub(email_pattern, r'<a href="mailto:\g<0>" target="_blank">\g<0></a>', answer)

    return answer
247
+
248
+ # ---------------------------
249
+ # 7️⃣ Clean LLM output
250
+ # ---------------------------
251
def clean_llm_output(text):
    """Clean and format LLM output.

    Strips a leading "Answer:" / "Response:" / "Direct Answer:" prefix, removes
    a stray ``INSUFFICIENT_DATA`` marker when it is embedded in a longer
    answer (more than three words), and collapses all whitespace runs,
    including newlines, into single spaces.

    Args:
        text: Raw text produced by the LLM; may be empty or ``None``.

    Returns:
        The cleaned single-line answer (``""`` for empty input).
    """
    if not text:
        return ""

    # Remove common prefixes
    text = re.sub(r'^(Answer:|Response:|Direct Answer:)\s*', '', text.strip(), flags=re.IGNORECASE)

    # Remove "INSUFFICIENT_DATA" if it appears with other text. Replace it with
    # a space (not nothing) so the words on either side are not glued together;
    # the whitespace normalisation below tidies up any double spaces.
    if "INSUFFICIENT_DATA" in text and len(text.split()) > 3:
        text = text.replace("INSUFFICIENT_DATA", " ")

    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
    return ' '.join(text.split())
268
+
269
+ # ---------------------------
270
+ # 8️⃣ Main query function
271
+ # ---------------------------
272
def ask_sastra(query, lang="en"):
    """Main function to process queries and generate responses.

    Pipeline: translate the query to English if needed, try a keyword match,
    otherwise fall back to the RAG chain, format links, translate the answer
    back to the user's language, and log the exchange.

    Args:
        query: The user's question in language ``lang``.
        lang: Language code of the query and the desired answer.

    Returns:
        An HTML string with the answer (or a fallback/contact message).
    """
    if not query or not query.strip():
        # Nothing to answer; avoid translating/embedding an empty string.
        return "Please enter a question."

    original_query = query

    # Translate to English if needed
    if lang != "en":
        try:
            translated_query = GoogleTranslator(source=lang, target="en").translate(query)
            # The translator can return None/empty; keep the original text
            # then, otherwise keyword matching would crash on None.
            query = translated_query or original_query
        except Exception as e:
            print(f"Translation error: {e}")
            query = original_query

    # First, check exact keyword match
    keyword_match = match_keyword(query)
    if keyword_match:
        answer = keyword_match
        response_type = "keyword_match"
    else:
        # Fallback to RAG
        if qa_chain is None:
            # Model initialisation failed or has not run yet.
            print("RAG Error: model is not initialized")
            rag_answer = "INSUFFICIENT_DATA"
        else:
            try:
                rag_answer = qa_chain.run(query).strip()
                # Clean the output
                rag_answer = clean_llm_output(rag_answer)
            except Exception as e:
                print(f"RAG Error: {e}")
                rag_answer = "INSUFFICIENT_DATA"

        # Check if answer is valid
        if (rag_answer == "INSUFFICIENT_DATA" or
                not rag_answer or
                len(rag_answer) < 10 or
                "i don't know" in rag_answer.lower()):
            answer = "I'm sorry, I don't have information related to this question. Please contact the SASTRA Admissions Office for assistance at <a href='mailto:admissions@sastra.edu'>admissions@sastra.edu</a> or visit <a href='https://www.sastra.edu' target='_blank'>www.sastra.edu</a>"
            response_type = "insufficient_data"
        else:
            answer = rag_answer
            response_type = "rag_success"

    # Format response with clickable links
    answer = format_response(answer)

    # Translate back to original language (skip HTML tags)
    if lang != "en" and response_type != "insufficient_data":
        try:
            # Extract text without HTML for translation
            text_only = re.sub(r'<[^>]+>', '', answer)
            translated = GoogleTranslator(source="en", target=lang).translate(text_only)
            if translated:
                # Keep original HTML links by appending them after the text.
                links = re.findall(r'<a[^>]+>.*?</a>', answer)
                answer = " ".join([translated, *links])
        except Exception as e:
            # Keep the English answer when back-translation fails.
            print(f"Translation error: {e}")

    log_query(original_query, answer, language=lang, response_type=response_type)
    return answer
330
+
331
+ # ---------------------------
332
+ # 9️⃣ Analytics Functions
333
+ # ---------------------------
334
def get_analytics():
    """Retrieve analytics data from logs.

    Reads the JSON log file (``LOG_FILE``) and aggregates it.

    Returns:
        A dict with keys ``total_queries`` (int), ``top_questions`` (list of
        ``(question, count)`` for the 10 most common questions),
        ``language_distribution`` and ``response_types`` (dicts of counts) and
        ``recent_queries`` (up to 20 log entries, newest first). All values
        are empty/zero when the log is missing or unreadable.
    """
    empty = {
        "total_queries": 0,
        "top_questions": [],
        "language_distribution": {},
        "response_types": {},
        "recent_queries": []
    }

    if not os.path.exists(LOG_FILE):
        return empty

    try:
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            logs = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both invalid JSON and undecodable bytes.
        print(f"Analytics error: {e}")
        return empty

    if not isinstance(logs, list):
        return empty
    # Ignore anything that is not a log record so one bad entry cannot break
    # the whole dashboard.
    logs = [entry for entry in logs if isinstance(entry, dict)]

    # Most frequently asked questions (tolerate entries without a query).
    question_counts = Counter(log.get("query", "N/A") for log in logs)

    # Language distribution
    language_dist = dict(Counter(log.get("language", "en") for log in logs))

    # Response type distribution
    response_type_dist = dict(Counter(log.get("response_type", "unknown") for log in logs))

    return {
        "total_queries": len(logs),
        "top_questions": question_counts.most_common(10),
        "language_distribution": language_dist,
        "response_types": response_type_dist,
        # Recent queries (last 20), newest first
        "recent_queries": logs[-20:][::-1]
    }
382
+
383
def display_analytics():
    """Display analytics in formatted text.

    Renders the dict returned by ``get_analytics()`` as a Markdown report with
    totals, top questions, language and response-type distributions, and the
    most recent queries.

    Returns:
        The Markdown report as a single string.
    """
    analytics = get_analytics()

    parts = [
        "## πŸ“Š Analytics Dashboard\n\n",
        f"**Total Queries:** {analytics['total_queries']}\n\n",
        "### πŸ”₯ Top 10 Most Frequently Asked Questions:\n",
    ]
    if analytics['top_questions']:
        parts.extend(
            f"{i}. {q} - ({count} times)\n"
            for i, (q, count) in enumerate(analytics['top_questions'], 1)
        )
    else:
        parts.append("No queries yet.\n")

    parts.append("\n### 🌍 Language Distribution:\n")
    if analytics['language_distribution']:
        parts.extend(
            f"- {lang}: {count} queries\n"
            for lang, count in analytics['language_distribution'].items()
        )
    else:
        parts.append("No data yet.\n")

    parts.append("\n### βœ… Response Type Distribution:\n")
    if analytics['response_types']:
        parts.extend(
            f"- {resp_type}: {count}\n"
            for resp_type, count in analytics['response_types'].items()
        )
    else:
        parts.append("No data yet.\n")

    parts.append("\n### πŸ•’ Recent Queries (Last 20):\n")
    if analytics['recent_queries']:
        # Show every entry get_analytics() returned (it already caps the list
        # at 20); the previous [:10] slice contradicted the heading.
        parts.extend(
            f"{i}. [{query.get('timestamp', 'N/A')}] {query.get('query', 'N/A')} ({query.get('language', 'N/A')})\n"
            for i, query in enumerate(analytics['recent_queries'], 1)
        )
    else:
        parts.append("No queries yet.\n")

    return "".join(parts)
419
+
420
def download_logs():
    """Give the log file's path for download, or None when no log exists yet."""
    return LOG_FILE if os.path.exists(LOG_FILE) else None
425
+
426
+ # ---------------------------
427
+ # 🔟 Admin Functions - Upload & Retrain
428
+ # ---------------------------
429
def retrain_model(file, password):
    """Retrain model with new Excel data.

    Verifies the admin password, stores the uploaded workbook as
    ``uploaded_training_data.xlsx`` and rebuilds the model from it.

    Args:
        file: The upload from the Gradio ``File`` component: a path string, an
            object exposing the temp file path as ``.name``, a readable file
            object, or raw bytes.
        password: The password typed into the admin form.

    Returns:
        A status message describing success or the reason for failure; errors
        are reported in the message rather than raised.
    """
    import hmac
    import shutil

    # Constant-time comparison so response timing does not leak how much of
    # the password was correct.
    supplied = str(password or "").encode("utf-8")
    if not hmac.compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8")):
        return "❌ Invalid password. Access denied."

    if file is None:
        return "❌ Please upload an Excel file."

    try:
        # Save uploaded file - handle both file path and file object
        new_excel_path = "uploaded_training_data.xlsx"

        # Gradio hands over either a path or a temp-file wrapper whose .name
        # is the path; copying from disk avoids depending on the wrapper's
        # read position or open state.
        source_path = file if isinstance(file, (str, os.PathLike)) else getattr(file, "name", None)

        if isinstance(source_path, (str, os.PathLike)) and os.path.isfile(source_path):
            shutil.copy(source_path, new_excel_path)
        else:
            # Resolve the payload before opening the destination so a bad
            # upload does not truncate a previously saved workbook.
            content = file.read() if hasattr(file, "read") else file
            if isinstance(content, str):
                content = content.encode()
            with open(new_excel_path, "wb") as f:
                f.write(content)

        # Reinitialize model with new data
        result = initialize_model(new_excel_path)
        return f"βœ… Model retrained successfully with new data!\n{result}"
    except Exception as e:
        return f"❌ Error during retraining: {str(e)}"
462
+
463
+ # ---------------------------
464
+ # 1️⃣1️⃣ Gradio Interfaces
465
+ # ---------------------------
466
# Display name -> language code accepted by the translator.
langs = {"English": "en", "Tamil": "ta", "Telugu": "te", "Kannada": "kn", "Hindi": "hi"}


def gradio_chatbot(query, language):
    """Gradio interface for chatbot.

    Args:
        query: The question typed by the user.
        language: The display name selected in the language dropdown.

    Returns:
        The HTML answer produced by ``ask_sastra``.
    """
    # A cleared dropdown yields None (or an unknown label); default to English
    # instead of raising KeyError in the UI.
    return ask_sastra(query, lang=langs.get(language, "en"))
471
+
472
+ # Chatbot Interface
473
# Chatbot tab: free-text question plus a language selector, HTML answer.
question_box = gr.Textbox(label="Ask your question", placeholder="Type your question here...")
language_picker = gr.Dropdown(list(langs), label="Language", value="English")
chatbot_interface = gr.Interface(
    fn=gradio_chatbot,
    inputs=[question_box, language_picker],
    outputs=gr.HTML(label="Response"),
    title="πŸŽ“ AskSASTRA - AI Multilingual Chatbot",
    description="Ask any question about SASTRA University and get instant answers in your preferred language.",
    theme="soft",
)

# Admin tab: password-protected upload that triggers a model rebuild.
training_upload = gr.File(label="Upload Training Data (Excel)", file_types=[".xlsx"])
password_box = gr.Textbox(label="Admin Password", type="password")
admin_interface = gr.Interface(
    fn=retrain_model,
    inputs=[training_upload, password_box],
    outputs=gr.Textbox(label="Status"),
    title="πŸ” Admin Dashboard - Model Retraining",
    description="Upload new training data to retrain the chatbot model.",
)

# Analytics tab: no inputs, renders the Markdown usage report.
analytics_interface = gr.Interface(
    fn=lambda: display_analytics(),
    inputs=[],
    outputs=gr.Markdown(label="Analytics Report"),
    title="πŸ“Š Analytics Dashboard",
    description="View chatbot usage statistics and insights.",
)

# Logs tab: no inputs, offers the raw query log as a file download.
logs_interface = gr.Interface(
    fn=download_logs,
    inputs=[],
    outputs=gr.File(label="Download Query Logs"),
    title="πŸ“₯ Download Logs",
    description="Download complete query logs for analysis.",
)

# ---------------------------
# 1️⃣2️⃣ Launch Combined Interface
# ---------------------------
tab_interfaces = [chatbot_interface, admin_interface, analytics_interface, logs_interface]
tab_titles = ["πŸ’¬ Chatbot", "πŸ” Admin Panel", "πŸ“Š Analytics", "πŸ“₯ Download Logs"]
demo = gr.TabbedInterface(
    tab_interfaces,
    tab_titles,
    title="AskSASTRA - Complete Management System",
)

# Listen on all network interfaces at port 7860.
demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ langchain
3
+ langchain-community
4
+ chromadb
5
+ sentence-transformers
6
+ transformers
7
+ deep-translator
8
+ openpyxl
9
+ pandas
10
+ torch
11
+ accelerate
12
+ protobuf==4.23.3
13
+
training_data.xlsx ADDED
Binary file (25.8 kB). View file