Spaces:

bshk57
/

ASKSASTRA

Sleeping

App Files Files Community

bshk57 committed on Jan 5

Commit

7200c88

verified ·

1 Parent(s): a4b2d10

Upload 3 files

Browse files

Files changed (3) hide show

app.py +524 -0
requirements.txt +13 -0
training_data.xlsx +0 -0

app.py ADDED Viewed

	@@ -0,0 +1,524 @@

+# Install required packages
+#!pip install langchain langchain-community chromadb sentence-transformers transformers gradio deep-translator openpyxl --quiet
+#!pip install --upgrade protobuf==4.23.3
+import os, json
+from datetime import datetime
+import pandas as pd
+from collections import Counter
+from langchain_core.documents import Document
+from langchain_community.document_loaders import WebBaseLoader
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain_community.vectorstores import Chroma
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+from transformers import pipeline
+from langchain.llms import HuggingFacePipeline
+from deep_translator import GoogleTranslator
+import gradio as gr
+import re
+# ---------------------------
+# 1️⃣ Configuration
+# ---------------------------
+# Pages fetched and indexed as the knowledge base for retrieval.
+SASTRA_URLS = [
+    "https://www.sastra.edu/about-us.html",
+    "https://www.sastra.edu/academics/schools.html#school-of-computing",
+    "https://www.sastra.edu/admissions/ug-pg.html",
+    "https://www.sastra.edu/admissions/eligibility-criteria.html",
+    "https://www.sastra.edu/admissions/fee-structure.html",
+    "https://www.sastra.edu/admissions/hostel-fees.html",
+    "https://www.sastra.edu/infrastructure/physical-facilities.html",
+    "https://www.sastra.edu/about-us/mission-vision.html",
+]
+EXCEL_FILE = "training_data.xlsx"
+VECTOR_DB_PATH = "sastra_local_db"
+LOG_FILE = "query_logs.json"
+ANALYTICS_FILE = "analytics_data.json"
+EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
+# Prefer a secret supplied through the environment so the real password does
+# not have to live in the repository; the literal is only a fallback and
+# should be overridden in any public deployment. `or` (rather than a .get()
+# default) also rejects an empty environment value.
+ADMIN_PASSWORD = os.environ.get("ADMIN_PASSWORD") or "sastra_admin_2024"
+# Global state, (re)assigned by initialize_model() on startup and on retrain
+vectordb = None
+retriever = None
+qa_chain = None
+keyword_responses = []
+# ---------------------------
+# 2️⃣ Load keyword-response data from Excel
+# ---------------------------
+def load_keyword_responses(file_path):
+    """Load keyword-response pairs from an Excel file.
+
+    The sheet is expected to have a ``Keywords`` column (comma-separated
+    keywords) and a ``Response`` column. Every keyword is lower-cased,
+    stripped and paired with the response of its row.
+
+    Args:
+        file_path: Path of the Excel workbook to read.
+
+    Returns:
+        A list of ``(keyword, response)`` tuples. Any error while reading or
+        parsing the workbook is printed and an empty list is returned.
+    """
+    try:
+        df = pd.read_excel(file_path)
+        pairs = []
+        for _, row in df.iterrows():
+            keywords_cell = row['Keywords']
+            response_cell = row['Response']
+            if pd.isna(keywords_cell):
+                continue
+            response = str(response_cell) if pd.notna(response_cell) else ""
+            for raw_kw in str(keywords_cell).lower().split(','):
+                kw = raw_kw.strip()
+                # "a,,b" or a trailing comma yields "" here. An empty keyword
+                # is a substring of every query, so keeping it would make the
+                # keyword matcher answer everything with this row's response.
+                if kw:
+                    pairs.append((kw, response))
+        return pairs
+    except Exception as e:
+        print(f"Error loading keyword responses: {e}")
+        return []
+# ---------------------------
+# 3️⃣ Initialize model and vectorstore
+# ---------------------------
+def initialize_model(excel_path=EXCEL_FILE):
+    """Build (or rebuild) the keyword table, vector store and RAG chain.
+
+    Loads keyword/response pairs from ``excel_path``, fetches the pages in
+    SASTRA_URLS, indexes both in a Chroma store and wires a flan-t5 pipeline
+    into a RetrievalQA chain. Results are published through the module
+    globals ``keyword_responses``, ``vectordb``, ``retriever`` and
+    ``qa_chain``.
+
+    Args:
+        excel_path: Path of the Excel workbook holding the training data.
+
+    Returns:
+        A status string on success.
+
+    Raises:
+        ValueError: If neither the web pages nor the workbook produced any
+            content to index.
+        Exception: Errors from the embedding model, Chroma or the LLM
+            pipeline propagate to the caller.
+    """
+    global vectordb, retriever, qa_chain, keyword_responses
+    print("🔄 Initializing model...")
+    # Load keyword responses
+    keyword_responses = load_keyword_responses(excel_path)
+    print(f"✅ Loaded {len(keyword_responses)} keyword-response pairs")
+    # Load documents from URLs; one unreachable page must not abort the rest
+    docs = []
+    for url in SASTRA_URLS:
+        try:
+            loader = WebBaseLoader(url)
+            docs.extend(loader.load())
+            print(f"✅ Loaded: {url}")
+        except Exception as e:
+            print(f"⚠ Error loading {url}: {e}")
+    # Add Excel data as additional documents
+    for kw, resp in keyword_responses:
+        if kw and resp:
+            excel_doc = Document(
+                page_content=f"Keyword: {kw}\nResponse: {resp}",
+                metadata={"source": "training_data"}
+            )
+            docs.append(excel_doc)
+    print(f"📄 Total documents loaded: {len(docs)}")
+    # Split documents
+    splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
+    chunks = splitter.split_documents(docs)
+    # Remove duplicate chunks (shared page headers/footers repeat across URLs)
+    seen_content = set()
+    unique_chunks = []
+    for chunk in chunks:
+        content = chunk.page_content.strip()
+        if content not in seen_content:
+            seen_content.add(content)
+            unique_chunks.append(chunk)
+    chunks = unique_chunks
+    print(f"📊 Created {len(chunks)} unique chunks")
+    if not chunks:
+        # Fail with a clear message before touching the existing store
+        raise ValueError("No documents available to index")
+    # Create embeddings and vector store
+    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+    # Chroma.from_documents adds to whatever collection is already stored in
+    # persist_directory. Without clearing it first, every restart or retrain
+    # would pile duplicate and outdated chunks into the index.
+    if vectordb is not None:
+        try:
+            vectordb.delete_collection()
+        except Exception as e:
+            print(f"⚠ Could not clear previous vector store: {e}")
+    else:
+        # First build in this process: no store has been opened here yet, so
+        # the leftover directory from an earlier run can simply be removed.
+        import shutil
+        shutil.rmtree(VECTOR_DB_PATH, ignore_errors=True)
+    vectordb = Chroma.from_documents(chunks, embeddings, persist_directory=VECTOR_DB_PATH)
+    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
+    print("🔍 Vector store created")
+    # Initialize LLM with better parameters
+    MODEL_ID = "google/flan-t5-base"
+    generator = pipeline(
+        "text2text-generation",
+        model=MODEL_ID,
+        tokenizer=MODEL_ID,
+        max_new_tokens=200,
+        temperature=0.1,
+        top_p=0.85,
+        do_sample=True,
+        repetition_penalty=1.2
+    )
+    llm = HuggingFacePipeline(pipeline=generator)
+    print("🤖 LLM initialized")
+    # Create prompt template
+    prompt = PromptTemplate(
+        input_variables=["context", "question"],
+        template="""You are a SASTRA University information assistant. Use the context below to answer the question.
+Context:
+{context}
+Instructions:
+- Give a direct, concise answer based ONLY on the context provided
+- Do NOT start with "Answer:", "Response:", or any prefix
+- Include URLs and emails exactly as they appear in the context
+- Combine information from multiple contexts if they relate to the same topic
+- If context is insufficient, respond with only: "INSUFFICIENT_DATA"
+Question: {question}
+Direct Answer:"""
+    )
+    # Create RAG chain
+    qa_chain = RetrievalQA.from_chain_type(
+        llm=llm,
+        retriever=retriever,
+        chain_type="stuff",
+        chain_type_kwargs={"prompt": prompt},
+        return_source_documents=False
+    )
+    print("✅ Model initialization complete!")
+    return "Model initialized successfully!"
+# Build the model once when the module is loaded. A failure is reported but
+# not re-raised, so the rest of the module still gets defined.
+try:
+    initialize_model()
+except Exception as startup_error:
+    print(f"⚠ Initial model loading failed: {startup_error}")
+# ---------------------------
+# 4️⃣ Query logging with analytics
+# ---------------------------
+def log_query(query, answer, language="en", response_type="success"):
+    """Append one query/answer record to the JSON log file.
+
+    Logging is best-effort: any failure is printed and swallowed so that a
+    logging problem never breaks answering the user.
+
+    Args:
+        query: The question as typed by the user.
+        answer: The final answer returned to the user.
+        language: Language code the user selected.
+        response_type: Label describing how the answer was produced.
+    """
+    entry = {
+        "query": query,
+        "answer": answer,
+        "language": language,
+        "response_type": response_type,
+        "timestamp": datetime.now().isoformat()
+    }
+    try:
+        logs = []
+        if os.path.exists(LOG_FILE):
+            try:
+                with open(LOG_FILE, "r", encoding="utf-8") as f:
+                    logs = json.load(f)
+                if not isinstance(logs, list):
+                    raise ValueError("log file does not contain a JSON list")
+            except ValueError as e:
+                # A truncated or hand-edited file would otherwise make every
+                # later call fail and silently drop its entry. Set the bad
+                # file aside and start a fresh log instead.
+                backup_path = LOG_FILE + ".corrupt"
+                print(f"Logging error: unreadable log moved to {backup_path}: {e}")
+                os.replace(LOG_FILE, backup_path)
+                logs = []
+        logs.append(entry)
+        # Write to a temporary file and swap it in, so a crash mid-write
+        # cannot leave a half-written log behind.
+        tmp_path = LOG_FILE + ".tmp"
+        with open(tmp_path, "w", encoding="utf-8") as f:
+            json.dump(logs, f, ensure_ascii=False, indent=2)
+        os.replace(tmp_path, LOG_FILE)
+    except Exception as e:
+        print(f"Logging error: {e}")
+# ---------------------------
+# 5️⃣ Keyword matching function
+# ---------------------------
+def match_keyword(query):
+    """Return the predefined response for the first keyword found in *query*.
+
+    Matching is a case-insensitive substring test against the
+    ``(keyword, response)`` pairs in the global ``keyword_responses``, in
+    list order.
+
+    Args:
+        query: The (English) user question; may be empty or None.
+
+    Returns:
+        The matching response string, or None when nothing matches.
+    """
+    if not query:
+        return None
+    query_lower = query.lower()
+    for kw, resp in keyword_responses:
+        # Skip blank keywords ("" is a substring of every query) and blank
+        # responses (they would hide a later keyword that has a real answer).
+        if kw and resp and kw in query_lower:
+            return resp
+    return None
+# ---------------------------
+# 6️⃣ Format response with clickable links
+# ---------------------------
+def format_response(answer):
+    """Turn bare URLs and e-mail addresses in *answer* into HTML links.
+
+    Also strips the ``__...__`` markup residue found in some Excel responses.
+    URLs are left untouched when the text already contains an ``<a href=``
+    tag, and e-mail addresses when it already contains ``mailto:``.
+
+    Args:
+        answer: Answer text, possibly already containing HTML.
+
+    Returns:
+        The answer with clickable links.
+    """
+    # Clean up malformed HTML from Excel data
+    answer = re.sub(r'__.*?target="_blank">____', '', answer)
+    answer = re.sub(r"__.*?'>👉Click__", '', answer)
+    answer = re.sub(r'__+', '', answer)
+
+    def make_link(match):
+        raw = match.group(0)
+        # Sentence punctuation directly after a URL is not part of it:
+        # "see https://x.org." must link to https://x.org, not https://x.org.
+        url = raw.rstrip('.,;:!?')
+        # Same for a closing bracket that has no opening partner in the URL
+        if url.endswith(')') and url.count(')') > url.count('('):
+            url = url[:-1]
+        trailing = raw[len(url):]
+        return f'<a href="{url}" target="_blank">{url}</a>{trailing}'
+
+    # Make URLs clickable (avoid already linked URLs)
+    if '<a href=' not in answer:
+        answer = re.sub(r'https?://[^\s<>"\']+', make_link, answer)
+    # Make emails clickable (avoid already linked emails)
+    if 'mailto:' not in answer:
+        # [A-Za-z], not [A-Z|a-z]: inside a character class "|" is a literal
+        email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
+        answer = re.sub(email_pattern, r'<a href="mailto:\g<0>" target="_blank">\g<0></a>', answer)
+    return answer
+# ---------------------------
+# 7️⃣ Clean LLM output
+# ---------------------------
+def clean_llm_output(text):
+    """Normalise raw LLM output into a single clean line of text.
+
+    Strips one leading "Answer:" / "Response:" / "Direct Answer:" prefix,
+    drops a stray INSUFFICIENT_DATA marker when it is embedded in a longer
+    answer (more than three words), and collapses all whitespace runs,
+    newlines included, into single spaces.
+
+    Args:
+        text: Raw model output; may be empty or None.
+
+    Returns:
+        The cleaned text ("" for empty input).
+    """
+    if not text:
+        return ""
+    # Remove common prefixes
+    text = re.sub(r'^(Answer:|Response:|Direct Answer:)\s*', '', text.strip(), flags=re.IGNORECASE)
+    # Replace the marker with a space rather than nothing, otherwise the
+    # words on either side get glued together ("is INSUFFICIENT_DATA listed"
+    # -> "islisted"). The whitespace collapse below removes any surplus.
+    if "INSUFFICIENT_DATA" in text and len(text.split()) > 3:
+        text = text.replace("INSUFFICIENT_DATA", " ")
+    return ' '.join(text.split())
+# ---------------------------
+# 8️⃣ Main query function
+# ---------------------------
+def ask_sastra(query, lang="en"):
+    """Answer a user question, translating to and from *lang* when needed.
+
+    Pipeline: translate the query to English, try the keyword table, fall
+    back to the RAG chain, format links, translate the answer back and log
+    the exchange.
+
+    Args:
+        query: The user's question in language *lang*.
+        lang: Language code of the question and of the desired answer.
+
+    Returns:
+        The answer as an HTML string. A blank query gets a short prompt to
+        enter a question and is neither processed nor logged.
+    """
+    # Nothing to answer: skip translation, retrieval and logging entirely
+    if not query or not query.strip():
+        return "Please enter a question."
+    original_query = query
+    # Translate to English if needed
+    if lang != "en":
+        try:
+            translated_query = GoogleTranslator(source=lang, target="en").translate(query)
+            # Guard against an empty/None translation result
+            query = translated_query or original_query
+        except Exception as e:
+            print(f"Translation error: {e}")
+            query = original_query
+    # First, check exact keyword match
+    keyword_match = match_keyword(query)
+    if keyword_match:
+        answer = keyword_match
+        response_type = "keyword_match"
+    else:
+        # Fallback to RAG
+        try:
+            rag_answer = qa_chain.run(query).strip()
+            # Clean the output
+            rag_answer = clean_llm_output(rag_answer)
+        except Exception as e:
+            # Also covers qa_chain being None after a failed initialization
+            print(f"RAG Error: {e}")
+            rag_answer = "INSUFFICIENT_DATA"
+        # Check if answer is valid
+        if (rag_answer == "INSUFFICIENT_DATA" or
+            not rag_answer or
+            len(rag_answer) < 10 or
+            "i don't know" in rag_answer.lower()):
+            answer = "I'm sorry, I don't have information related to this question. Please contact the SASTRA Admissions Office for assistance at <a href='mailto:admissions@sastra.edu'>admissions@sastra.edu</a> or visit <a href='https://www.sastra.edu' target='_blank'>www.sastra.edu</a>"
+            response_type = "insufficient_data"
+        else:
+            answer = rag_answer
+            response_type = "rag_success"
+    # Format response with clickable links
+    answer = format_response(answer)
+    # Translate back to original language (skip HTML tags)
+    if lang != "en" and response_type != "insufficient_data":
+        try:
+            # Extract text without HTML for translation
+            text_only = re.sub(r'<[^>]+>', '', answer)
+            translated = GoogleTranslator(source="en", target=lang).translate(text_only)
+            if translated:
+                # Re-attach the original HTML links after the translated text
+                links = re.findall(r'<a[^>]+>.*?</a>', answer)
+                answer = " ".join([translated, *links])
+            else:
+                # Keep the English answer rather than failing
+                print("Translation error: empty translation result")
+        except Exception as e:
+            print(f"Translation error: {e}")
+    log_query(original_query, answer, language=lang, response_type=response_type)
+    return answer
+# ---------------------------
+# 9️⃣ Analytics Functions
+# ---------------------------
+def get_analytics():
+    """Compute usage statistics from the query log.
+
+    Returns:
+        A dict with the keys ``total_queries``, ``top_questions`` (up to ten
+        ``(query, count)`` tuples), ``language_distribution``,
+        ``response_types`` and ``recent_queries`` (the last twenty log
+        entries, newest first). All values are empty/zero when the log file
+        is missing, unreadable or not a JSON list.
+    """
+    empty = {
+        "total_queries": 0,
+        "top_questions": [],
+        "language_distribution": {},
+        "response_types": {},
+        "recent_queries": []
+    }
+    if not os.path.exists(LOG_FILE):
+        return empty
+    try:
+        with open(LOG_FILE, "r", encoding="utf-8") as f:
+            logs = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers both invalid JSON and undecodable bytes
+        print(f"Analytics error: {e}")
+        return empty
+    if not isinstance(logs, list):
+        return empty
+    # Ignore malformed entries instead of crashing the dashboard on them
+    logs = [entry for entry in logs if isinstance(entry, dict)]
+    # Most frequently asked questions
+    question_counts = Counter(entry["query"] for entry in logs if "query" in entry)
+    # Language distribution
+    language_dist = dict(Counter(entry.get("language", "en") for entry in logs))
+    # Response type distribution
+    response_type_dist = dict(Counter(entry.get("response_type", "unknown") for entry in logs))
+    return {
+        "total_queries": len(logs),
+        "top_questions": question_counts.most_common(10),
+        "language_distribution": language_dist,
+        "response_types": response_type_dist,
+        # Recent queries (last 20), newest first
+        "recent_queries": logs[-20:][::-1]
+    }
+def display_analytics():
+    """Render the analytics from get_analytics() as a Markdown report.
+
+    Returns:
+        A Markdown string with total count, top questions, language and
+        response-type distributions, and the most recent queries.
+    """
+    analytics = get_analytics()
+    parts = [
+        "## 📊 Analytics Dashboard\n\n",
+        f"**Total Queries:** {analytics['total_queries']}\n\n",
+        "### 🔥 Top 10 Most Frequently Asked Questions:\n",
+    ]
+    if analytics['top_questions']:
+        parts.extend(
+            f"{i}. {q} - ({count} times)\n"
+            for i, (q, count) in enumerate(analytics['top_questions'], 1)
+        )
+    else:
+        parts.append("No queries yet.\n")
+    parts.append("\n### 🌍 Language Distribution:\n")
+    if analytics['language_distribution']:
+        parts.extend(
+            f"- {lang}: {count} queries\n"
+            for lang, count in analytics['language_distribution'].items()
+        )
+    else:
+        parts.append("No data yet.\n")
+    parts.append("\n### ✅ Response Type Distribution:\n")
+    if analytics['response_types']:
+        parts.extend(
+            f"- {resp_type}: {count}\n"
+            for resp_type, count in analytics['response_types'].items()
+        )
+    else:
+        parts.append("No data yet.\n")
+    parts.append("\n### 🕒 Recent Queries (Last 20):\n")
+    if analytics['recent_queries']:
+        # Show every entry supplied (up to 20) so the list matches its
+        # heading; previously only the first 10 were rendered.
+        parts.extend(
+            f"{i}. [{entry.get('timestamp', 'N/A')}] {entry.get('query', 'N/A')} ({entry.get('language', 'N/A')})\n"
+            for i, entry in enumerate(analytics['recent_queries'], 1)
+        )
+    else:
+        parts.append("No queries yet.\n")
+    return "".join(parts)
+def download_logs():
+    """Give back the log file's path if the file exists, otherwise None."""
+    return LOG_FILE if os.path.exists(LOG_FILE) else None
+# ---------------------------
+# 🔟 Admin Functions - Upload & Retrain
+# ---------------------------
+def retrain_model(file, password):
+    """Replace the training workbook and rebuild the model.
+
+    Args:
+        file: The uploaded workbook: a path string, an object exposing the
+            path as ``.name``, a readable file object, or raw bytes.
+        password: Admin password typed by the user.
+
+    Returns:
+        A human-readable status message (success or failure); errors are
+        reported in the message rather than raised.
+    """
+    import hmac
+    import shutil
+    # Constant-time comparison so response timing does not leak how much of
+    # the password was correct. Encode to bytes because compare_digest
+    # rejects non-ASCII str arguments.
+    supplied = (password or "").encode("utf-8")
+    if not hmac.compare_digest(supplied, ADMIN_PASSWORD.encode("utf-8")):
+        return "❌ Invalid password. Access denied."
+    if file is None:
+        return "❌ Please upload an Excel file."
+    try:
+        new_excel_path = "uploaded_training_data.xlsx"
+        if isinstance(file, str):
+            shutil.copy(file, new_excel_path)
+        else:
+            # NOTE(review): some Gradio versions appear to pass a temp-file
+            # wrapper whose .name is the upload's path; confirm for the
+            # version deployed. Copying from disk does not depend on the
+            # handle's read position.
+            named_path = getattr(file, "name", None)
+            if isinstance(named_path, str) and os.path.isfile(named_path):
+                shutil.copy(named_path, new_excel_path)
+            else:
+                content = file.read() if hasattr(file, 'read') else file
+                if isinstance(content, str):
+                    content = content.encode()
+                # Open the destination only once the data is in hand, so a
+                # failed read cannot truncate a previously uploaded workbook.
+                with open(new_excel_path, "wb") as f:
+                    f.write(content)
+        # Reinitialize model with new data
+        result = initialize_model(new_excel_path)
+        return f"✅ Model retrained successfully with new data!\n{result}"
+    except Exception as e:
+        return f"❌ Error during retraining: {str(e)}"
+# ---------------------------
+# 1️⃣1️⃣ Gradio Interfaces
+# ---------------------------
+# Display name shown in the dropdown -> language code used for translation
+langs = {"English":"en", "Tamil":"ta", "Telugu":"te", "Kannada":"kn", "Hindi":"hi"}
+def gradio_chatbot(query, language):
+    """Gradio callback: answer *query* in the language chosen in the dropdown.
+
+    Falls back to English when the dropdown value is missing or unknown
+    instead of raising KeyError.
+    """
+    return ask_sastra(query, lang=langs.get(language, "en"))
+# Chatbot Interface
+# Public chat tab: question text + language dropdown in, HTML answer out.
+chatbot_interface = gr.Interface(
+    fn=gradio_chatbot,
+    inputs=[
+        gr.Textbox(label="Ask your question", placeholder="Type your question here..."),
+        gr.Dropdown(list(langs.keys()), label="Language", value="English")
+    ],
+    # HTML output so the <a> tags produced by format_response are rendered
+    outputs=gr.HTML(label="Response"),
+    title="🎓 AskSASTRA - AI Multilingual Chatbot",
+    description="Ask any question about SASTRA University and get instant answers in your preferred language.",
+    theme="soft"
+)
+# Admin Dashboard Interface
+# The password is checked inside retrain_model, not by Gradio itself.
+admin_interface = gr.Interface(
+    fn=retrain_model,
+    inputs=[
+        gr.File(label="Upload Training Data (Excel)", file_types=[".xlsx"]),
+        gr.Textbox(label="Admin Password", type="password")
+    ],
+    outputs=gr.Textbox(label="Status"),
+    title="🔐 Admin Dashboard - Model Retraining",
+    description="Upload new training data to retrain the chatbot model."
+)
+# Analytics Interface
+# NOTE(review): takes no inputs and performs no password check, so the
+# statistics are visible to every visitor. Confirm this is intended.
+analytics_interface = gr.Interface(
+    fn=lambda: display_analytics(),
+    inputs=[],
+    outputs=gr.Markdown(label="Analytics Report"),
+    title="📊 Analytics Dashboard",
+    description="View chatbot usage statistics and insights."
+)
+# Download Logs Interface
+# NOTE(review): also unauthenticated. The log holds every user query and
+# answer, so any visitor can download them. Confirm this is intended.
+logs_interface = gr.Interface(
+    fn=download_logs,
+    inputs=[],
+    outputs=gr.File(label="Download Query Logs"),
+    title="📥 Download Logs",
+    description="Download complete query logs for analysis."
+)
+# ---------------------------
+# 1️⃣2️⃣ Launch Combined Interface
+# ---------------------------
+# One tab per interface; the tab names list is positional and must stay in
+# the same order as the interfaces list.
+demo = gr.TabbedInterface(
+    [chatbot_interface, admin_interface, analytics_interface, logs_interface],
+    ["💬 Chatbot", "🔐 Admin Panel", "📊 Analytics", "📥 Download Logs"],
+    title="AskSASTRA - Complete Management System"
+)
+# Runs at import time (no __main__ guard): serve on all interfaces, port 7860.
+demo.launch(server_name="0.0.0.0", server_port=7860)

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+gradio
+langchain
+langchain-community
+chromadb
+sentence-transformers
+transformers
+deep-translator
+openpyxl
+pandas
+torch
+accelerate
+protobuf==4.23.3

training_data.xlsx ADDED Viewed

Binary file (25.8 kB). View file