"""AskSASTRA: Flask API for a SASTRA University admissions chatbot.

Combines three answer sources, tried in order by the /chat endpoint:
  1. an Excel sheet of keyword -> canned-response pairs,
  2. a RAG pipeline (Chroma vector store + multilingual MiniLM embeddings,
     FLAN-T5 generation) over scraped SASTRA web pages and locally
     downloaded documents (PDF / DOCX / XLSX),
  3. a static fallback message pointing at the admissions office.

Also exposes a small /process-text demo endpoint that extracts URLs.

NOTE: this module performs heavy work at import time (dataset download,
web scraping, embedding + vector-store build, model load).
"""

import os
import re
from typing import List

import pandas as pd
from deep_translator import GoogleTranslator
from flask import Flask, jsonify, request
from flask_cors import CORS
from huggingface_hub import snapshot_download
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    Docx2txtLoader,
    PyPDFLoader,
    WebBaseLoader,
)
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from transformers import pipeline

app = Flask(__name__)
CORS(app)  # allow cross-origin calls from the front-end


def extract_urls(text):
    """Return every http/https URL found in *text* (greedy to whitespace)."""
    url_pattern = r'https?://[^\s]+'
    return re.findall(url_pattern, text)


@app.route("/", methods=["GET"])
def home():
    """Health-check endpoint."""
    return jsonify({"message": "API is running 🚀"})


@app.route("/process-text", methods=["POST"])
def process_text():
    """Echo the submitted text with its length and any URLs it contains.

    Expects JSON body {"input_text": "..."}; returns 400 when missing.
    """
    data = request.get_json()
    if not data or "input_text" not in data:
        return jsonify({"error": "Please provide 'input_text'"}), 400
    user_text = data["input_text"]
    found_urls = extract_urls(user_text)
    return jsonify({
        "original_text": user_text,
        "message": f"Received your text! It is {len(user_text)} characters long.",
        "urls_found": found_urls,
        "status": "success"
    })


# ============================================================
# Knowledge-base download (runs at import time)
# ============================================================
DATASET_REPO = "bshk57/Sastra_data"
LOCAL_DIR = "knowledge_base"

os.makedirs(LOCAL_DIR, exist_ok=True)
# Pull the curated document set from the Hugging Face dataset repo into
# LOCAL_DIR so load_local_documents() can index it below.
local_path = snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=LOCAL_DIR,
    local_dir_use_symlinks=False,  # deprecated in newer hf_hub but harmless
    ignore_patterns=[".gitattributes"],
)

# ============================================================
# 1️⃣ CONFIGURATION
# ============================================================
SASTRA_URLS = [
    "https://www.sastra.edu/about-us.html",
    "https://www.sastra.edu/academics/schools.html#school-of-computing",
    "https://www.sastra.edu/admissions/ug-pg.html",
    "https://www.sastra.edu/admissions/eligibility-criteria.html",
    "https://www.sastra.edu/admissions/fee-structure.html",
    "https://www.sastra.edu/admissions/hostel-fees.html",
    "https://www.sastra.edu/infrastructure/physical-facilities.html",
    "https://www.sastra.edu/about-us/mission-vision.html",
]

KEYWORD_EXCEL = "Chat Bot- Keywords and Responses0511.xlsx"
UPLOAD_DIR = "knowledge_base"
VECTOR_DB_PATH = "sastra_vector_db"
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
LLM_MODEL = "google/flan-t5-base"

os.makedirs(UPLOAD_DIR, exist_ok=True)

# ============================================================
# 2️⃣ GLOBAL OBJECTS (populated by initialize_model)
# ============================================================
vectordb = None
retriever = None
qa_chain = None
keyword_responses: List[tuple] = []  # (lowercased keyword, response) pairs


# ============================================================
# 3️⃣ LOAD KEYWORD RESPONSES
# ============================================================
def load_keyword_responses(path):
    """Read (keyword, response) pairs from the Excel sheet at *path*.

    Expects "Keywords" (comma-separated) and "Response" columns; keywords
    are lowercased and stripped.  Returns [] when the file is absent.
    """
    pairs = []
    if not os.path.exists(path):
        return pairs
    df = pd.read_excel(path)
    for _, row in df.iterrows():
        if pd.notna(row.get("Keywords")) and pd.notna(row.get("Response")):
            for kw in str(row["Keywords"]).split(","):
                pairs.append((kw.strip().lower(), str(row["Response"])))
    return pairs


# ============================================================
# 4️⃣ LOAD LOCAL DOCUMENTS
# ============================================================
def load_local_documents():
    """Load every PDF / DOCX / XLSX file in UPLOAD_DIR as Documents.

    XLSX rows are flattened to "col: value | col: value" strings.
    Files that fail to load are skipped with a warning (best-effort).
    """
    docs = []
    for file in os.listdir(UPLOAD_DIR):
        path = os.path.join(UPLOAD_DIR, file)
        try:
            if file.lower().endswith(".pdf"):
                docs.extend(PyPDFLoader(path).load())
            elif file.lower().endswith(".docx"):
                docs.extend(Docx2txtLoader(path).load())
            elif file.lower().endswith(".xlsx"):
                df = pd.read_excel(path)
                for _, row in df.iterrows():
                    text = " | ".join(
                        f"{col}: {row[col]}"
                        for col in df.columns
                        if pd.notna(row[col])
                    )
                    docs.append(
                        Document(page_content=text, metadata={"source": file})
                    )
        except Exception as e:
            # Best-effort: a single unreadable file must not abort indexing.
            print(f"⚠ Error loading {file}: {e}")
    return docs


# ============================================================
# 5️⃣ INITIALIZE RAG MODEL
# ============================================================
def initialize_model():
    """Build the vector store and QA chain; populate module globals.

    Sources: SASTRA web pages, local documents, and keyword responses
    (the latter are also indexed as documents so retrieval can find them).
    """
    global vectordb, retriever, qa_chain, keyword_responses

    docs = []

    # Load website data — best-effort; an unreachable page is skipped.
    for url in SASTRA_URLS:
        try:
            docs.extend(WebBaseLoader(url).load())
        except Exception as e:
            print(f"⚠ Skipping URL {url}: {e}")

    # Load local docs
    docs.extend(load_local_documents())

    # Load keyword responses and mirror them into the corpus.
    keyword_responses = load_keyword_responses(KEYWORD_EXCEL)
    for k, v in keyword_responses:
        docs.append(
            Document(
                page_content=f"{k}: {v}",
                metadata={"source": "keywords"},
            )
        )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=50,
    )
    chunks = splitter.split_documents(docs)

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    vectordb = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=VECTOR_DB_PATH,
    )
    retriever = vectordb.as_retriever(search_kwargs={"k": 4})

    generator = pipeline(
        "text2text-generation",
        model=LLM_MODEL,
        tokenizer=LLM_MODEL,
        max_new_tokens=200,
        temperature=0.1,
        top_p=0.85,
        do_sample=True,
        repetition_penalty=1.2,
    )
    llm = HuggingFacePipeline(pipeline=generator)

    # INSUFFICIENT_DATA is a sentinel that clean_llm_output() maps to "".
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are AskSASTRA, the official SASTRA University admissions assistant.
Answer ONLY from the context. If not found, say INSUFFICIENT_DATA.
Context: {context}
Question: {question}
Answer: """,
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=False,
    )
    print("✅ AskSASTRA model initialized")


# Initialize on startup (import-time side effect, kept intentionally).
initialize_model()


# ============================================================
# 6️⃣ CHAT UTILITIES
# ============================================================
def clean_llm_output(text: str) -> str:
    """Normalize raw LLM output: strip "Answer:"/"Response:" prefixes,
    collapse whitespace, cap at 600 chars; return "" for the
    INSUFFICIENT_DATA sentinel so the caller falls through.
    """
    text = re.sub(r'^(Answer:|Response:)', '', text, flags=re.I).strip()
    if text.lower().startswith("insufficient_data"):
        return ""
    return re.sub(r'\s+', ' ', text)[:600]


def match_keyword(query: str):
    """Return the first canned response whose keyword occurs in *query*
    (case-insensitive substring match), or None.
    """
    q = query.lower()  # lowercase once, not per keyword
    for k, v in keyword_responses:
        if k in q:
            return v
    return None


# ============================================================
# 7️⃣ CHATBOT API ENDPOINT
# ============================================================
@app.route("/chat", methods=["POST"])
def chat():
    """Answer a user question: keyword match, then RAG, then fallback.

    Body: {"query": "...", "language": "en"}.  Non-English queries are
    translated to English before matching.
    NOTE(review): answers are returned in English even when the user asked
    in another language — no back-translation is performed; confirm this
    is intended.
    """
    data = request.get_json(force=True)
    query = data.get("query", "").strip()
    lang = data.get("language", "en")

    if not query:
        return jsonify({"answer": "Please ask a valid question."})

    # Translate to English (best-effort; fall back to the raw query).
    if lang != "en":
        try:
            query_en = GoogleTranslator(
                source=lang, target="en"
            ).translate(query)
        except Exception as e:
            print(f"⚠ Translation failed, using original query: {e}")
            query_en = query
    else:
        query_en = query

    # 1️⃣ Keyword match
    keyword_answer = match_keyword(query_en)
    if keyword_answer:
        return jsonify({"answer": keyword_answer})

    # 2️⃣ RAG inference
    try:
        result = qa_chain.invoke({"query": query_en})
        raw = result.get("result", "")
        answer = clean_llm_output(raw)
    except Exception:
        answer = ""

    # Require a minimally substantive answer before trusting the LLM.
    if answer and len(answer.split()) >= 5:
        return jsonify({"answer": answer})

    # 3️⃣ Fallback
    return jsonify({
        "answer": (
            "I couldn't find confident information related to this question. "
            "Please contact the SASTRA Admissions Office at "
            "admissions@sastra.edu or visit www.sastra.edu."
        )
    })