from flask import Flask, request, jsonify
from flask_cors import CORS
import re

app = Flask(__name__)
CORS(app)


def extract_urls(text):
    url_pattern = r'https?://[^\s]+'
    return re.findall(url_pattern, text)


@app.route("/", methods=["GET"])
def home():
    return jsonify({"message": "API is running 🚀"})


@app.route("/process-text", methods=["POST"])
def process_text():
    data = request.get_json()

    if not data or "input_text" not in data:
        return jsonify({"error": "Please provide 'input_text'"}), 400

    user_text = data["input_text"]
    found_urls = extract_urls(user_text)

    return jsonify({
        "original_text": user_text,
        "message": f"Received your text! It is {len(user_text)} characters long.",
        "urls_found": found_urls,
        "status": "success"
    })

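# Example request against /process-text (assuming Flask's default port 5000):
#   curl -X POST http://localhost:5000/process-text \
#        -H "Content-Type: application/json" \
#        -d '{"input_text": "Apply via https://www.sastra.edu today"}'
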
import os
from typing import List

import pandas as pd
from deep_translator import GoogleTranslator

from langchain_core.documents import Document
from langchain_community.document_loaders import (
    WebBaseLoader, PyPDFLoader, Docx2txtLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from transformers import pipeline

from huggingface_hub import snapshot_download

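# Download the knowledge-base files (PDFs, DOCX, XLSX) from the
# Hugging Face dataset repo into a local folder.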
DATASET_REPO = "bshk57/Sastra_data"
LOCAL_DIR = "knowledge_base"

os.makedirs(LOCAL_DIR, exist_ok=True)

local_path = snapshot_download(
    repo_id=DATASET_REPO,
    repo_type="dataset",
    local_dir=LOCAL_DIR,
    local_dir_use_symlinks=False,
    ignore_patterns=[".gitattributes"]
)

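# Key SASTRA pages scraped into the RAG knowledge base.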
SASTRA_URLS = [
    "https://www.sastra.edu/about-us.html",
    "https://www.sastra.edu/academics/schools.html#school-of-computing",
    "https://www.sastra.edu/admissions/ug-pg.html",
    "https://www.sastra.edu/admissions/eligibility-criteria.html",
    "https://www.sastra.edu/admissions/fee-structure.html",
    "https://www.sastra.edu/admissions/hostel-fees.html",
    "https://www.sastra.edu/infrastructure/physical-facilities.html",
    "https://www.sastra.edu/about-us/mission-vision.html",
]

KEYWORD_EXCEL = "Chat Bot- Keywords and Responses0511.xlsx"
UPLOAD_DIR = "knowledge_base"
VECTOR_DB_PATH = "sastra_vector_db"

EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
LLM_MODEL = "google/flan-t5-base"

os.makedirs(UPLOAD_DIR, exist_ok=True)

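# Globals populated once by initialize_model().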
vectordb = None
retriever = None
qa_chain = None
keyword_responses: List[tuple] = []

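# Read (keyword, response) pairs from the Excel sheet; comma-separated
# keywords in a single cell all map to the same response.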
def load_keyword_responses(path):
    pairs = []
    if not os.path.exists(path):
        return pairs

    df = pd.read_excel(path)
    for _, row in df.iterrows():
        if pd.notna(row.get("Keywords")) and pd.notna(row.get("Response")):
            for kw in str(row["Keywords"]).split(","):
                pairs.append((kw.strip().lower(), str(row["Response"])))
    return pairs

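# Load every PDF, DOCX, and XLSX file from the downloaded knowledge base.
# Spreadsheet rows are flattened into "col: value | col: value" text.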
def load_local_documents():
    docs = []
    for file in os.listdir(UPLOAD_DIR):
        path = os.path.join(UPLOAD_DIR, file)
        try:
            if file.lower().endswith(".pdf"):
                docs.extend(PyPDFLoader(path).load())

            elif file.lower().endswith(".docx"):
                docs.extend(Docx2txtLoader(path).load())

            elif file.lower().endswith(".xlsx"):
                df = pd.read_excel(path)
                for _, row in df.iterrows():
                    text = " | ".join(
                        f"{col}: {row[col]}"
                        for col in df.columns
                        if pd.notna(row[col])
                    )
                    docs.append(
                        Document(page_content=text, metadata={"source": file})
                    )
        except Exception as e:
            print(f"⚠ Error loading {file}: {e}")

    return docs

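# Build the full RAG stack: scrape the SASTRA pages, add local documents
# and keyword pairs, embed everything into Chroma, and wire up the
# flan-t5 RetrievalQA chain.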
def initialize_model():
    global vectordb, retriever, qa_chain, keyword_responses

    docs = []

    # Scrape the configured SASTRA pages; skip any URL that fails.
    for url in SASTRA_URLS:
        try:
            docs.extend(WebBaseLoader(url).load())
        except Exception as e:
            print(f"⚠ Error loading {url}: {e}")

    # Local files from the downloaded dataset.
    docs.extend(load_local_documents())

    # Index the keyword/response pairs too, so the retriever can find them.
    keyword_responses = load_keyword_responses(KEYWORD_EXCEL)
    for k, v in keyword_responses:
        docs.append(
            Document(
                page_content=f"{k}: {v}",
                metadata={"source": "keywords"}
            )
        )

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=50
    )
    chunks = splitter.split_documents(docs)

    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL
    )

    vectordb = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=VECTOR_DB_PATH
    )

    retriever = vectordb.as_retriever(search_kwargs={"k": 4})

    generator = pipeline(
        "text2text-generation",
        model=LLM_MODEL,
        tokenizer=LLM_MODEL,
        max_new_tokens=200,
        temperature=0.1,
        top_p=0.85,
        do_sample=True,
        repetition_penalty=1.2
    )

    llm = HuggingFacePipeline(pipeline=generator)

    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are AskSASTRA, the official SASTRA University admissions assistant.
Answer ONLY from the context.
If not found, say INSUFFICIENT_DATA.

Context:
{context}

Question:
{question}

Answer:
"""
    )

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=False
    )

    print("✅ AskSASTRA model initialized")

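# Build the index once at import time so no request ever hits an empty chain.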
initialize_model()

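# Strip "Answer:"/"Response:" prefixes and collapse whitespace; an
# INSUFFICIENT_DATA reply becomes "" so /chat falls back to the contact message.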
def clean_llm_output(text: str) -> str:
    text = re.sub(r'^(Answer:|Response:)', '', text, flags=re.I).strip()
    if text.lower().startswith("insufficient_data"):
        return ""
    return re.sub(r'\s+', ' ', text)[:600]

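# Substring keyword lookup; /chat runs this before the LLM so curated
# answers take priority.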
def match_keyword(query: str):
    for k, v in keyword_responses:
        if k in query.lower():
            return v
    return None

@app.route("/chat", methods=["POST"])
def chat():
    data = request.get_json(force=True)

    query = data.get("query", "").strip()
    lang = data.get("language", "en")

    if not query:
        return jsonify({"answer": "Please ask a valid question."})

    # Translate non-English queries to English for retrieval
    # (answers are returned in English).
    if lang != "en":
        try:
            query_en = GoogleTranslator(
                source=lang, target="en"
            ).translate(query)
        except Exception:
            query_en = query
    else:
        query_en = query

    # Curated keyword responses take priority over the LLM.
    keyword_answer = match_keyword(query_en)
    if keyword_answer:
        return jsonify({"answer": keyword_answer})

    # Otherwise, answer via the RetrievalQA chain.
    try:
        result = qa_chain.invoke({"query": query_en})
        raw = result.get("result", "")
        answer = clean_llm_output(raw)
    except Exception:
        answer = ""

    if answer and len(answer.split()) >= 5:
        return jsonify({"answer": answer})

    # Fallback when retrieval produced nothing usable.
    return jsonify({
        "answer": (
            "I couldn't find confident information related to this question. "
            "Please contact the SASTRA Admissions Office at "
            "admissions@sastra.edu or visit www.sastra.edu."
        )
    })
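
# Example request (same port assumption as above):
#   curl -X POST http://localhost:5000/chat \
#        -H "Content-Type: application/json" \
#        -d '{"query": "What is the hostel fee?", "language": "en"}'

# Assumed local entry point: not needed when a WSGI server such as
# gunicorn imports `app` directly, as on a typical deployment.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=5000)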