# FlaskAsk / appold.py
# Renamed from app.py by bshk57 (commit 72ad9ee, verified) — Hugging Face Space.
from flask import Flask, request, jsonify
from flask_cors import CORS
import re
# Flask application instance shared by all route handlers below.
app = Flask(__name__)
# NOTE(review): CORS(app) with no arguments uses flask-cors defaults
# (presumably all origins allowed) — confirm this is intended for production.
CORS(app)
def extract_urls(text):
    """Return every http(s) URL found in *text*, in order of appearance."""
    return re.findall(r'https?://[^\s]+', text)
@app.route("/", methods=["GET"])
def home():
    """Health-check endpoint confirming the API is reachable."""
    status = {"message": "API is running 🚀"}
    return jsonify(status)
@app.route("/process-text", methods=["POST"])
def process_text():
    """Echo posted text back with its length and any URLs it contains.

    Expects JSON of the form {"input_text": "..."}; responds 400 when the
    body is missing or lacks that key.
    """
    payload = request.get_json()
    if not payload or "input_text" not in payload:
        return jsonify({"error": "Please provide 'input_text'"}), 400
    text = payload["input_text"]
    response = {
        "original_text": text,
        "message": f"Received your text! It is {len(text)} characters long.",
        "urls_found": extract_urls(text),
        "status": "success",
    }
    return jsonify(response)
import os
import re
from typing import List
import pandas as pd
from deep_translator import GoogleTranslator
from langchain_core.documents import Document
from langchain_community.document_loaders import (
WebBaseLoader, PyPDFLoader, Docx2txtLoader
)
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import pipeline
from huggingface_hub import snapshot_download
import os
# ------------------------------------------------------------
# Download the knowledge-base dataset from the Hugging Face Hub into
# ./knowledge_base. Runs at import time (network side effect).
# ------------------------------------------------------------
DATASET_REPO = "bshk57/Sastra_data"
LOCAL_DIR = "knowledge_base"
os.makedirs(LOCAL_DIR, exist_ok=True)
# NOTE(review): `local_dir_use_symlinks` is deprecated/ignored in recent
# huggingface_hub releases — confirm the pinned version still accepts it.
local_path = snapshot_download(
repo_id=DATASET_REPO,
repo_type="dataset",
local_dir=LOCAL_DIR,
local_dir_use_symlinks=False,
ignore_patterns=[".gitattributes"]
)
# ============================================================
# 1️⃣ CONFIGURATION
# ============================================================
# Public SASTRA pages scraped into the knowledge base at startup.
SASTRA_URLS = [
"https://www.sastra.edu/about-us.html",
"https://www.sastra.edu/academics/schools.html#school-of-computing",
"https://www.sastra.edu/admissions/ug-pg.html",
"https://www.sastra.edu/admissions/eligibility-criteria.html",
"https://www.sastra.edu/admissions/fee-structure.html",
"https://www.sastra.edu/admissions/hostel-fees.html",
"https://www.sastra.edu/infrastructure/physical-facilities.html",
"https://www.sastra.edu/about-us/mission-vision.html",
]
# Excel sheet of hand-curated keyword -> canned-response pairs.
KEYWORD_EXCEL = "Chat Bot- Keywords and Responses0511.xlsx"
# Directory scanned for local PDF/DOCX/XLSX documents (also the target of
# the Hub dataset download above).
UPLOAD_DIR = "knowledge_base"
# On-disk Chroma persistence directory.
VECTOR_DB_PATH = "sastra_vector_db"
# Multilingual sentence-embedding model used for retrieval.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Small seq2seq model used for answer generation.
LLM_MODEL = "google/flan-t5-base"
os.makedirs(UPLOAD_DIR, exist_ok=True)
# ============================================================
# 2️⃣ GLOBAL OBJECTS
# ============================================================
# Populated once by initialize_model() below; kept at module level so the
# request handlers can reuse the same vector store / QA chain per process.
vectordb = None        # Chroma vector store
retriever = None       # vectordb.as_retriever(...)
qa_chain = None        # RetrievalQA chain
keyword_responses: List[tuple] = []  # (lowercased keyword, response) pairs
# ============================================================
# 3️⃣ LOAD KEYWORD RESPONSES
# ============================================================
def load_keyword_responses(path):
    """Read (keyword, response) pairs from an Excel sheet at *path*.

    The sheet is expected to have a "Keywords" column (comma-separated
    keywords) and a "Response" column. Keywords are lowercased and
    stripped. Returns an empty list when the file does not exist.
    """
    if not os.path.exists(path):
        return []
    table = pd.read_excel(path)
    pairs = []
    for _, record in table.iterrows():
        keywords = record.get("Keywords")
        response = record.get("Response")
        if pd.isna(keywords) or pd.isna(response):
            continue
        pairs.extend(
            (word.strip().lower(), str(response))
            for word in str(keywords).split(",")
        )
    return pairs
# ============================================================
# 4️⃣ LOAD LOCAL DOCUMENTS
# ============================================================
def load_local_documents():
    """Load every PDF, DOCX, and XLSX file in UPLOAD_DIR as documents.

    XLSX rows are flattened into "col: value | col: value" strings, one
    Document per row. A file that fails to parse is skipped with a
    console warning rather than aborting the whole scan.
    """
    documents = []
    for name in os.listdir(UPLOAD_DIR):
        full_path = os.path.join(UPLOAD_DIR, name)
        lowered = name.lower()
        try:
            if lowered.endswith(".pdf"):
                documents += PyPDFLoader(full_path).load()
            elif lowered.endswith(".docx"):
                documents += Docx2txtLoader(full_path).load()
            elif lowered.endswith(".xlsx"):
                sheet = pd.read_excel(full_path)
                for _, record in sheet.iterrows():
                    row_text = " | ".join(
                        f"{col}: {record[col]}"
                        for col in sheet.columns
                        if pd.notna(record[col])
                    )
                    documents.append(
                        Document(page_content=row_text, metadata={"source": name})
                    )
        except Exception as e:
            print(f"⚠ Error loading {name}: {e}")
    return documents
# ============================================================
# 5️⃣ INITIALIZE RAG MODEL
# ============================================================
def initialize_model():
    """Build the full RAG pipeline and publish it via module globals.

    Scrapes SASTRA_URLS, loads local documents and the keyword Excel
    sheet, chunks and embeds everything into a persistent Chroma store,
    and wires a flan-t5 RetrievalQA chain. Populates the globals
    `vectordb`, `retriever`, `qa_chain`, and `keyword_responses`.
    """
    global vectordb, retriever, qa_chain, keyword_responses
    docs = []
    # Load website data. A single unreachable page should not abort
    # startup, but the failure should be visible in the logs.
    # FIX: was a bare `except:` that silently swallowed every exception,
    # including KeyboardInterrupt/SystemExit.
    for url in SASTRA_URLS:
        try:
            docs.extend(WebBaseLoader(url).load())
        except Exception as e:
            print(f"⚠ Could not load {url}: {e}")
    # Load local docs
    docs.extend(load_local_documents())
    # Index the curated keyword responses as documents too, so retrieval
    # can surface them even when the literal keyword is absent from the query.
    keyword_responses = load_keyword_responses(KEYWORD_EXCEL)
    for k, v in keyword_responses:
        docs.append(
            Document(
                page_content=f"{k}: {v}",
                metadata={"source": "keywords"}
            )
        )
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=600,
        chunk_overlap=50
    )
    chunks = splitter.split_documents(docs)
    embeddings = HuggingFaceEmbeddings(
        model_name=EMBEDDING_MODEL
    )
    vectordb = Chroma.from_documents(
        chunks,
        embeddings,
        persist_directory=VECTOR_DB_PATH
    )
    # Top-4 nearest chunks are stuffed into the prompt below.
    retriever = vectordb.as_retriever(search_kwargs={"k": 4})
    # Small local seq2seq model; low temperature keeps answers close to context.
    generator = pipeline(
        "text2text-generation",
        model=LLM_MODEL,
        tokenizer=LLM_MODEL,
        max_new_tokens=200,
        temperature=0.1,
        top_p=0.85,
        do_sample=True,
        repetition_penalty=1.2
    )
    llm = HuggingFacePipeline(pipeline=generator)
    # INSUFFICIENT_DATA is the sentinel clean_llm_output() filters out.
    prompt = PromptTemplate(
        input_variables=["context", "question"],
        template="""
You are AskSASTRA, the official SASTRA University admissions assistant.
Answer ONLY from the context.
If not found, say INSUFFICIENT_DATA.
Context:
{context}
Question:
{question}
Answer:
"""
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=retriever,
        chain_type="stuff",
        chain_type_kwargs={"prompt": prompt},
        return_source_documents=False
    )
    print("✅ AskSASTRA model initialized")
# Initialize on startup
# NOTE(review): this runs at import time and does network I/O plus model
# downloads — consider lazy initialization if cold-start time matters.
initialize_model()
# ============================================================
# 6️⃣ CHAT UTILITIES
# ============================================================
def clean_llm_output(text: str) -> str:
    """Normalise raw LLM output for the API response.

    Drops a leading "Answer:"/"Response:" label (case-insensitive),
    returns "" for the INSUFFICIENT_DATA sentinel, collapses runs of
    whitespace to single spaces, and caps the result at 600 characters.
    """
    without_label = re.sub(r'^(Answer:|Response:)', '', text, flags=re.I).strip()
    if without_label.lower().startswith("insufficient_data"):
        return ""
    collapsed = re.sub(r'\s+', ' ', without_label)
    return collapsed[:600]
def match_keyword(query: str):
    """Return the canned response for the first keyword that appears as a
    substring of *query* (case-insensitive), or None when nothing matches.

    Searches the module-level `keyword_responses` pairs in order, so
    earlier rows in the Excel sheet take priority.
    """
    # Hoisted out of the loop: the lowered query is loop-invariant and was
    # previously recomputed on every iteration.
    lowered = query.lower()
    for keyword, response in keyword_responses:
        if keyword in lowered:
            return response
    return None
# ============================================================
# 8️⃣ CHATBOT API ENDPOINT
# ============================================================
@app.route("/chat", methods=["POST"])
def chat():
    """Answer a user question: keyword match first, then RAG, then fallback.

    Expects JSON {"query": "...", "language": "en"}. Non-English queries
    are translated to English before matching and retrieval. Always
    returns JSON of the form {"answer": "..."}.
    """
    data = request.get_json(force=True)
    query = data.get("query", "").strip()
    lang = data.get("language", "en")
    if not query:
        return jsonify({"answer": "Please ask a valid question."})
    # Translate to English, best effort — fall back to the raw query.
    # FIX: was a bare `except:` that also swallowed KeyboardInterrupt/SystemExit.
    query_en = query
    if lang != "en":
        try:
            query_en = GoogleTranslator(
                source=lang, target="en"
            ).translate(query)
        except Exception as e:
            print(f"⚠ Translation failed ({lang} -> en): {e}")
    # 1️⃣ Keyword match
    keyword_answer = match_keyword(query_en)
    if keyword_answer:
        return jsonify({"answer": keyword_answer})
    # 2️⃣ RAG inference
    # FIX: failures were swallowed with no trace; now logged. A failure
    # (including qa_chain being None after a failed init) still degrades
    # gracefully to the fallback message below.
    answer = ""
    try:
        result = qa_chain.invoke({"query": query_en})
        answer = clean_llm_output(result.get("result", ""))
    except Exception as e:
        print(f"⚠ RAG inference failed: {e}")
    # Require a minimally substantive answer (>= 5 words) before trusting it.
    if answer and len(answer.split()) >= 5:
        return jsonify({"answer": answer})
    # 3️⃣ Fallback
    # NOTE(review): the answer is returned in English even for non-English
    # requests — confirm whether back-translation to `lang` is wanted.
    return jsonify({
        "answer": (
            "I couldn't find confident information related to this question. "
            "Please contact the SASTRA Admissions Office at "
            "admissions@sastra.edu or visit www.sastra.edu."
        )
    })