# MSME_Chat_bot / app.py
# Author: Vipplav
# Update app.py — commit e710c18 (verified)
import os
import re
from datetime import datetime

import gradio as gr
import torch
from pymongo import MongoClient
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from sentence_transformers import SentenceTransformer, util
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from IndicTransToolkit.processor import IndicProcessor
# === MongoDB ===
# SECURITY NOTE: the connection string previously embedded a username/password
# directly in source. Prefer the MONGO_URI environment variable; the hard-coded
# value is kept only as a fallback so existing deployments keep working —
# rotate these credentials and remove the fallback as soon as possible.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/",
)
client = MongoClient(mongo_uri)
db = client["msme_schemes_db"]
udyam_coll = db["udyam_profiles"]               # registered Udyam enterprise profiles
schemes_chunk_coll = db["schemes_chunks_only"]  # RAG chunks with precomputed embeddings
schemes_info_coll = db["schemes_embedded"]      # structured per-scheme field lists
query_logs_coll = db["query_logs"]              # audit log of user queries
# === LLM ===
# Instruction-tuned Gemma 2B used both for query rephrasing and for
# summarizing scheme fields; fp16 on GPU, fp32 on CPU.
model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
# Greedy decoding (do_sample=False) keeps rephrased queries deterministic.
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
# LangChain wrapper so the pipeline can be called via llm.invoke(prompt).
llm = HuggingFacePipeline(pipeline=generator)
# Sentence embedder used to score scheme chunks against the search query.
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
# === IndicTrans2 ===
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Pre/post-processor shared by all IndicTrans2 translation calls.
ip = IndicProcessor(inference=True)
def initialize_translator(ckpt_dir):
    """Load an IndicTrans2 seq2seq checkpoint and return (tokenizer, model).

    The model is moved to the module-level DEVICE and put in eval mode so it
    is ready for inference-only generate() calls.
    """
    trans_tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    trans_model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, trust_remote_code=True)
    trans_model = trans_model.to(DEVICE)
    trans_model.eval()
    return trans_tokenizer, trans_model
def translate_to_telugu(text, tokenizer, model):
    """Translate a single English string to Telugu via IndicTrans2.

    Runs the shared IndicProcessor pre/post-processing around a beam-search
    generate() call and returns the first (only) translated string.
    """
    preprocessed = ip.preprocess_batch([text], src_lang="eng_Latn", tgt_lang="tel_Telu")
    encoded = tokenizer(preprocessed, return_tensors="pt", padding=True).to(DEVICE)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        generated = model.generate(**encoded, max_length=256, num_beams=5)
    decoded = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return ip.postprocess_batch(decoded, lang="tel_Telu")[0]
# English -> Indic 1B checkpoint, loaded once at startup (slow on first run).
translator_tokenizer, translator_model = initialize_translator("ai4bharat/indictrans2-en-indic-1B")
# === Prompt ===
# Template used to turn a profile summary into a one-line scheme search query.
rephrase_template = PromptTemplate.from_template("""
You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
Only return the query. Avoid comments.
Enterprise Profile:
{profile_summary}
""")
# === Utilities ===
def normalize_udyam(uid):
    """Canonicalize a Udyam ID: trim, uppercase, and drop every space."""
    cleaned = uid.strip().upper()
    return cleaned.replace(" ", "")
def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
def get_profile_by_uid(uid):
    """Fetch a Udyam profile document (without _id), or None if invalid/absent."""
    canonical = normalize_udyam(uid)
    if not is_valid_udyam(canonical):
        return None
    return udyam_coll.find_one({"Udyam_ID": canonical}, {"_id": 0})
def summarize_profile(profile):
    """Render a one-paragraph English summary of an enterprise profile dict.

    Expects the Udyam / manual-entry keys ('Enterprise Name', 'State', ...);
    the two monetary fields and 'Employment' must be numeric so they can be
    comma-formatted.
    """
    identity = (
        f"The user represents an enterprise named '{profile['Enterprise Name']}', "
        f"based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
    )
    structure = (
        f"They identify as {profile['Gender']}, run a {profile['Enterprise Type']} sized "
        f"{profile['Organisation Type'].lower()} organization. The enterprise has "
    )
    financials = (
        f"{profile['Employment']} employees, with an investment of "
        f"₹{profile['Investment Cost (In Rs.)']:,} and a turnover of "
        f"₹{profile['Net Turnover (In Rs.)']:,}."
    )
    return identity + structure + financials
def generate_search_query(profile):
    """Ask the LLM to compress a profile summary into a one-line search query.

    Returns a (query, summary) tuple; the query is the first line of the
    model's response, stripped of surrounding whitespace.
    """
    profile_summary = summarize_profile(profile)
    rendered_prompt = rephrase_template.format(profile_summary=profile_summary)
    raw_response = llm.invoke(rendered_prompt)
    first_line = raw_response.strip().split("\n")[0].strip()
    return first_line, profile_summary
def get_top_matching_schemes(query_text, top_k=5):
    """Rank scheme chunks by cosine similarity against *query_text*.

    Scans every document carrying precomputed 'rag_chunks' embeddings, scores
    each chunk against the query embedding, and returns up to *top_k* results
    deduplicated by scheme_id (the best-scoring chunk per scheme wins).
    """
    query_vec = embed_model.encode(query_text, convert_to_tensor=True)
    scored = []
    for scheme_doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
        for chunk in scheme_doc["rag_chunks"]:
            if "embedding" not in chunk or not chunk["embedding"]:
                continue  # skip chunks with missing/empty vectors
            chunk_vec = torch.tensor(chunk["embedding"]).to(query_vec.device)
            similarity = util.cos_sim(query_vec, chunk_vec)[0][0].item()
            scored.append({
                "score": similarity,
                "scheme_id": scheme_doc.get("scheme_id"),
                "scheme_name": scheme_doc.get("scheme_name"),
            })
    # Walk matches best-first, keeping only the first hit per scheme.
    top_results, seen_ids = [], set()
    for match in sorted(scored, key=lambda m: m["score"], reverse=True):
        if match["scheme_id"] in seen_ids:
            continue
        top_results.append(match)
        seen_ids.add(match["scheme_id"])
        if len(top_results) == top_k:
            break
    return top_results
def fetch_scheme_field_llm(scheme_id, field_input):
    """Answer a follow-up question about one field of a scheme via the LLM.

    Maps free-text keywords in *field_input* (e.g. "eligibility", "apply") to
    the corresponding list field stored on the scheme document, then has the
    LLM summarize up to the first five entries. Returns a user-facing string
    on every path, including the two error cases.
    """
    # Keyword -> field name in the schemes_embedded collection.
    field_map = {
        "eligibility": "eligibility_list",
        "benefits": "key_benefits_list",
        "assistance": "assistance_list",
        "apply": "how_to_apply_list",
        "documents": "required_documents_list"
    }
    # First keyword found anywhere in the user's message wins.
    matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
    if not matched_field:
        return "❌ Try asking about eligibility, benefits, how to apply, or documents."
    doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
    if doc and matched_field in doc:
        # Cap at five entries to keep the prompt (and latency) small.
        raw_text = "\n".join(doc[matched_field][:5])
        prompt = f"""
Summarize the following information into a clear and professional explanation for business owners:
Scheme: {doc['scheme_name']}
Section: {matched_field.replace('_list','').title()}
{raw_text}
"""
        return llm.invoke(prompt).strip()
    return "⚠️ Couldn’t find that information for the selected scheme."
# === Chat State ===
# Single module-level conversation state — NOTE(review): shared by every
# visitor to the Space, so concurrent users would trample each other's state.
# Stages (see chatbot()): 0 = greet, 1 = awaiting Udyam ID / 'manual',
# 2 = manual profile entry, 3 = profile ready, 4 = scheme Q&A.
chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}
def chatbot(msg, history):
    """Gradio ChatInterface callback implementing a staged conversation.

    Stages (tracked in the module-global chat_state):
      0 -> greet and ask for a Udyam ID (the triggering message is ignored),
      1 -> validate the Udyam ID, or switch to manual entry,
      2 -> collect manual profile fields one answer at a time,
      3 -> recommend schemes when the user asks to see them,
      4 -> answer field-level questions about the top scheme.

    Always returns the bot's reply string; the last reply is also cached in
    chat_state["last_bot_msg"] for the Telugu translation button.
    """
    if chat_state["stage"] == 0:
        chat_state["stage"] = 1
        chat_state["last_bot_msg"] = "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
        return chat_state["last_bot_msg"]
    if chat_state["stage"] == 1:
        if msg.lower().startswith("udyam-"):
            profile = get_profile_by_uid(msg)
            if profile:
                chat_state["profile"] = profile
                chat_state["stage"] = 3
                summary = summarize_profile(profile)
                chat_state["summary"] = summary
                chat_state["last_bot_msg"] = f"✅ Profile found! Generating recommendations...\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
                return chat_state["last_bot_msg"]
            chat_state["last_bot_msg"] = "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
            return chat_state["last_bot_msg"]
        elif "manual" in msg.lower():
            chat_state["stage"] = 2
            chat_state["last_bot_msg"] = "📝 Great! What's your enterprise name?"
            return chat_state["last_bot_msg"]
        chat_state["last_bot_msg"] = "Please enter a valid Udyam ID or type 'manual'."
        return chat_state["last_bot_msg"]
    if chat_state["stage"] == 2:
        # Questions are asked in this fixed order; progress is implied by how
        # many answers have been stored so far.
        steps = [
            "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
            "Major Activity", "State", "Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"
        ]
        curr_index = len(chat_state["profile"])
        key = steps[curr_index]
        # BUGFIX: numeric fields previously used a bare int(msg), so any
        # non-numeric answer raised ValueError and crashed the chat. Now we
        # re-prompt for the same field instead.
        if "Cost" in key or "Turnover" in key or "Employment" in key:
            try:
                chat_state["profile"][key] = int(msg)
            except ValueError:
                chat_state["last_bot_msg"] = f"⚠️ Please enter a whole number for {key}."
                return chat_state["last_bot_msg"]
        else:
            chat_state["profile"][key] = msg
        if len(chat_state["profile"]) == len(steps):
            chat_state["stage"] = 3
            summary = summarize_profile(chat_state["profile"])
            chat_state["summary"] = summary
            chat_state["last_bot_msg"] = f"✅ Thanks! Profile completed.\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
            return chat_state["last_bot_msg"]
        prompt = f"{steps[curr_index + 1]}?"
        chat_state["last_bot_msg"] = prompt
        return prompt
    if chat_state["stage"] == 3:
        if "show" in msg.lower() and "scheme" in msg.lower():
            query, summary = generate_search_query(chat_state["profile"])
            top_schemes = get_top_matching_schemes(query)
            if not top_schemes:
                chat_state["last_bot_msg"] = "⚠️ No matching schemes found."
                return chat_state["last_bot_msg"]
            # Follow-up questions (stage 4) target the top-ranked scheme only.
            chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
            chat_state["stage"] = 4
            schemes_text = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})" for i, s in enumerate(top_schemes)])
            chat_state["last_bot_msg"] = f"📈 Recommended Schemes:\n{schemes_text}\n\nYou can now ask about eligibility, apply, documents, etc."
            # Audit log of what was recommended and why.
            query_logs_coll.insert_one({
                "timestamp": datetime.utcnow(),
                "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
                "profile_summary": summary,
                "query": query,
                "top_schemes": top_schemes,
                "selected_scheme": top_schemes[0]["scheme_name"]
            })
            return chat_state["last_bot_msg"]
        chat_state["last_bot_msg"] = "Type 'show related schemes' to proceed."
        return chat_state["last_bot_msg"]
    if chat_state["stage"] == 4:
        response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
        chat_state["last_bot_msg"] = response
        return response
def translate_last_response():
    """Translate the most recent bot reply to Telugu, or warn if there is none."""
    last_reply = chat_state["last_bot_msg"]
    if not last_reply:
        return "⚠️ No message to translate."
    telugu_text = translate_to_telugu(last_reply, translator_tokenizer, translator_model)
    return "📄 Telugu Translation:\n\n" + telugu_text
# === UI ===
with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
    # Main chat surface; chatbot() keeps its own stage machine in chat_state.
    chatbot_ui = gr.ChatInterface(fn=chatbot, title="🤖 MSME Scheme Assistant", textbox=gr.Textbox(placeholder="Type your message here..."))
    # One-click Telugu translation of the bot's latest cached reply.
    translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
    translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
    translate_btn.click(fn=translate_last_response, outputs=translation_output)
# Launched unconditionally — standard for a Hugging Face Spaces app.py.
demo.launch()