# === Imports ===
import os
import re

import gradio as gr
import torch
from datetime import datetime
from pymongo import MongoClient
from sentence_transformers import SentenceTransformer, util
from langchain_core.prompts import PromptTemplate
from langchain_community.llms import HuggingFacePipeline
from IndicTransToolkit.processor import IndicProcessor
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    BitsAndBytesConfig,  # kept from the original file (currently unused here)
    pipeline,
)

# === MongoDB ===
# SECURITY NOTE(review): the fallback URI embeds live credentials in source
# control. Prefer setting MONGO_URI in the environment; the literal default is
# kept only for backward compatibility and should be rotated and removed.
mongo_uri = os.environ.get(
    "MONGO_URI",
    "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/",
)
client = MongoClient(mongo_uri)
db = client["msme_schemes_db"]
udyam_coll = db["udyam_profiles"]               # registered Udyam enterprise profiles
schemes_chunk_coll = db["schemes_chunks_only"]  # scheme text chunks with embeddings
schemes_info_coll = db["schemes_embedded"]      # per-scheme structured field lists
query_logs_coll = db["query_logs"]              # audit log of user queries

# === LLM (query rephrasing / field summarization) ===
model_id = "google/gemma-2b-it"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=False,  # deterministic decoding for reproducible search queries
)
llm = HuggingFacePipeline(pipeline=generator)

# Sentence embedder used for semantic scheme retrieval.
embed_model = SentenceTransformer(
    "BAAI/bge-small-en-v1.5",
    device="cuda" if torch.cuda.is_available() else "cpu",
)

# === IndicTrans2 (English -> Telugu translation) ===
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ip = IndicProcessor(inference=True)


def initialize_translator(ckpt_dir):
    """Load the IndicTrans2 tokenizer and seq2seq model from *ckpt_dir*.

    Returns a ``(tokenizer, model)`` pair with the model in eval mode on DEVICE.
    Local names renamed so they no longer shadow the module-level
    ``tokenizer``/``model`` used by the Gemma pipeline above.
    """
    trans_tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    trans_model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir, trust_remote_code=True
    ).to(DEVICE)
    trans_model.eval()
    return trans_tokenizer, trans_model


def translate_to_telugu(text, tokenizer, model):
    """Translate English *text* to Telugu with IndicTrans2 beam search."""
    batch = ip.preprocess_batch([text], src_lang="eng_Latn", tgt_lang="tel_Telu")
    inputs = tokenizer(batch, return_tensors="pt", padding=True).to(DEVICE)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_length=256, num_beams=5)
    result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    return ip.postprocess_batch(result, lang="tel_Telu")[0]


translator_tokenizer, translator_model = initialize_translator(
    "ai4bharat/indictrans2-en-indic-1B"
)

# === Prompt ===
rephrase_template = PromptTemplate.from_template("""
You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
Only return the query. Avoid comments.

Enterprise Profile:
{profile_summary}
""")


# === Utilities ===
def normalize_udyam(uid):
    """Canonicalize a user-typed Udyam ID (trim, uppercase, drop spaces)."""
    return uid.strip().upper().replace(" ", "")


def is_valid_udyam(uid):
    """Return True if *uid* matches the UDYAM-XX-00-000000(0) format."""
    return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))


def get_profile_by_uid(uid):
    """Fetch an enterprise profile by Udyam ID, or None if invalid/unknown."""
    uid = normalize_udyam(uid)
    if not is_valid_udyam(uid):
        return None
    return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})


def summarize_profile(profile):
    """Render the profile dict as one English paragraph for LLM prompting.

    Assumes the monetary and employment fields are ints (the manual-entry
    flow converts them) so the ``:,`` thousands formatting works.
    """
    return (
        f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
        f"They identify as {profile['Gender']}, run a {profile['Enterprise Type']} sized {profile['Organisation Type'].lower()} organization. The enterprise has "
        f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
    )


def generate_search_query(profile):
    """Use the LLM to turn a profile into a one-line scheme search query.

    Returns ``(query, summary)`` where *summary* is the prose profile text.
    """
    summary = summarize_profile(profile)
    prompt = rephrase_template.format(profile_summary=summary)
    response = llm.invoke(prompt)
    # Keep only the first line: the model occasionally appends extra chatter.
    return response.strip().split("\n")[0].strip(), summary


def get_top_matching_schemes(query_text, top_k=5):
    """Return up to *top_k* distinct schemes ranked by cosine similarity.

    Scans every chunk embedding in ``schemes_chunk_coll``, then deduplicates
    by scheme_id keeping each scheme's best-scoring chunk.
    """
    query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
    device = query_embedding.device  # hoisted: constant across the whole scan
    matches = []
    for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
        for chunk in doc["rag_chunks"]:
            if chunk.get("embedding"):  # skip missing or empty embeddings
                chunk_tensor = torch.tensor(chunk["embedding"]).to(device)
                score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
                matches.append({
                    "score": score,
                    "scheme_id": doc.get("scheme_id"),
                    "scheme_name": doc.get("scheme_name"),
                })
    seen, top_results = set(), []
    for m in sorted(matches, key=lambda x: x["score"], reverse=True):
        if m["scheme_id"] not in seen:
            top_results.append(m)
            seen.add(m["scheme_id"])
            if len(top_results) == top_k:
                break
    return top_results


def fetch_scheme_field_llm(scheme_id, field_input):
    """Answer a follow-up question about *scheme_id* from stored scheme fields.

    Maps loose user wording (e.g. "how do I apply") onto a stored list field
    and asks the LLM to summarize the first few entries.
    """
    field_map = {
        "eligibility": "eligibility_list",
        "benefits": "key_benefits_list",
        "assistance": "assistance_list",
        "apply": "how_to_apply_list",
        "documents": "required_documents_list",
    }
    matched_field = next(
        (v for k, v in field_map.items() if k in field_input.lower()), None
    )
    if not matched_field:
        return "❌ Try asking about eligibility, benefits, how to apply, or documents."
    doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
    if doc and matched_field in doc:
        raw_text = "\n".join(doc[matched_field][:5])
        prompt = f"""
Summarize the following information into a clear and professional explanation for business owners:

Scheme: {doc['scheme_name']}
Section: {matched_field.replace('_list','').title()}

{raw_text}
"""
        return llm.invoke(prompt).strip()
    return "⚠️ Couldn’t find that information for the selected scheme."
# === Chat State ===
# NOTE(review): this module-level dict is shared by ALL concurrent users of
# the Gradio app; per-session state (e.g. gr.State) would be needed for true
# multi-user support. Kept as-is to preserve the existing interface.
chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}

# Ordered fields collected during the manual profile flow (stage 2).
_MANUAL_STEPS = [
    "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
    "Major Activity", "State", "Investment Cost (In Rs.)",
    "Net Turnover (In Rs.)", "Employment",
]
# Fields that must be stored as ints (matches the original "Cost"/"Turnover"/
# "Employment" substring test).
_NUMERIC_KEYS = {"Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"}


def chatbot(msg, history):
    """Drive the conversation as a small state machine.

    Stages: 0 greeting -> 1 Udyam ID / 'manual' choice -> 2 manual profile
    entry -> 3 scheme recommendation -> 4 follow-up Q&A on the top scheme.
    Returns the bot's reply string; *history* is supplied by Gradio's
    ChatInterface and is not used (state lives in ``chat_state``).
    """
    if chat_state["stage"] == 0:
        chat_state["stage"] = 1
        chat_state["last_bot_msg"] = "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
        return chat_state["last_bot_msg"]

    if chat_state["stage"] == 1:
        if msg.lower().startswith("udyam-"):
            profile = get_profile_by_uid(msg)
            if profile:
                chat_state["profile"] = profile
                chat_state["stage"] = 3
                summary = summarize_profile(profile)
                chat_state["summary"] = summary
                chat_state["last_bot_msg"] = f"✅ Profile found! Generating recommendations...\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
                return chat_state["last_bot_msg"]
            chat_state["last_bot_msg"] = "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
            return chat_state["last_bot_msg"]
        elif "manual" in msg.lower():
            chat_state["stage"] = 2
            chat_state["last_bot_msg"] = "📝 Great! What's your enterprise name?"
            return chat_state["last_bot_msg"]
        chat_state["last_bot_msg"] = "Please enter a valid Udyam ID or type 'manual'."
        return chat_state["last_bot_msg"]

    if chat_state["stage"] == 2:
        curr_index = len(chat_state["profile"])
        key = _MANUAL_STEPS[curr_index]
        if key in _NUMERIC_KEYS:
            # BUGFIX: int(msg) previously raised an uncaught ValueError on
            # non-numeric input, crashing the chat; reprompt instead.
            try:
                chat_state["profile"][key] = int(msg)
            except ValueError:
                chat_state["last_bot_msg"] = f"Please enter a valid number for {key}."
                return chat_state["last_bot_msg"]
        else:
            chat_state["profile"][key] = msg
        if len(chat_state["profile"]) == len(_MANUAL_STEPS):
            chat_state["stage"] = 3
            summary = summarize_profile(chat_state["profile"])
            chat_state["summary"] = summary
            chat_state["last_bot_msg"] = f"✅ Thanks! Profile completed.\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
            return chat_state["last_bot_msg"]
        prompt = f"{_MANUAL_STEPS[curr_index + 1]}?"
        chat_state["last_bot_msg"] = prompt
        return prompt

    if chat_state["stage"] == 3:
        if "show" in msg.lower() and "scheme" in msg.lower():
            query, summary = generate_search_query(chat_state["profile"])
            top_schemes = get_top_matching_schemes(query)
            if not top_schemes:
                chat_state["last_bot_msg"] = "⚠️ No matching schemes found."
                return chat_state["last_bot_msg"]
            chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
            chat_state["stage"] = 4
            schemes_text = "\n".join(
                f"{i+1}. {s['scheme_name']} (Score: {round(s['score'], 4)})"
                for i, s in enumerate(top_schemes)
            )
            chat_state["last_bot_msg"] = f"📈 Recommended Schemes:\n{schemes_text}\n\nYou can now ask about eligibility, apply, documents, etc."
            # Local import: the top-of-file import only brings in `datetime`.
            # datetime.utcnow() is deprecated and returns a naive timestamp;
            # log a timezone-aware UTC timestamp instead.
            from datetime import timezone
            query_logs_coll.insert_one({
                "timestamp": datetime.now(timezone.utc),
                "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
                "profile_summary": summary,
                "query": query,
                "top_schemes": top_schemes,
                "selected_scheme": top_schemes[0]["scheme_name"],
            })
            return chat_state["last_bot_msg"]
        chat_state["last_bot_msg"] = "Type 'show related schemes' to proceed."
        return chat_state["last_bot_msg"]

    if chat_state["stage"] == 4:
        response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
        chat_state["last_bot_msg"] = response
        return response

    # Defensive fallback: stages outside 0-4 should be unreachable, but the
    # original fell through returning None (a blank chat bubble); reset instead.
    chat_state["stage"] = 0
    chat_state["last_bot_msg"] = "⚠️ Unexpected state. Please start over."
    return chat_state["last_bot_msg"]


def translate_last_response():
    """Translate the bot's most recent reply to Telugu, or warn if none."""
    if chat_state["last_bot_msg"]:
        return "📄 Telugu Translation:\n\n" + translate_to_telugu(
            chat_state["last_bot_msg"], translator_tokenizer, translator_model
        )
    return "⚠️ No message to translate."


# === UI ===
with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
    chatbot_ui = gr.ChatInterface(
        fn=chatbot,
        title="🤖 MSME Scheme Assistant",
        textbox=gr.Textbox(placeholder="Type your message here..."),
    )
    translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
    translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
    translate_btn.click(fn=translate_last_response, outputs=translation_output)

demo.launch()