Spaces:

Vipplav
/

MSME_Chat_bot

Runtime error

App Files Community

Vipplav committed on Jun 8, 2025

Commit

35714a8

verified ·

1 Parent(s): 2622a38

11

Browse files

Files changed (1) hide show

app.py +103 -66

app.py CHANGED Viewed

@@ -1,32 +1,48 @@
 import gradio as gr
 from pymongo import MongoClient
-from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer, util
 from langchain_core.prompts import PromptTemplate
 from langchain_community.llms import HuggingFacePipeline
-from datetime import datetime
-import torch, re
-# === Setup ===
-mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
-client = MongoClient(mongo_uri)
 db = client["msme_schemes_db"]
 udyam_coll = db["udyam_profiles"]
 schemes_chunk_coll = db["schemes_chunks_only"]
 schemes_info_coll = db["schemes_embedded"]
 query_logs_coll = db["query_logs"]
-model_id = "google/gemma-2b-it"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map="auto",
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 )
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
 llm = HuggingFacePipeline(pipeline=generator)
 embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
 rephrase_template = PromptTemplate.from_template("""
 You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
 Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
@@ -35,20 +51,19 @@ Enterprise Profile:
 {profile_summary}
 """)
-# === Utils ===
 def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
 def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 def get_profile_by_uid(uid):
     uid = normalize_udyam(uid)
-    if not is_valid_udyam(uid): return None
-    return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
 def summarize_profile(profile):
     return (
-        f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
-        f"They identify as {profile['Gender']}, run a {profile['Enterprise Type']} sized {profile['Organisation Type'].lower()} organization. The enterprise has "
-        f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
     )
 def generate_search_query(profile):
@@ -62,7 +77,7 @@ def get_top_matching_schemes(query_text, top_k=5):
     matches = []
     for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
         for chunk in doc["rag_chunks"]:
-            if "embedding" in chunk and chunk["embedding"]:
                 chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
                 score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
                 matches.append({
@@ -88,29 +103,21 @@ def fetch_scheme_field_llm(scheme_id, field_input):
         "documents": "required_documents_list"
     }
     matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
-    if not matched_field:
-        return "❌ Try asking about eligibility, benefits, how to apply, or documents."
     doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
-    if doc and matched_field in doc:
         raw_text = "\n".join(doc[matched_field][:5])
-        prompt = f"""
-Summarize the following information into a clear and professional explanation for business owners:
-Scheme: {doc['scheme_name']}
-Section: {matched_field.replace('_list','').title()}
-{raw_text}
-"""
         return llm.invoke(prompt).strip()
-    return "⚠️ Couldn’t find that information for the selected scheme."
-# === Chatbot ===
-chat_state = {"stage": 0, "profile": {}, "scheme_id": None}
 def chatbot(msg, history):
     if chat_state["stage"] == 0:
         chat_state["stage"] = 1
-        return "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
     if chat_state["stage"] == 1:
         if msg.lower().startswith("udyam-"):
@@ -118,49 +125,79 @@ def chatbot(msg, history):
             if profile:
                 chat_state["profile"] = profile
                 chat_state["stage"] = 3
-                return "✅ Profile found! Generating recommendations..."
-            return "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
         elif "manual" in msg.lower():
             chat_state["stage"] = 2
-            return "📝 Great! What's your enterprise name?"
-        return "Please enter a valid Udyam ID or type 'manual'."
     if chat_state["stage"] == 2:
         steps = [
-            "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
-            "Major Activity", "State", "Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"
         ]
-        curr_index = len(chat_state["profile"])
-        key = steps[curr_index]
-        if "Cost" in key or "Turnover" in key or "Employment" in key:
-            chat_state["profile"][key] = int(msg)
-        else:
-            chat_state["profile"][key] = msg
-        if len(chat_state["profile"]) == len(steps):
             chat_state["stage"] = 3
-            return "✅ Thanks! Now generating recommendations..."
-        return f"{steps[curr_index + 1]}?"
     if chat_state["stage"] == 3:
-        query, summary = generate_search_query(chat_state["profile"])
-        top_schemes = get_top_matching_schemes(query)
-        if not top_schemes:
-            return "⚠️ No matching schemes found."
-        chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
-        query_logs_coll.insert_one({
-            "timestamp": datetime.utcnow(),
-            "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
-            "profile_summary": summary,
-            "query": query,
-            "top_schemes": top_schemes,
-            "selected_scheme": top_schemes[0]["scheme_name"]
-        })
-        names = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})" for i, s in enumerate(top_schemes)])
-        chat_state["stage"] = 4
-        return f"🔍 Based on your profile: {summary}\n\n📈 Recommended Schemes:\n{names}\n\nYou can now ask about this scheme using keywords like 'eligibility', 'apply', or 'documents'."
     if chat_state["stage"] == 4:
-        return fetch_scheme_field_llm(chat_state["scheme_id"], msg)
-demo = gr.ChatInterface(fn=chatbot, title="🤖 MSME Chatbot Assistant", textbox=gr.Textbox(placeholder="Type your message here..."))
 demo.launch()

 import gradio as gr
+import torch, re
 from pymongo import MongoClient
+from datetime import datetime
+from transformers import (
+    AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline
+)
 from sentence_transformers import SentenceTransformer, util
 from langchain_core.prompts import PromptTemplate
 from langchain_community.llms import HuggingFacePipeline
+from IndicTransToolkit.processor import IndicProcessor
+# === MongoDB ===
+client = MongoClient("mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/")
 db = client["msme_schemes_db"]
 udyam_coll = db["udyam_profiles"]
 schemes_chunk_coll = db["schemes_chunks_only"]
 schemes_info_coll = db["schemes_embedded"]
 query_logs_coll = db["query_logs"]
+# === LLM Setup ===
+MODEL_ID = "Vipplav/gemma-finetuned-faq"
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
 model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
 )
 generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
 llm = HuggingFacePipeline(pipeline=generator)
+# === Embedding Model ===
 embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
+# === IndicTrans2 ===
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+ip = IndicProcessor(inference=True)
+def initialize_translator(ckpt_dir):
+    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
+    model = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, trust_remote_code=True).to(DEVICE)
+    model.eval()
+    return tokenizer, model
+translator_tokenizer, translator_model = initialize_translator("ai4bharat/indictrans2-en-indic-1B")
+# === Prompt Template ===
 rephrase_template = PromptTemplate.from_template("""
 You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
 Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
 {profile_summary}
 """)
+# === Utilities ===
 def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
 def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 def get_profile_by_uid(uid):
     uid = normalize_udyam(uid)
+    return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0}) if is_valid_udyam(uid) else None
 def summarize_profile(profile):
     return (
+        f"The user represents '{profile['Enterprise Name']}', a {profile['Enterprise Type']} enterprise from {profile['State']}, in the {profile['Major Activity']} sector. "
+        f"They are a {profile['Gender']} entrepreneur with an investment of ₹{profile['Investment Cost (In Rs.)']:,}, a turnover of ₹{profile['Net Turnover (In Rs.)']:,}, "
+        f"and {profile['Employment']} employees running a {profile['Organisation Type'].lower()}."
     )
 def generate_search_query(profile):
     matches = []
     for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
         for chunk in doc["rag_chunks"]:
+            if "embedding" in chunk:
                 chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
                 score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
                 matches.append({
         "documents": "required_documents_list"
     }
     matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
     doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
+    if doc and matched_field and matched_field in doc:
         raw_text = "\n".join(doc[matched_field][:5])
+        prompt = f"""Summarize this section professionally for MSME users:\n\nScheme: {doc['scheme_name']}\nSection: {matched_field.replace('_list','').title()}\n\n{raw_text}"""
         return llm.invoke(prompt).strip()
+    return "⚠️ Please ask about: eligibility, benefits, apply process, or required documents."
+# === Chat State ===
+chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}
+# === Chatbot Logic ===
 def chatbot(msg, history):
     if chat_state["stage"] == 0:
         chat_state["stage"] = 1
+        return "👋 Welcome! Enter your Udyam ID (e.g., `UDYAM-TS-12-1234567`) or type `manual` to fill in your profile."
     if chat_state["stage"] == 1:
         if msg.lower().startswith("udyam-"):
             if profile:
                 chat_state["profile"] = profile
                 chat_state["stage"] = 3
+                summary = summarize_profile(profile)
+                chat_state["summary"] = summary
+                return f"✅ Found your profile!\n\n🔍 {summary}\n\nType `show related schemes` to get recommendations."
+            return "❌ Invalid or unregistered Udyam ID. Try again or say `manual` to fill manually."
         elif "manual" in msg.lower():
             chat_state["stage"] = 2
+            chat_state["profile"] = {}
+            return "📝 Let's build your profile.\n\nStep 1: What's your enterprise name? (e.g., `Sri Laxmi Industries`)"
+        return "🔄 Please enter a valid Udyam ID or say `manual`."
     if chat_state["stage"] == 2:
         steps = [
+            ("Enterprise Name", "e.g., `Sri Laxmi Textiles`"),
+            ("Gender", "e.g., `Female`"),
+            ("Enterprise Type", "e.g., `Micro`"),
+            ("Organisation Type", "e.g., `Sole Proprietorship`"),
+            ("Major Activity", "e.g., `Manufacturing`"),
+            ("State", "e.g., `Telangana`"),
+            ("Investment Cost (In Rs.)", "e.g., `5000000`"),
+            ("Net Turnover (In Rs.)", "e.g., `12000000`"),
+            ("Employment", "e.g., `23`")
         ]
+        idx = len(chat_state["profile"])
+        key, example = steps[idx]
+        value = int(msg) if "Cost" in key or "Turnover" in key or "Employment" in key else msg
+        chat_state["profile"][key] = value
+        if idx + 1 == len(steps):
             chat_state["stage"] = 3
+            summary = summarize_profile(chat_state["profile"])
+            chat_state["summary"] = summary
+            return f"✅ Thanks! Here's your profile:\n\n🔍 {summary}\n\nType `show related schemes` to get recommendations."
+        next_key, next_example = steps[idx + 1]
+        return f"Step {idx + 2}: {next_key}? ({next_example})"
     if chat_state["stage"] == 3:
+        if "show" in msg.lower() and "scheme" in msg.lower():
+            query, summary = generate_search_query(chat_state["profile"])
+            top_schemes = get_top_matching_schemes(query)
+            if not top_schemes:
+                return "⚠️ No schemes found. Please refine your profile."
+            chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
+            chat_state["stage"] = 4
+            query_logs_coll.insert_one({
+                "timestamp": datetime.utcnow(),
+                "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
+                "profile_summary": summary,
+                "query": query,
+                "top_schemes": top_schemes,
+                "selected_scheme": top_schemes[0]["scheme_name"]
+            })
+            schemes_text = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'], 4)})" for i, s in enumerate(top_schemes)])
+            return f"📈 Top Matches:\n{schemes_text}\n\nAsk: `eligibility`, `how to apply`, `documents needed`, etc."
+        return "📢 Type `show related schemes` to get scheme suggestions."
     if chat_state["stage"] == 4:
+        response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
+        return f"{response}\n\n💬 Try asking: `What are the eligibility criteria?`, `What documents are required?`, `How to apply?`"
+def translate_last_response():
+    if chat_state["last_bot_msg"]:
+        return "📄 Telugu Translation:\n\n" + translate_to_telugu(chat_state["last_bot_msg"], translator_tokenizer, translator_model)
+    return "⚠️ No message to translate."
+# === Gradio UI ===
+with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
+    chatbot_ui = gr.ChatInterface(
+        fn=chatbot,
+        title="🤖 MSME Scheme Assistant",
+        textbox=gr.Textbox(placeholder="Type your message here...")
+    )
+    with gr.Row():
+        translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
+        translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
+    translate_btn.click(fn=translate_last_response, outputs=translation_output)
 demo.launch()