Spaces:

Vipplav
/

MSME_Chat_bot

Runtime error

App Files Files Community

Vipplav committed on Jun 8, 2025

Commit

2622a38

verified ·

1 Parent(s): c394e1d

10 pras

Browse files

Files changed (1) hide show

app.py +97 -164

app.py CHANGED Viewed

@@ -1,16 +1,13 @@
-# === Imports ===
 from pymongo import MongoClient
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer, util
-from langchain_huggingface import HuggingFacePipeline
 from langchain_core.prompts import PromptTemplate
-from difflib import get_close_matches
 from datetime import datetime
 import torch, re
-import gradio as gr
-# === MongoDB Setup ===
 mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
 client = MongoClient(mongo_uri)
 db = client["msme_schemes_db"]
@@ -19,20 +16,34 @@ schemes_chunk_coll = db["schemes_chunks_only"]
 schemes_info_coll = db["schemes_embedded"]
 query_logs_coll = db["query_logs"]
-# === UID Utility ===
-def normalize_udyam(uid):
-    return uid.strip().upper().replace(" ", "")
-def is_valid_udyam(uid):
-    return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 def get_profile_by_uid(uid):
     uid = normalize_udyam(uid)
-    if not is_valid_udyam(uid):
-       return None, "❌ That doesn't look valid. Please enter again."
     return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
-# === Summary ===
 def summarize_profile(profile):
     return (
         f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
@@ -40,33 +51,12 @@ def summarize_profile(profile):
         f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
     )
-# === Prompt Template ===
-rephrase_template = PromptTemplate.from_template("""
-You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
-Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
-Only return the query. Avoid comments.
-Enterprise Profile:
-{profile_summary}
-""")
-# === Load LLM ===
-model_id = "google/gemma-2b-it"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
-generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
-llm = HuggingFacePipeline(pipeline=generator)
-# === Embedding Model ===
-embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
-# === Query Generation ===
 def generate_search_query(profile):
     summary = summarize_profile(profile)
     prompt = rephrase_template.format(profile_summary=summary)
     response = llm.invoke(prompt)
-    return response.strip().split("\n")[0].strip()
-# === Chunk Retrieval ===
 def get_top_matching_schemes(query_text, top_k=5):
     query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
     matches = []
@@ -75,7 +65,11 @@ def get_top_matching_schemes(query_text, top_k=5):
             if "embedding" in chunk and chunk["embedding"]:
                 chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
                 score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
-                matches.append({"score": score, "scheme_id": doc.get("scheme_id"), "scheme_name": doc.get("scheme_name")})
     seen, top_results = set(), []
     for m in sorted(matches, key=lambda x: x["score"], reverse=True):
         if m["scheme_id"] not in seen:
@@ -85,7 +79,6 @@ def get_top_matching_schemes(query_text, top_k=5):
             break
     return top_results
-# === Scheme Field with LLM Formatting ===
 def fetch_scheme_field_llm(scheme_id, field_input):
     field_map = {
         "eligibility": "eligibility_list",
@@ -94,140 +87,80 @@ def fetch_scheme_field_llm(scheme_id, field_input):
         "apply": "how_to_apply_list",
         "documents": "required_documents_list"
     }
-    # figure out which section they asked for
-    matched_field = next(
-        (v for k, v in field_map.items() if k in field_input.lower()),
-        None
-    )
     if not matched_field:
         return "❌ Try asking about eligibility, benefits, how to apply, or documents."
-    # fetch the scheme document
     doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
-    if not doc or matched_field not in doc:
-        return "⚠️ Couldn’t find that information for the selected scheme."
-    # take up to first 5 list items
-    raw_text = "\n".join(doc[matched_field][:5])
-    prompt = f"""
 Summarize the following information into a clear and professional explanation for business owners:
 Scheme: {doc['scheme_name']}
 Section: {matched_field.replace('_list','').title()}
 {raw_text}
-""".strip()
-    response = llm.invoke(prompt).strip()
-    section_title = matched_field.replace('_list','').replace('_',' ').title()
-    return f"📄 **{section_title} for {doc['scheme_name']}:**\n{response}"
-# === Chat‐driven UI ===
-def chat_fn(user_message, history, state):
-    """
-    history: list of (user, bot) tuples
-    state: dict carrying keys:
-      - 'step': which step we’re on
-      - 'profile': dict under construction
-      - 'schemes': top_k list once query generated
-      - 'current_scheme_id': when user picks one
-    """
-    if state is None:
-        state = {'step': 'start', 'profile': {}}
-    # STEP 1: greeting & ask for Udyam or manual
-    if state['step'] == 'start':
-        bot = "👋 Welcome! Please enter your Udyam Registration No, or type 'manual' to enter details yourself."
-        state['step'] = 'await_uid'
-        return history + [(user_message, bot)], state
-    # STEP 2: process Udyam or manual trigger
-    if state['step'] == 'await_uid':
-        text = user_message.strip()
-        if text.lower() == 'manual':
-            bot = "📝 Enter your Enterprise Name."
-            state['step'] = 'manual_name'
         else:
-            profile, err = get_profile_by_uid(text) if text else (None, None)
-            if err:
-                bot = err
-            else:
-                state['profile'] = profile
-                summary = summarize_profile(profile)
-                bot = f"✅ Found profile:\n{summary}\n\nType 'yes' to continue or 'edit' to re-enter."
-                state['step'] = 'confirm_profile'
-        return history + [(user_message, bot)], state
-    # STEP 3: Manual entry fields
-    if state['step'].startswith('manual_'):
-        field = state['step'].split('_')[1]
-        # map steps to prompts
-        prompts = {
-            'name':   ("Enterprise Name",    "🏷️ Enterprise size (Micro/Small/Medium)"),
-            'size':   ("Enterprise Size",     "🏛️ Organisation Type?"),
-            'org':    ("Organisation Type",   "🛠️ Major Activity?"),
-            'activity':("Major Activity",     "📍 State?"),
-            'state':  ("State",              "💰 Investment (₹)?"),
-            'invest':("Investment Cost (In Rs.)","📈 Annual Turnover (₹)?"),
-            'turnover':("Net Turnover (In Rs.)","👥 Number of employees?"),
-            'emps':   ("Employment",         None)
-        }
-        key, next_prompt = prompts[field]
-        # store the answer
-        state['profile'][key] = user_message.strip() if not key in ["Employment","Investment Cost (In Rs.)","Net Turnover (In Rs.)"] else int(user_message)
-        # advance step
-        next_steps = {
-            'name':'manual_size','size':'manual_org','org':'manual_activity',
-            'activity':'manual_state','state':'manual_invest','invest':'manual_turnover',
-            'turnover':'manual_emps','emps':'post_manual'
-        }
-        state['step'] = next_steps[field]
-        bot = next_prompt or ""
-        return history + [(user_message, bot)], state
-    # STEP 4: after manual collected
-    if state['step'] == 'post_manual':
-        summary = summarize_profile(state['profile'])
-        bot = f"✅ Got it. Profile summary:\n{summary}\n\nType 'yes' to continue."
-        state['step'] = 'confirm_profile'
-        return history + [(user_message, bot)], state
-    # STEP 5: confirm profile
-    if state['step'] == 'confirm_profile':
-        if user_message.strip().lower() == 'yes':
-            query = generate_search_query(state['profile'])
-            schemes = get_top_matching_schemes(query)
-            state['schemes'] = schemes
-            listing = "\n".join(f"{i+1}. {s['scheme_name']}" for i,s in enumerate(schemes))
-            bot = f"🔍 Search query: {query}\nTop Schemes:\n{listing}\n\nReply with the number to pick one."
-            state['step'] = 'pick_scheme'
-        else:
-            bot = "Type 'manual' to re-enter details or your Udyam again."
-            state['step'] = 'await_uid'
-        return history + [(user_message, bot)], state
-    # STEP 6: pick scheme
-    if state['step'] == 'pick_scheme':
-        idx = int(user_message.strip()) - 1
-        scheme = state['schemes'][idx]
-        state['current_scheme_id'] = scheme['scheme_id']
-        bot = f"🎯 You selected *{scheme['scheme_name']}*. Ask about eligibility, benefits, apply, or documents."
-        state['step'] = 'in_scheme'
-        return history + [(user_message, bot)], state
-    # STEP 7: within scheme
-    if state['step'] == 'in_scheme':
-        reply = fetch_scheme_field_llm(state['current_scheme_id'], user_message)
-        return history + [(user_message, reply)], state
-    # fallback
-    return history, state
-# Create the chat interface
-demo = gr.ChatInterface(
-    fn=chat_fn,
-    title="MSME Scheme Assistant",
-    description="All steps—profile, search, details—done via chat.",
-    theme="default",
-)
 demo.launch()

+import gradio as gr
 from pymongo import MongoClient
 from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 from sentence_transformers import SentenceTransformer, util
 from langchain_core.prompts import PromptTemplate
+from langchain_community.llms import HuggingFacePipeline
 from datetime import datetime
 import torch, re
+# === Setup ===
 mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
 client = MongoClient(mongo_uri)
 db = client["msme_schemes_db"]
 schemes_info_coll = db["schemes_embedded"]
 query_logs_coll = db["query_logs"]
+model_id = "google/gemma-2b-it"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+)
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
+llm = HuggingFacePipeline(pipeline=generator)
+embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
+rephrase_template = PromptTemplate.from_template("""
+You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
+Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
+Only return the query. Avoid comments.
+Enterprise Profile:
+{profile_summary}
+""")
+# === Utils ===
+def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
+def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 def get_profile_by_uid(uid):
     uid = normalize_udyam(uid)
+    if not is_valid_udyam(uid): return None
     return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
 def summarize_profile(profile):
     return (
         f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
         f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
     )
 def generate_search_query(profile):
     summary = summarize_profile(profile)
     prompt = rephrase_template.format(profile_summary=summary)
     response = llm.invoke(prompt)
+    return response.strip().split("\n")[0].strip(), summary
 def get_top_matching_schemes(query_text, top_k=5):
     query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
     matches = []
             if "embedding" in chunk and chunk["embedding"]:
                 chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
                 score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
+                matches.append({
+                    "score": score,
+                    "scheme_id": doc.get("scheme_id"),
+                    "scheme_name": doc.get("scheme_name")
+                })
     seen, top_results = set(), []
     for m in sorted(matches, key=lambda x: x["score"], reverse=True):
         if m["scheme_id"] not in seen:
             break
     return top_results
 def fetch_scheme_field_llm(scheme_id, field_input):
     field_map = {
         "eligibility": "eligibility_list",
         "apply": "how_to_apply_list",
         "documents": "required_documents_list"
     }
+    matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
     if not matched_field:
         return "❌ Try asking about eligibility, benefits, how to apply, or documents."
     doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
+    if doc and matched_field in doc:
+        raw_text = "\n".join(doc[matched_field][:5])
+        prompt = f"""
 Summarize the following information into a clear and professional explanation for business owners:
 Scheme: {doc['scheme_name']}
 Section: {matched_field.replace('_list','').title()}
 {raw_text}
+"""
+        return llm.invoke(prompt).strip()
+    return "⚠️ Couldn’t find that information for the selected scheme."
+# === Chatbot ===
+chat_state = {"stage": 0, "profile": {}, "scheme_id": None}
+def chatbot(msg, history):
+    if chat_state["stage"] == 0:
+        chat_state["stage"] = 1
+        return "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
+    if chat_state["stage"] == 1:
+        if msg.lower().startswith("udyam-"):
+            profile = get_profile_by_uid(msg)
+            if profile:
+                chat_state["profile"] = profile
+                chat_state["stage"] = 3
+                return "✅ Profile found! Generating recommendations..."
+            return "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
+        elif "manual" in msg.lower():
+            chat_state["stage"] = 2
+            return "📝 Great! What's your enterprise name?"
+        return "Please enter a valid Udyam ID or type 'manual'."
+    if chat_state["stage"] == 2:
+        steps = [
+            "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
+            "Major Activity", "State", "Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"
+        ]
+        curr_index = len(chat_state["profile"])
+        key = steps[curr_index]
+        if "Cost" in key or "Turnover" in key or "Employment" in key:
+            chat_state["profile"][key] = int(msg)
         else:
+            chat_state["profile"][key] = msg
+        if len(chat_state["profile"]) == len(steps):
+            chat_state["stage"] = 3
+            return "✅ Thanks! Now generating recommendations..."
+        return f"{steps[curr_index + 1]}?"
+    if chat_state["stage"] == 3:
+        query, summary = generate_search_query(chat_state["profile"])
+        top_schemes = get_top_matching_schemes(query)
+        if not top_schemes:
+            return "⚠️ No matching schemes found."
+        chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
+        query_logs_coll.insert_one({
+            "timestamp": datetime.utcnow(),
+            "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
+            "profile_summary": summary,
+            "query": query,
+            "top_schemes": top_schemes,
+            "selected_scheme": top_schemes[0]["scheme_name"]
+        })
+        names = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})" for i, s in enumerate(top_schemes)])
+        chat_state["stage"] = 4
+        return f"🔍 Based on your profile: {summary}\n\n📈 Recommended Schemes:\n{names}\n\nYou can now ask about this scheme using keywords like 'eligibility', 'apply', or 'documents'."
+    if chat_state["stage"] == 4:
+        return fetch_scheme_field_llm(chat_state["scheme_id"], msg)
+demo = gr.ChatInterface(fn=chatbot, title="🤖 MSME Chatbot Assistant", textbox=gr.Textbox(placeholder="Type your message here..."))
 demo.launch()