Spaces:

Vipplav
/

MSME_Chat_bot

Runtime error

App Files Files Community

Vipplav committed on Jun 8, 2025

Commit

73d9537

verified ·

1 Parent(s): 3032a83

++

Browse files

Files changed (2) hide show

app.py +247 -63
requirements.txt +9 -1

app.py CHANGED Viewed

@@ -1,64 +1,248 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-"""
-For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-"""
-client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-def respond(
-    message,
-    history: list[tuple[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-):
-    messages = [{"role": "system", "content": system_message}]
-    for val in history:
-        if val[0]:
-            messages.append({"role": "user", "content": val[0]})
-        if val[1]:
-            messages.append({"role": "assistant", "content": val[1]})
-    messages.append({"role": "user", "content": message})
-    response = ""
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        token = message.choices[0].delta.content
-        response += token
-        yield response
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-if __name__ == "__main__":
-    demo.launch()

+# === Imports ===
+import os
+import re
+from datetime import datetime
+from difflib import get_close_matches
+import torch
+from langchain_community.llms import HuggingFacePipeline
+from langchain_core.prompts import PromptTemplate
+from pymongo import MongoClient
+from sentence_transformers import SentenceTransformer, util
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 import gradio as gr
+# === MongoDB Setup ===
+# NOTE(security): the fallback URI below embeds a username and password in
+# source control. Set the MONGO_URI environment variable (for example as a
+# Space secret) to override it, then rotate the credential and delete the
+# fallback. The fallback is kept only so existing deployments keep working.
+mongo_uri = os.environ.get(
+    "MONGO_URI",
+    "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/",
+)
+client = MongoClient(mongo_uri)
+db = client["msme_schemes_db"]
+# Collections used by the rest of this module.
+udyam_coll = db["udyam_profiles"]            # looked up by "Udyam_ID"
+schemes_chunk_coll = db["schemes_chunks_only"]  # documents carrying "rag_chunks"
+schemes_info_coll = db["schemes_embedded"]    # per-scheme detail lists
+query_logs_coll = db["query_logs"]
+# === UID Utility ===
+def normalize_udyam(uid):
+    """Return *uid* trimmed, upper-cased and with every space character removed."""
+    trimmed = uid.strip()
+    upper = trimmed.upper()
+    return upper.replace(" ", "")
+def is_valid_udyam(uid):
+    """Return True when *uid* is a well-formed Udyam Registration Number.
+
+    The accepted shape is ``UDYAM-<2 capital letters>-<2 digits>-<6 or 7 digits>``.
+    The whole string must match: ``re.fullmatch`` is used because ``re.match``
+    with a ``$`` anchor also accepts a value that ends in a newline.
+    Anything that is not a string is reported as invalid instead of raising.
+    """
+    if not isinstance(uid, str):
+        return False
+    return re.fullmatch(r"UDYAM-[A-Z]{2}-\d{2}-\d{6,7}", uid) is not None
+def get_profile_by_uid(uid):
+    """Fetch the stored Udyam profile for *uid*.
+
+    The id is normalised with ``normalize_udyam`` and checked with
+    ``is_valid_udyam`` before the database is queried.
+
+    Returns the matching document from ``udyam_coll`` with ``_id`` projected
+    out, or ``None`` when the id is malformed or ``find_one`` has no match.
+    """
+    uid = normalize_udyam(uid)
+    if not is_valid_udyam(uid):
+        # This file never defines or imports a `console` object, so the previous
+        # console.print(...) call raised NameError on every invalid id. Plain
+        # print keeps the same message on stdout without a new dependency.
+        print("\n❌ That doesn't look like a valid Udyam Registration Number. Please double-check.")
+        return None
+    return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
+# === Summary ===
+def summarize_profile(profile):
+    """Describe an enterprise profile in three English sentences.
+
+    *profile* is a mapping that may contain the keys ``Enterprise Name``,
+    ``State``, ``Major Activity``, ``Gender``, ``Enterprise Type``,
+    ``Organisation Type``, ``Employment``, ``Investment Cost (In Rs.)`` and
+    ``Net Turnover (In Rs.)``. A missing, ``None`` or empty field is rendered
+    as "not specified" instead of raising ``KeyError``, so a partially
+    filled profile can still be summarised.
+
+    Returns the summary string; for a complete numeric profile the text is
+    the same as before this change.
+    """
+    def _text(key):
+        # Plain fields: show the value, or a placeholder when it is absent.
+        value = profile.get(key)
+        if value is None or value == "":
+            return "not specified"
+        return str(value)
+    def _rupees(key):
+        # Money fields: thousands separators for numbers, raw text otherwise.
+        value = profile.get(key)
+        if value is None or value == "":
+            return "not specified"
+        try:
+            return f"₹{value:,}"
+        except (TypeError, ValueError):
+            # The "," format spec is rejected by str and other non-numeric types.
+            return f"₹{value}"
+    name = _text("Enterprise Name")
+    state = _text("State")
+    activity = _text("Major Activity")
+    gender = _text("Gender")
+    size = _text("Enterprise Type")
+    org = _text("Organisation Type").lower()
+    employees = _text("Employment")
+    investment = _rupees("Investment Cost (In Rs.)")
+    turnover = _rupees("Net Turnover (In Rs.)")
+    return (
+        f"The user represents an enterprise named '{name}', based in {state}, operating in the {activity} sector. "
+        f"They identify as {gender}, run a {size} sized {org} organization. The enterprise has "
+        f"{employees} employees, with an investment of {investment} and a turnover of {turnover}."
+    )
+# === Prompt Template ===
+# Prompt used by generate_search_query: it is filled with a single
+# `profile_summary` variable and instructs the model to answer with one short
+# search query and nothing else.
+rephrase_template = PromptTemplate.from_template("""
+You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
+Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
+Only return the query. Avoid comments.
+Enterprise Profile:
+{profile_summary}
+""")
+# === Load LLM ===
+# Everything below runs at import time: the tokenizer, the causal LM and the
+# embedding model are all loaded before the UI is built.
+model_id = "google/gemma-2b-it"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+# Weights are loaded as float16 when CUDA is available and float32 otherwise;
+# device placement is delegated to device_map="auto".
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
+# Sampling is disabled (do_sample=False) and each call generates at most 128 new tokens.
+generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
+# LangChain wrapper so the rest of the module can call llm.invoke(prompt).
+llm = HuggingFacePipeline(pipeline=generator)
+# === Embedding Model ===
+# Sentence encoder used to embed search queries; placed on CUDA when available, else CPU.
+# NOTE(review): presumably the stored chunk embeddings were produced with this same model - confirm.
+embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
+# === Query Generation ===
+def generate_search_query(profile):
+    """Ask the LLM for a one-line scheme search query describing *profile*.
+
+    The profile is summarised, inserted into ``rephrase_template`` and sent to
+    ``llm``; only the first line of the stripped reply is returned.
+    """
+    profile_summary = summarize_profile(profile)
+    llm_output = llm.invoke(rephrase_template.format(profile_summary=profile_summary))
+    first_line = llm_output.strip().split("\n")[0]
+    return first_line.strip()
+# === Chunk Retrieval ===
+def get_top_matching_schemes(query_text, top_k=5):
+    """Rank schemes by cosine similarity between *query_text* and stored chunk embeddings.
+
+    Every document in ``schemes_chunk_coll`` that has a ``rag_chunks`` field is
+    scanned; each chunk with a non-empty ``embedding`` is scored against the
+    query embedding. Results are de-duplicated by ``scheme_id``, keeping the
+    best-scoring chunk of each scheme.
+
+    Returns a list of at most *top_k* dicts with keys ``score``,
+    ``scheme_id`` and ``scheme_name``, ordered by descending score. A
+    non-positive *top_k* yields an empty list.
+    """
+    if top_k <= 0:
+        # The cut-off used to be tested only after an append, so a limit of 0
+        # (or less) was never reached and every scheme was returned.
+        return []
+    query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
+    matches = []
+    for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
+        scheme_id = doc.get("scheme_id")
+        scheme_name = doc.get("scheme_name")
+        # "$exists" also matches a field stored as null, hence the `or []`.
+        for chunk in doc["rag_chunks"] or []:
+            embedding = chunk.get("embedding")
+            if not embedding:
+                continue
+            # Build the tensor directly with the query's dtype and device so the
+            # two operands of cos_sim are always compatible.
+            chunk_tensor = torch.tensor(
+                embedding,
+                dtype=query_embedding.dtype,
+                device=query_embedding.device,
+            )
+            score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
+            matches.append({"score": score, "scheme_id": scheme_id, "scheme_name": scheme_name})
+    top_results, seen = [], set()
+    # sorted() is stable, so equally scored chunks keep their scan order.
+    for match in sorted(matches, key=lambda m: m["score"], reverse=True):
+        if match["scheme_id"] in seen:
+            continue
+        seen.add(match["scheme_id"])
+        top_results.append(match)
+        if len(top_results) >= top_k:
+            break
+    return top_results
+# === Scheme Field with LLM Formatting ===
+def fetch_scheme_field_llm(scheme_id, field_input):
+    """Answer a question about one section of a scheme, summarised by the LLM.
+
+    *field_input* is free text; the first keyword of ``field_map`` found in it
+    (case-insensitively) selects the section. "docs" is accepted as an alias
+    of "documents". Up to the first five items of the section are sent to
+    ``llm`` for summarising.
+
+    Returns a markdown-formatted answer, or a short message when no keyword
+    is recognised or the scheme / section has no data.
+    """
+    field_map = {
+        "eligibility": "eligibility_list",
+        "benefits": "key_benefits_list",
+        "assistance": "assistance_list",
+        "apply": "how_to_apply_list",
+        "documents": "required_documents_list",
+        # short form that users are likely to type; "documents" does not contain it
+        "docs": "required_documents_list",
+    }
+    # figure out which section they asked for
+    wanted = (field_input or "").lower()
+    matched_field = next(
+        (field for keyword, field in field_map.items() if keyword in wanted),
+        None,
+    )
+    if not matched_field:
+        return "❌ Try asking about eligibility, benefits, how to apply, or documents."
+    # fetch the scheme document
+    doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
+    if not doc or matched_field not in doc:
+        return "⚠️ Couldn’t find that information for the selected scheme."
+    items = doc[matched_field]
+    if isinstance(items, str):
+        # A bare string would otherwise be sliced into its first five characters.
+        items = [items]
+    # take up to first 5 list items
+    lines = [str(item) for item in (items or [])[:5]]
+    if not lines:
+        return "⚠️ Couldn’t find that information for the selected scheme."
+    raw_text = "\n".join(lines)
+    scheme_name = doc.get("scheme_name", scheme_id)
+    # One human-readable title for both the prompt and the reply; the prompt
+    # previously kept the underscores of the field name (e.g. "Key_Benefits").
+    section_title = matched_field.replace('_list', '').replace('_', ' ').title()
+    prompt = f"""
+Summarize the following information into a clear and professional explanation for business owners:
+Scheme: {scheme_name}
+Section: {section_title}
+{raw_text}
+""".strip()
+    response = llm.invoke(prompt).strip()
+    return f"📄 **{section_title} for {scheme_name}:**\n{response}"
+# === Gradio UI ===
+def start_session(udyam_id):
+    """Handle the "Start" button: look up a profile and suggest schemes.
+
+    *udyam_id* is the text typed by the user; blank or whitespace-only text
+    means the user wants to enter the details manually.
+
+    Returns ``(history, state)``:
+      - ``history`` is a list of ``(user_message, bot_message)`` pairs for the
+        Chatbot; ``None`` as the user message shows a bot-only bubble.
+      - ``state`` is a dict with ``profile`` and ``schemes`` on success and an
+        empty dict otherwise.
+    """
+    # Strip first so "   " is handled like an empty box, matching the check
+    # that decides whether the manual form is shown.
+    udyam_id = (udyam_id or "").strip()
+    if not udyam_id:
+        # empty => we’ll ask for manual details next
+        return [(None, "📝 Please fill in your enterprise details below.")], {}
+    profile = get_profile_by_uid(udyam_id)
+    if profile is None:
+        # invalid ID, or no stored profile for it
+        return [(udyam_id, "❌ That doesn't look valid. Please correct or go manual.")], {}
+    # valid profile fetched
+    summary = summarize_profile(profile)
+    query = generate_search_query(profile)
+    schemes = get_top_matching_schemes(query)
+    scheme_lines = "\n".join(
+        f"{i}. {s['scheme_name']} (score {s['score']:.3f})"
+        for i, s in enumerate(schemes, start=1)
+    )
+    response = (
+        f"✅ Profile OK:\n{summary}\n\n"
+        f"🔍 Query: {query}\n\n"
+        "📈 Top schemes:\n" + scheme_lines
+    )
+    # store for later steps
+    state = {"profile": profile, "schemes": schemes}
+    # One (user, bot) pair; role names such as "user"/"assistant" must not be
+    # used as tuple members because the Chatbot would display them as messages.
+    return [(udyam_id, response)], state
+def handle_manual(enterprise_name, gender, ent_type, org_type, activity, state):
+    """Handle the manual-profile form: build a profile and suggest schemes.
+
+    The profile contains only the five fields collected by the form
+    (``Enterprise Name``, ``Gender``, ``Enterprise Type``,
+    ``Organisation Type``, ``Major Activity``).
+
+    Returns ``(history, state)`` where ``history`` is a one-element list holding
+    a ``(None, bot_message)`` pair for the Chatbot and ``state`` is the session
+    dict updated with ``profile`` and ``schemes``.
+    """
+    profile = {
+        "Enterprise Name": enterprise_name,
+        "Gender": gender,
+        "Enterprise Type": ent_type,
+        "Organisation Type": org_type,
+        "Major Activity": activity,
+        # …you can add investment, turnover, employment later
+    }
+    summary = summarize_profile(profile)
+    query = generate_search_query(profile)
+    schemes = get_top_matching_schemes(query)
+    scheme_lines = "\n".join(
+        f"{i}. {s['scheme_name']} (score {s['score']:.3f})"
+        for i, s in enumerate(schemes, start=1)
+    )
+    response = (
+        f"✅ Profile recorded:\n{summary}\n\n"
+        f"🔍 Query: {query}\n\n"
+        "📈 Top schemes:\n" + scheme_lines
+    )
+    # Tolerate a missing session dict instead of failing on item assignment.
+    state = {} if state is None else state
+    state["profile"] = profile
+    state["schemes"] = schemes
+    # The Chatbot expects (user_message, bot_message) pairs; None leaves the
+    # user side empty. A role name in the first slot would be shown as text.
+    return [(None, response)], state
+def chat_with_scheme(message, state):
+    """Handle a chat message: select a scheme by number or answer about the current one.
+
+    - If *message* is the 1-based number of a scheme in ``state["schemes"]``,
+      that scheme becomes ``state["current_scheme_id"]``.
+    - Otherwise, if a scheme is already selected, the message is passed to
+      ``fetch_scheme_field_llm`` as a section question.
+    - Otherwise the user is asked to pick a scheme number first.
+
+    Always returns ``(history, state)``; ``history`` is a one-element list
+    holding a ``(message, reply)`` pair for the Chatbot.
+    """
+    state = {} if state is None else state
+    key = (message or "").strip()
+    # .get: the session may not have produced any schemes yet.
+    schemes = state.get("schemes") or []
+    scheme_map = {str(i): s for i, s in enumerate(schemes, start=1)}
+    if key in scheme_map:
+        selected = scheme_map[key]
+        sid = selected["scheme_id"]
+        state["current_scheme_id"] = sid
+        doc = schemes_info_coll.find_one({"scheme_id": sid})
+        # Fall back to the name already held in the session when the detail
+        # collection has no document (or no name) for this scheme.
+        if doc and "scheme_name" in doc:
+            title = doc["scheme_name"]
+        else:
+            title = selected.get("scheme_name")
+        # The hint lists "documents" because that is the keyword the section
+        # lookup is written to recognise.
+        reply = f"🎯 *{title}* selected.  What would you like to know? (eligibility, benefits, apply, documents)"
+    elif "current_scheme_id" in state:
+        # interpret as field query
+        reply = fetch_scheme_field_llm(state["current_scheme_id"], message)
+    else:
+        reply = "❓ Please pick a scheme number first."
+    # Every branch returns both outputs; the selection branch used to return
+    # the history alone, which does not match the two wired outputs.
+    return [(message, reply)], state
+# UI layout and event wiring. `chatbot` is only ever an output, so each
+# handler's returned history replaces what the Chatbot currently shows.
+with gr.Blocks() as demo:
+    gr.Markdown("# 🚀 MSME Scheme Assistant")
+    # Step 1: Udyam ID or Manual
+    udyam_in = gr.Textbox(label="Enter your Udyam ID (or leave blank for manual)")
+    start_btn = gr.Button("Start")
+    chatbot   = gr.Chatbot()
+    state     = gr.State({})  # will hold profile, schemes, etc.
+    # Step 2a: Manual fields (initially hidden)
+    with gr.Row(visible=False) as manual_row:
+        ent_name = gr.Textbox(label="Enterprise Name")
+        gender   = gr.Radio(["Male","Female","Other"], label="Gender")
+        ent_type = gr.Dropdown(["Micro","Small","Medium"], label="Enterprise Size")
+        org_type = gr.Dropdown(["Proprietorship","LLP","Private Ltd."], label="Organisation Type")
+        activity = gr.Textbox(label="Major Activity (Manufacturing/Services)")
+        manual_btn = gr.Button("Submit Manual Profile")
+    # wiring:
+    start_btn.click(fn=start_session,
+                    inputs=[udyam_in],
+                    outputs=[chatbot, state])
+    def should_show_manual(udyam_id):
+        """Show the manual-entry row only when the Udyam box is blank."""
+        # `or ""` keeps this safe if the component ever delivers None.
+        return gr.update(visible=not (udyam_id or "").strip())
+    # Second listener on the same button: toggles the manual form.
+    start_btn.click(
+        fn=should_show_manual,
+        inputs=[udyam_in],
+        outputs=[manual_row]
+    )
+    manual_btn.click(fn=handle_manual,
+                     inputs=[ent_name, gender, ent_type, org_type, activity, state],
+                     outputs=[chatbot, state])
+    # Finally, let them chat about schemes
+    msg = gr.Textbox(placeholder="Ask about schemes…")
+    msg.submit(fn=chat_with_scheme,
+               inputs=[msg, state],
+               outputs=[chatbot, state])
+# Launch only when run as a script so importing this module (tests, tooling)
+# does not start a web server.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

	@@ -1 +1,9 @@
1	- huggingface_hub==0.25.2

+huggingface_hub==0.25.2
+pymongo>=4.3.0
+transformers>=4.35.0
+sentence-transformers>=2.2.2
+torch>=2.0.1
+langchain>=0.0.200
+langchain-community>=0.0.30
+gradio>=3.44.0
+accelerate