Vipplav committed on
Commit
2622a38
·
verified ·
1 Parent(s): c394e1d
Files changed (1) hide show
  1. app.py +97 -164
app.py CHANGED
@@ -1,16 +1,13 @@
1
- # === Imports ===
2
  from pymongo import MongoClient
3
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  from sentence_transformers import SentenceTransformer, util
5
- from langchain_huggingface import HuggingFacePipeline
6
  from langchain_core.prompts import PromptTemplate
7
- from difflib import get_close_matches
8
  from datetime import datetime
9
  import torch, re
10
- import gradio as gr
11
-
12
 
13
- # === MongoDB Setup ===
14
  mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
15
  client = MongoClient(mongo_uri)
16
  db = client["msme_schemes_db"]
@@ -19,20 +16,34 @@ schemes_chunk_coll = db["schemes_chunks_only"]
19
  schemes_info_coll = db["schemes_embedded"]
20
  query_logs_coll = db["query_logs"]
21
 
22
- # === UID Utility ===
23
- def normalize_udyam(uid):
24
- return uid.strip().upper().replace(" ", "")
 
 
 
 
 
 
 
25
 
26
- def is_valid_udyam(uid):
27
- return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 
 
 
 
 
 
 
 
 
28
 
29
  def get_profile_by_uid(uid):
30
  uid = normalize_udyam(uid)
31
- if not is_valid_udyam(uid):
32
- return None, "❌ That doesn't look valid. Please enter again."
33
  return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
34
 
35
- # === Summary ===
36
  def summarize_profile(profile):
37
  return (
38
  f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
@@ -40,33 +51,12 @@ def summarize_profile(profile):
40
  f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
41
  )
42
 
43
- # === Prompt Template ===
44
- rephrase_template = PromptTemplate.from_template("""
45
- You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
46
- Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
47
- Only return the query. Avoid comments.
48
- Enterprise Profile:
49
- {profile_summary}
50
- """)
51
-
52
- # === Load LLM ===
53
- model_id = "google/gemma-2b-it"
54
- tokenizer = AutoTokenizer.from_pretrained(model_id)
55
- model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
56
- generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
57
- llm = HuggingFacePipeline(pipeline=generator)
58
-
59
- # === Embedding Model ===
60
- embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
61
-
62
- # === Query Generation ===
63
  def generate_search_query(profile):
64
  summary = summarize_profile(profile)
65
  prompt = rephrase_template.format(profile_summary=summary)
66
  response = llm.invoke(prompt)
67
- return response.strip().split("\n")[0].strip()
68
 
69
- # === Chunk Retrieval ===
70
  def get_top_matching_schemes(query_text, top_k=5):
71
  query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
72
  matches = []
@@ -75,7 +65,11 @@ def get_top_matching_schemes(query_text, top_k=5):
75
  if "embedding" in chunk and chunk["embedding"]:
76
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
77
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
78
- matches.append({"score": score, "scheme_id": doc.get("scheme_id"), "scheme_name": doc.get("scheme_name")})
 
 
 
 
79
  seen, top_results = set(), []
80
  for m in sorted(matches, key=lambda x: x["score"], reverse=True):
81
  if m["scheme_id"] not in seen:
@@ -85,7 +79,6 @@ def get_top_matching_schemes(query_text, top_k=5):
85
  break
86
  return top_results
87
 
88
- # === Scheme Field with LLM Formatting ===
89
  def fetch_scheme_field_llm(scheme_id, field_input):
90
  field_map = {
91
  "eligibility": "eligibility_list",
@@ -94,140 +87,80 @@ def fetch_scheme_field_llm(scheme_id, field_input):
94
  "apply": "how_to_apply_list",
95
  "documents": "required_documents_list"
96
  }
97
- # figure out which section they asked for
98
- matched_field = next(
99
- (v for k, v in field_map.items() if k in field_input.lower()),
100
- None
101
- )
102
  if not matched_field:
103
  return "❌ Try asking about eligibility, benefits, how to apply, or documents."
104
-
105
- # fetch the scheme document
106
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
107
- if not doc or matched_field not in doc:
108
- return "⚠️ Couldn’t find that information for the selected scheme."
109
-
110
- # take up to first 5 list items
111
- raw_text = "\n".join(doc[matched_field][:5])
112
- prompt = f"""
113
  Summarize the following information into a clear and professional explanation for business owners:
114
 
115
  Scheme: {doc['scheme_name']}
116
  Section: {matched_field.replace('_list','').title()}
117
 
118
  {raw_text}
119
- """.strip()
120
-
121
- response = llm.invoke(prompt).strip()
122
- section_title = matched_field.replace('_list','').replace('_',' ').title()
123
- return f"📄 **{section_title} for {doc['scheme_name']}:**\n{response}"
124
-
125
- # === Chat‐driven UI ===
126
- def chat_fn(user_message, history, state):
127
- """
128
- history: list of (user, bot) tuples
129
- state: dict carrying keys:
130
- - 'step': which step we’re on
131
- - 'profile': dict under construction
132
- - 'schemes': top_k list once query generated
133
- - 'current_scheme_id': when user picks one
134
- """
135
- if state is None:
136
- state = {'step': 'start', 'profile': {}}
137
-
138
- # STEP 1: greeting & ask for Udyam or manual
139
- if state['step'] == 'start':
140
- bot = "👋 Welcome! Please enter your Udyam Registration No, or type 'manual' to enter details yourself."
141
- state['step'] = 'await_uid'
142
- return history + [(user_message, bot)], state
143
-
144
- # STEP 2: process Udyam or manual trigger
145
- if state['step'] == 'await_uid':
146
- text = user_message.strip()
147
- if text.lower() == 'manual':
148
- bot = "📝 Enter your Enterprise Name."
149
- state['step'] = 'manual_name'
 
 
 
150
  else:
151
- profile, err = get_profile_by_uid(text) if text else (None, None)
152
- if err:
153
- bot = err
154
- else:
155
- state['profile'] = profile
156
- summary = summarize_profile(profile)
157
- bot = f" Found profile:\n{summary}\n\nType 'yes' to continue or 'edit' to re-enter."
158
- state['step'] = 'confirm_profile'
159
- return history + [(user_message, bot)], state
160
-
161
- # STEP 3: Manual entry fields
162
- if state['step'].startswith('manual_'):
163
- field = state['step'].split('_')[1]
164
- # map steps to prompts
165
- prompts = {
166
- 'name': ("Enterprise Name", "🏷️ Enterprise size (Micro/Small/Medium)"),
167
- 'size': ("Enterprise Size", "🏛️ Organisation Type?"),
168
- 'org': ("Organisation Type", "🛠️ Major Activity?"),
169
- 'activity':("Major Activity", "📍 State?"),
170
- 'state': ("State", "💰 Investment (₹)?"),
171
- 'invest':("Investment Cost (In Rs.)","📈 Annual Turnover ()?"),
172
- 'turnover':("Net Turnover (In Rs.)","👥 Number of employees?"),
173
- 'emps': ("Employment", None)
174
- }
175
- key, next_prompt = prompts[field]
176
- # store the answer
177
- state['profile'][key] = user_message.strip() if not key in ["Employment","Investment Cost (In Rs.)","Net Turnover (In Rs.)"] else int(user_message)
178
- # advance step
179
- next_steps = {
180
- 'name':'manual_size','size':'manual_org','org':'manual_activity',
181
- 'activity':'manual_state','state':'manual_invest','invest':'manual_turnover',
182
- 'turnover':'manual_emps','emps':'post_manual'
183
- }
184
- state['step'] = next_steps[field]
185
- bot = next_prompt or ""
186
- return history + [(user_message, bot)], state
187
-
188
- # STEP 4: after manual collected
189
- if state['step'] == 'post_manual':
190
- summary = summarize_profile(state['profile'])
191
- bot = f"✅ Got it. Profile summary:\n{summary}\n\nType 'yes' to continue."
192
- state['step'] = 'confirm_profile'
193
- return history + [(user_message, bot)], state
194
-
195
- # STEP 5: confirm profile
196
- if state['step'] == 'confirm_profile':
197
- if user_message.strip().lower() == 'yes':
198
- query = generate_search_query(state['profile'])
199
- schemes = get_top_matching_schemes(query)
200
- state['schemes'] = schemes
201
- listing = "\n".join(f"{i+1}. {s['scheme_name']}" for i,s in enumerate(schemes))
202
- bot = f"🔍 Search query: {query}\nTop Schemes:\n{listing}\n\nReply with the number to pick one."
203
- state['step'] = 'pick_scheme'
204
- else:
205
- bot = "Type 'manual' to re-enter details or your Udyam again."
206
- state['step'] = 'await_uid'
207
- return history + [(user_message, bot)], state
208
-
209
- # STEP 6: pick scheme
210
- if state['step'] == 'pick_scheme':
211
- idx = int(user_message.strip()) - 1
212
- scheme = state['schemes'][idx]
213
- state['current_scheme_id'] = scheme['scheme_id']
214
- bot = f"🎯 You selected *{scheme['scheme_name']}*. Ask about eligibility, benefits, apply, or documents."
215
- state['step'] = 'in_scheme'
216
- return history + [(user_message, bot)], state
217
-
218
- # STEP 7: within scheme
219
- if state['step'] == 'in_scheme':
220
- reply = fetch_scheme_field_llm(state['current_scheme_id'], user_message)
221
- return history + [(user_message, reply)], state
222
-
223
- # fallback
224
- return history, state
225
-
226
- # Create the chat interface
227
- demo = gr.ChatInterface(
228
- fn=chat_fn,
229
- title="MSME Scheme Assistant",
230
- description="All steps—profile, search, details—done via chat.",
231
- theme="default",
232
- )
233
  demo.launch()
 
1
+ import gradio as gr
2
  from pymongo import MongoClient
3
  from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
4
  from sentence_transformers import SentenceTransformer, util
 
5
  from langchain_core.prompts import PromptTemplate
6
+ from langchain_community.llms import HuggingFacePipeline
7
  from datetime import datetime
8
  import torch, re
 
 
9
 
10
+ # === Setup ===
11
import os  # needed only for the connection-string lookup below

# SECURITY: the connection string embeds a database username and password.
# Prefer supplying it through the MONGO_URI environment variable (e.g. a
# deployment secret). The literal below is kept only so existing deployments
# keep working; the embedded credential has been published in source control
# and should be rotated, after which this fallback should be deleted.
mongo_uri = os.environ.get("MONGO_URI") or "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
client = MongoClient(mongo_uri)
db = client["msme_schemes_db"]
 
16
  schemes_info_coll = db["schemes_embedded"]
17
  query_logs_coll = db["query_logs"]
18
 
19
# Text-generation model used for query rephrasing and scheme summaries.
model_id = "google/gemma-2b-it"
_cuda_available = torch.cuda.is_available()

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # Half precision only when a GPU is present; full precision otherwise.
    torch_dtype=torch.float16 if _cuda_available else torch.float32,
)
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=False,
)
llm = HuggingFacePipeline(pipeline=generator)

# Sentence-embedding model used to score scheme chunks against the query.
embed_model = SentenceTransformer(
    "BAAI/bge-small-en-v1.5",
    device="cuda" if _cuda_available else "cpu",
)

# Prompt that turns a profile summary into a one-line search query.
rephrase_template = PromptTemplate.from_template("""
You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
Only return the query. Avoid comments.
Enterprise Profile:
{profile_summary}
""")
37
+
38
+ # === Utils ===
39
def normalize_udyam(uid):
    """Return *uid* upper-cased with every whitespace character removed.

    Besides plain spaces this also drops tabs, newlines and non-breaking
    spaces, which commonly sneak in when an ID is copy-pasted and would
    otherwise make a well-formed ID fail validation.
    """
    # str.split() with no argument splits on any run of whitespace.
    return "".join(uid.split()).upper()
40
def is_valid_udyam(uid):
    """Return True when *uid* has the shape ``UDYAM-XX-00-000000[0]``.

    The whole string must match: ``re.fullmatch`` is used because ``$`` in
    ``re.match`` would also accept a trailing newline. Digits are restricted
    to ASCII ``0-9`` (``\\d`` would accept other Unicode digits as well).
    Callers are expected to pass an already normalized (upper-case) value.
    """
    return re.fullmatch(r"UDYAM-[A-Z]{2}-[0-9]{2}-[0-9]{6,7}", uid) is not None
41
 
42
def get_profile_by_uid(uid):
    """Fetch the enterprise profile stored under a Udyam ID.

    The ID is normalized first. When it does not pass the format check the
    database is not queried and None is returned; otherwise the result of
    the ``find_one`` lookup on ``Udyam_ID`` (with ``_id`` excluded from the
    projection) is returned as-is.
    """
    normalized = normalize_udyam(uid)
    if is_valid_udyam(normalized):
        return udyam_coll.find_one({"Udyam_ID": normalized}, {"_id": 0})
    return None
46
 
 
47
  def summarize_profile(profile):
48
  return (
49
  f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
 
51
  f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
52
  )
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
def generate_search_query(profile):
    """Ask the LLM for a one-line scheme search query for *profile*.

    Returns a ``(query, summary)`` tuple: the first line of the stripped
    model output (itself stripped), and the profile summary that was
    placed into the prompt.
    """
    profile_summary = summarize_profile(profile)
    llm_output = llm.invoke(rephrase_template.format(profile_summary=profile_summary))
    # Keep only the first line of the model output.
    first_line, _, _ = llm_output.strip().partition("\n")
    return first_line.strip(), profile_summary
59
 
 
60
  def get_top_matching_schemes(query_text, top_k=5):
61
  query_embedding = embed_model.encode(query_text, convert_to_tensor=True)
62
  matches = []
 
65
  if "embedding" in chunk and chunk["embedding"]:
66
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
67
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
68
+ matches.append({
69
+ "score": score,
70
+ "scheme_id": doc.get("scheme_id"),
71
+ "scheme_name": doc.get("scheme_name")
72
+ })
73
  seen, top_results = set(), []
74
  for m in sorted(matches, key=lambda x: x["score"], reverse=True):
75
  if m["scheme_id"] not in seen:
 
79
  break
80
  return top_results
81
 
 
82
  def fetch_scheme_field_llm(scheme_id, field_input):
83
  field_map = {
84
  "eligibility": "eligibility_list",
 
87
  "apply": "how_to_apply_list",
88
  "documents": "required_documents_list"
89
  }
90
+ matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
 
 
 
 
91
  if not matched_field:
92
  return "❌ Try asking about eligibility, benefits, how to apply, or documents."
 
 
93
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
94
+ if doc and matched_field in doc:
95
+ raw_text = "\n".join(doc[matched_field][:5])
96
+ prompt = f"""
 
 
 
97
  Summarize the following information into a clear and professional explanation for business owners:
98
 
99
  Scheme: {doc['scheme_name']}
100
  Section: {matched_field.replace('_list','').title()}
101
 
102
  {raw_text}
103
+ """
104
+ return llm.invoke(prompt).strip()
105
+ return "⚠️ Couldn’t find that information for the selected scheme."
106
+
107
+ # === Chatbot ===
108
# Conversation state. NOTE(review): this dict is module-level, so it is shared
# by every caller of chatbot() in this process and is never reset; concurrent
# conversations would overwrite each other's progress. Per-session state would
# need a change to how the handler is wired up, which is out of scope here.
# Stages: 0 greeting, 1 awaiting Udyam ID / 'manual', 2 manual entry,
# 3 recommendations pending, 4 answering questions about the chosen scheme.
chat_state = {"stage": 0, "profile": {}, "scheme_id": None}

# Profile fields collected, in this order, during manual entry (stage 2).
_MANUAL_STEPS = (
    "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
    "Major Activity", "State", "Investment Cost (In Rs.)",
    "Net Turnover (In Rs.)", "Employment",
)
# Fields that are stored as integers.
_NUMERIC_STEPS = frozenset(
    {"Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"}
)


def _parse_whole_number(text):
    """Return *text* as an int (commas and a rupee sign are ignored), or None."""
    cleaned = text.replace(",", "").replace("₹", "").strip()
    try:
        return int(cleaned)
    except ValueError:
        return None


def _recommend_schemes():
    """Build, log and format recommendations for the profile in chat_state.

    On success the top-ranked scheme becomes the current scheme and the
    conversation moves to stage 4. When nothing matches, the stage is left
    unchanged so the next message retries.
    """
    query, summary = generate_search_query(chat_state["profile"])
    top_schemes = get_top_matching_schemes(query)
    if not top_schemes:
        return "⚠️ No matching schemes found."

    best = top_schemes[0]
    chat_state["scheme_id"] = best["scheme_id"]
    query_logs_coll.insert_one({
        "timestamp": datetime.utcnow(),
        "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
        "profile_summary": summary,
        "query": query,
        "top_schemes": top_schemes,
        "selected_scheme": best["scheme_name"],
    })
    names = "\n".join(
        f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})"
        for i, s in enumerate(top_schemes)
    )
    chat_state["stage"] = 4
    return f"🔍 Based on your profile: {summary}\n\n📈 Recommended Schemes:\n{names}\n\nYou can now ask about this scheme using keywords like 'eligibility', 'apply', or 'documents'."


def chatbot(msg, history):
    """Handle one chat turn and return the reply text.

    Args:
        msg: The user's message.
        history: Prior turns supplied by the chat UI; not used, progress is
            tracked in ``chat_state`` instead.

    Returns:
        The reply string for the current stage of the conversation.

    Compared with the earlier version, numeric answers that cannot be parsed
    are re-prompted instead of raising ``ValueError``, and recommendations
    are produced in the same turn that announces them rather than only after
    one more user message.
    """
    text = msg.strip()
    stage = chat_state["stage"]

    if stage == 0:
        chat_state["stage"] = 1
        return "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."

    if stage == 1:
        # Ignore surrounding/internal whitespace when detecting an ID.
        compact = "".join(text.split()).lower()
        if compact.startswith("udyam-"):
            profile = get_profile_by_uid(text)
            if profile:
                chat_state["profile"] = profile
                # Set the stage first so a failure below is retried next turn.
                chat_state["stage"] = 3
                return "✅ Profile found! Generating recommendations...\n\n" + _recommend_schemes()
            return "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
        if "manual" in text.lower():
            chat_state["stage"] = 2
            return "📝 Great! What's your enterprise name?"
        return "Please enter a valid Udyam ID or type 'manual'."

    if stage == 2:
        profile = chat_state["profile"]
        # The number of answers stored so far tells us which field is next.
        curr_index = len(profile)
        key = _MANUAL_STEPS[curr_index]
        if not text:
            return f"Please enter a value for {key}."
        if key in _NUMERIC_STEPS:
            value = _parse_whole_number(text)
            if value is None:
                return f"⚠️ Please enter {key} as a whole number (digits only)."
            profile[key] = value
        else:
            profile[key] = text
        if len(profile) == len(_MANUAL_STEPS):
            chat_state["stage"] = 3
            return "✅ Thanks! Now generating recommendations...\n\n" + _recommend_schemes()
        return f"{_MANUAL_STEPS[curr_index + 1]}?"

    if stage == 3:
        # Reached only when an earlier attempt found nothing or failed.
        return _recommend_schemes()

    if stage == 4:
        return fetch_scheme_field_llm(chat_state["scheme_id"], msg)
164
+
165
# Chat UI: every submitted message is routed through chatbot().
_message_box = gr.Textbox(placeholder="Type your message here...")
demo = gr.ChatInterface(
    fn=chatbot,
    title="🤖 MSME Chatbot Assistant",
    textbox=_message_box,
)
demo.launch()