Vipplav committed on
Commit
623ba09
·
verified ·
1 Parent(s): 9f5887d
Files changed (1) hide show
  1. app.py +69 -60
app.py CHANGED
@@ -2,32 +2,28 @@ import gradio as gr
2
  import torch, re
3
  from pymongo import MongoClient
4
  from datetime import datetime
5
- from transformers import (
6
- AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline
7
- )
8
  from sentence_transformers import SentenceTransformer, util
9
  from langchain_core.prompts import PromptTemplate
10
  from langchain_community.llms import HuggingFacePipeline
11
  from IndicTransToolkit.processor import IndicProcessor
 
12
 
13
  # === MongoDB ===
14
- client = MongoClient("mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/")
 
15
  db = client["msme_schemes_db"]
16
  udyam_coll = db["udyam_profiles"]
17
  schemes_chunk_coll = db["schemes_chunks_only"]
18
  schemes_info_coll = db["schemes_embedded"]
19
  query_logs_coll = db["query_logs"]
20
 
21
- # === LLM Setup ===
22
- MODEL_ID = "Vipplav/gemma-finetuned-faq"
23
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
24
- model = AutoModelForCausalLM.from_pretrained(
25
- MODEL_ID, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
26
- )
27
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
28
  llm = HuggingFacePipeline(pipeline=generator)
29
-
30
- # === Embedding Model ===
31
  embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
32
 
33
  # === IndicTrans2 ===
@@ -40,9 +36,17 @@ def initialize_translator(ckpt_dir):
40
  model.eval()
41
  return tokenizer, model
42
 
 
 
 
 
 
 
 
 
43
  translator_tokenizer, translator_model = initialize_translator("ai4bharat/indictrans2-en-indic-1B")
44
 
45
- # === Prompt Template ===
46
  rephrase_template = PromptTemplate.from_template("""
47
  You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
48
  Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
@@ -54,16 +58,16 @@ Enterprise Profile:
54
  # === Utilities ===
55
  def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
56
  def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
57
-
58
  def get_profile_by_uid(uid):
59
  uid = normalize_udyam(uid)
60
- return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0}) if is_valid_udyam(uid) else None
 
61
 
62
  def summarize_profile(profile):
63
  return (
64
- f"The user represents '{profile['Enterprise Name']}', a {profile['Enterprise Type']} enterprise from {profile['State']}, in the {profile['Major Activity']} sector. "
65
- f"They are a {profile['Gender']} entrepreneur with an investment of ₹{profile['Investment Cost (In Rs.)']:,}, a turnover of ₹{profile['Net Turnover (In Rs.)']:,}, "
66
- f"and {profile['Employment']} employees running a {profile['Organisation Type'].lower()}."
67
  )
68
 
69
  def generate_search_query(profile):
@@ -77,7 +81,7 @@ def get_top_matching_schemes(query_text, top_k=5):
77
  matches = []
78
  for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
79
  for chunk in doc["rag_chunks"]:
80
- if "embedding" in chunk:
81
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
82
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
83
  matches.append({
@@ -103,21 +107,30 @@ def fetch_scheme_field_llm(scheme_id, field_input):
103
  "documents": "required_documents_list"
104
  }
105
  matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
 
 
106
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
107
- if doc and matched_field and matched_field in doc:
108
  raw_text = "\n".join(doc[matched_field][:5])
109
- prompt = f"""Summarize this section professionally for MSME users:\n\nScheme: {doc['scheme_name']}\nSection: {matched_field.replace('_list','').title()}\n\n{raw_text}"""
 
 
 
 
 
 
 
110
  return llm.invoke(prompt).strip()
111
- return "⚠️ Please ask about: eligibility, benefits, apply process, or required documents."
112
 
113
  # === Chat State ===
114
  chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}
115
 
116
- # === Chatbot Logic ===
117
  def chatbot(msg, history):
118
  if chat_state["stage"] == 0:
119
  chat_state["stage"] = 1
120
- return "👋 Welcome! Enter your Udyam ID (e.g., `UDYAM-TS-12-1234567`) or type `manual` to fill in your profile."
 
121
 
122
  if chat_state["stage"] == 1:
123
  if msg.lower().startswith("udyam-"):
@@ -127,46 +140,46 @@ def chatbot(msg, history):
127
  chat_state["stage"] = 3
128
  summary = summarize_profile(profile)
129
  chat_state["summary"] = summary
130
- return f"✅ Found your profile!\n\n🔍 {summary}\n\nType `show related schemes` to get recommendations."
131
- return "❌ Invalid or unregistered Udyam ID. Try again or say `manual` to fill manually."
 
 
132
  elif "manual" in msg.lower():
133
  chat_state["stage"] = 2
134
- chat_state["profile"] = {}
135
- return "📝 Let's build your profile.\n\nStep 1: What's your enterprise name? (e.g., `Sri Laxmi Industries`)"
136
- return "🔄 Please enter a valid Udyam ID or say `manual`."
 
137
 
138
  if chat_state["stage"] == 2:
139
  steps = [
140
- ("Enterprise Name", "e.g., `Sri Laxmi Textiles`"),
141
- ("Gender", "e.g., `Female`"),
142
- ("Enterprise Type", "e.g., `Micro`"),
143
- ("Organisation Type", "e.g., `Sole Proprietorship`"),
144
- ("Major Activity", "e.g., `Manufacturing`"),
145
- ("State", "e.g., `Telangana`"),
146
- ("Investment Cost (In Rs.)", "e.g., `5000000`"),
147
- ("Net Turnover (In Rs.)", "e.g., `12000000`"),
148
- ("Employment", "e.g., `23`")
149
  ]
150
- idx = len(chat_state["profile"])
151
- key, example = steps[idx]
152
- value = int(msg) if "Cost" in key or "Turnover" in key or "Employment" in key else msg
153
- chat_state["profile"][key] = value
154
- if idx + 1 == len(steps):
155
  chat_state["stage"] = 3
156
  summary = summarize_profile(chat_state["profile"])
157
  chat_state["summary"] = summary
158
- return f"✅ Thanks! Here's your profile:\n\n🔍 {summary}\n\nType `show related schemes` to get recommendations."
159
- next_key, next_example = steps[idx + 1]
160
- return f"Step {idx + 2}: {next_key}? ({next_example})"
 
 
161
 
162
  if chat_state["stage"] == 3:
163
  if "show" in msg.lower() and "scheme" in msg.lower():
164
  query, summary = generate_search_query(chat_state["profile"])
165
  top_schemes = get_top_matching_schemes(query)
166
  if not top_schemes:
167
- return "⚠️ No schemes found. Please refine your profile."
 
168
  chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
169
  chat_state["stage"] = 4
 
 
170
  query_logs_coll.insert_one({
171
  "timestamp": datetime.utcnow(),
172
  "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
@@ -175,29 +188,25 @@ def chatbot(msg, history):
175
  "top_schemes": top_schemes,
176
  "selected_scheme": top_schemes[0]["scheme_name"]
177
  })
178
- schemes_text = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'], 4)})" for i, s in enumerate(top_schemes)])
179
- return f"📈 Top Matches:\n{schemes_text}\n\nAsk: `eligibility`, `how to apply`, `documents needed`, etc."
180
- return "📢 Type `show related schemes` to get scheme suggestions."
181
 
182
  if chat_state["stage"] == 4:
183
  response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
184
- return f"{response}\n\n💬 Try asking: `What are the eligibility criteria?`, `What documents are required?`, `How to apply?`"
 
185
 
186
  def translate_last_response():
187
  if chat_state["last_bot_msg"]:
188
  return "📄 Telugu Translation:\n\n" + translate_to_telugu(chat_state["last_bot_msg"], translator_tokenizer, translator_model)
189
  return "⚠️ No message to translate."
190
 
191
- # === Gradio UI ===
192
  with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
193
- chatbot_ui = gr.ChatInterface(
194
- fn=chatbot,
195
- title="🤖 MSME Scheme Assistant",
196
- textbox=gr.Textbox(placeholder="Type your message here...")
197
- )
198
- with gr.Row():
199
- translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
200
- translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
201
  translate_btn.click(fn=translate_last_response, outputs=translation_output)
202
 
203
  demo.launch()
 
2
  import torch, re
3
  from pymongo import MongoClient
4
  from datetime import datetime
5
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
6
  from sentence_transformers import SentenceTransformer, util
7
  from langchain_core.prompts import PromptTemplate
8
  from langchain_community.llms import HuggingFacePipeline
9
  from IndicTransToolkit.processor import IndicProcessor
10
+ from transformers import BitsAndBytesConfig
11
 
12
  # === MongoDB ===
13
+ mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
14
+ client = MongoClient(mongo_uri)
15
  db = client["msme_schemes_db"]
16
  udyam_coll = db["udyam_profiles"]
17
  schemes_chunk_coll = db["schemes_chunks_only"]
18
  schemes_info_coll = db["schemes_embedded"]
19
  query_logs_coll = db["query_logs"]
20
 
21
+ # === LLM ===
22
+ model_id = "Vipplav/gemma-finetuned-faq"
23
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
24
+ model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)
 
 
25
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
26
  llm = HuggingFacePipeline(pipeline=generator)
 
 
27
  embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
28
 
29
  # === IndicTrans2 ===
 
36
  model.eval()
37
  return tokenizer, model
38
 
39
+ def translate_to_telugu(text, tokenizer, model):
40
+ batch = ip.preprocess_batch([text], src_lang="eng_Latn", tgt_lang="tel_Telu")
41
+ inputs = tokenizer(batch, return_tensors="pt", padding=True).to(DEVICE)
42
+ with torch.no_grad():
43
+ outputs = model.generate(**inputs, max_length=256, num_beams=5)
44
+ result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
45
+ return ip.postprocess_batch(result, lang="tel_Telu")[0]
46
+
47
  translator_tokenizer, translator_model = initialize_translator("ai4bharat/indictrans2-en-indic-1B")
48
 
49
+ # === Prompt ===
50
  rephrase_template = PromptTemplate.from_template("""
51
  You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
52
  Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
 
58
  # === Utilities ===
59
  def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
60
  def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
 
61
  def get_profile_by_uid(uid):
62
  uid = normalize_udyam(uid)
63
+ if not is_valid_udyam(uid): return None
64
+ return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
65
 
66
  def summarize_profile(profile):
67
  return (
68
+ f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
69
+ f"They identify as {profile['Gender']}, run a {profile['Enterprise Type']} sized {profile['Organisation Type'].lower()} organization. The enterprise has "
70
+ f"{profile['Employment']} employees, with an investment of ₹{profile['Investment Cost (In Rs.)']:,} and a turnover of ₹{profile['Net Turnover (In Rs.)']:,}."
71
  )
72
 
73
  def generate_search_query(profile):
 
81
  matches = []
82
  for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
83
  for chunk in doc["rag_chunks"]:
84
+ if "embedding" in chunk and chunk["embedding"]:
85
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
86
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
87
  matches.append({
 
107
  "documents": "required_documents_list"
108
  }
109
  matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
110
+ if not matched_field:
111
+ return "❌ Try asking about eligibility, benefits, how to apply, or documents."
112
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
113
+ if doc and matched_field in doc:
114
  raw_text = "\n".join(doc[matched_field][:5])
115
+ prompt = f"""
116
+ Summarize the following information into a clear and professional explanation for business owners:
117
+
118
+ Scheme: {doc['scheme_name']}
119
+ Section: {matched_field.replace('_list','').title()}
120
+
121
+ {raw_text}
122
+ """
123
  return llm.invoke(prompt).strip()
124
+ return "⚠️ Couldn’t find that information for the selected scheme."
125
 
126
  # === Chat State ===
127
  chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}
128
 
 
129
  def chatbot(msg, history):
130
  if chat_state["stage"] == 0:
131
  chat_state["stage"] = 1
132
+ chat_state["last_bot_msg"] = "👋 Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
133
+ return chat_state["last_bot_msg"]
134
 
135
  if chat_state["stage"] == 1:
136
  if msg.lower().startswith("udyam-"):
 
140
  chat_state["stage"] = 3
141
  summary = summarize_profile(profile)
142
  chat_state["summary"] = summary
143
+ chat_state["last_bot_msg"] = f"✅ Profile found! Generating recommendations...\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
144
+ return chat_state["last_bot_msg"]
145
+ chat_state["last_bot_msg"] = "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
146
+ return chat_state["last_bot_msg"]
147
  elif "manual" in msg.lower():
148
  chat_state["stage"] = 2
149
+ chat_state["last_bot_msg"] = "📝 Great! What's your enterprise name?"
150
+ return chat_state["last_bot_msg"]
151
+ chat_state["last_bot_msg"] = "Please enter a valid Udyam ID or type 'manual'."
152
+ return chat_state["last_bot_msg"]
153
 
154
  if chat_state["stage"] == 2:
155
  steps = [
156
+ "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
157
+ "Major Activity", "State", "Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"
 
 
 
 
 
 
 
158
  ]
159
+ curr_index = len(chat_state["profile"])
160
+ key = steps[curr_index]
161
+ chat_state["profile"][key] = int(msg) if "Cost" in key or "Turnover" in key or "Employment" in key else msg
162
+ if len(chat_state["profile"]) == len(steps):
 
163
  chat_state["stage"] = 3
164
  summary = summarize_profile(chat_state["profile"])
165
  chat_state["summary"] = summary
166
+ chat_state["last_bot_msg"] = f"✅ Thanks! Profile completed.\n\n🔍 Based on your profile: {summary}\n\nType 'show related schemes' to view top matches."
167
+ return chat_state["last_bot_msg"]
168
+ prompt = f"{steps[curr_index + 1]}?"
169
+ chat_state["last_bot_msg"] = prompt
170
+ return prompt
171
 
172
  if chat_state["stage"] == 3:
173
  if "show" in msg.lower() and "scheme" in msg.lower():
174
  query, summary = generate_search_query(chat_state["profile"])
175
  top_schemes = get_top_matching_schemes(query)
176
  if not top_schemes:
177
+ chat_state["last_bot_msg"] = "⚠️ No matching schemes found."
178
+ return chat_state["last_bot_msg"]
179
  chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
180
  chat_state["stage"] = 4
181
+ schemes_text = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})" for i, s in enumerate(top_schemes)])
182
+ chat_state["last_bot_msg"] = f"📈 Recommended Schemes:\n{schemes_text}\n\nYou can now ask about eligibility, apply, documents, etc."
183
  query_logs_coll.insert_one({
184
  "timestamp": datetime.utcnow(),
185
  "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
 
188
  "top_schemes": top_schemes,
189
  "selected_scheme": top_schemes[0]["scheme_name"]
190
  })
191
+ return chat_state["last_bot_msg"]
192
+ chat_state["last_bot_msg"] = "Type 'show related schemes' to proceed."
193
+ return chat_state["last_bot_msg"]
194
 
195
  if chat_state["stage"] == 4:
196
  response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
197
+ chat_state["last_bot_msg"] = response
198
+ return response
199
 
200
  def translate_last_response():
201
  if chat_state["last_bot_msg"]:
202
  return "📄 Telugu Translation:\n\n" + translate_to_telugu(chat_state["last_bot_msg"], translator_tokenizer, translator_model)
203
  return "⚠️ No message to translate."
204
 
205
+ # === UI ===
206
  with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
207
+ chatbot_ui = gr.ChatInterface(fn=chatbot, title="🤖 MSME Scheme Assistant", textbox=gr.Textbox(placeholder="Type your message here..."))
208
+ translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
209
+ translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
 
 
 
 
 
210
  translate_btn.click(fn=translate_last_response, outputs=translation_output)
211
 
212
  demo.launch()