Vipplav committed on
Commit
35714a8
·
verified ·
1 Parent(s): 2622a38
Files changed (1) hide show
  1. app.py +103 -66
app.py CHANGED
@@ -1,32 +1,48 @@
1
  import gradio as gr
 
2
  from pymongo import MongoClient
3
- from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
 
 
4
  from sentence_transformers import SentenceTransformer, util
5
  from langchain_core.prompts import PromptTemplate
6
  from langchain_community.llms import HuggingFacePipeline
7
- from datetime import datetime
8
- import torch, re
9
 
10
- # === Setup ===
11
- mongo_uri = "mongodb+srv://vipplavai:pravip2025@cluster0.zcsijsa.mongodb.net/"
12
- client = MongoClient(mongo_uri)
13
  db = client["msme_schemes_db"]
14
  udyam_coll = db["udyam_profiles"]
15
  schemes_chunk_coll = db["schemes_chunks_only"]
16
  schemes_info_coll = db["schemes_embedded"]
17
  query_logs_coll = db["query_logs"]
18
 
19
- model_id = "google/gemma-2b-it"
20
- tokenizer = AutoTokenizer.from_pretrained(model_id)
 
21
  model = AutoModelForCausalLM.from_pretrained(
22
- model_id,
23
- device_map="auto",
24
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
25
  )
26
  generator = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=128, do_sample=False)
27
  llm = HuggingFacePipeline(pipeline=generator)
 
 
28
  embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device="cuda" if torch.cuda.is_available() else "cpu")
29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
  rephrase_template = PromptTemplate.from_template("""
31
  You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
32
  Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
@@ -35,20 +51,19 @@ Enterprise Profile:
35
  {profile_summary}
36
  """)
37
 
38
- # === Utils ===
39
  def normalize_udyam(uid): return uid.strip().upper().replace(" ", "")
40
  def is_valid_udyam(uid): return bool(re.match(r"^UDYAM-[A-Z]{2}-\d{2}-\d{6,7}$", uid))
41
 
42
  def get_profile_by_uid(uid):
43
  uid = normalize_udyam(uid)
44
- if not is_valid_udyam(uid): return None
45
- return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})
46
 
47
  def summarize_profile(profile):
48
  return (
49
- f"The user represents an enterprise named '{profile['Enterprise Name']}', based in {profile['State']}, operating in the {profile['Major Activity']} sector. "
50
- f"They identify as {profile['Gender']}, run a {profile['Enterprise Type']} sized {profile['Organisation Type'].lower()} organization. The enterprise has "
51
- f"{profile['Employment']} employees, with an investment of β‚Ή{profile['Investment Cost (In Rs.)']:,} and a turnover of β‚Ή{profile['Net Turnover (In Rs.)']:,}."
52
  )
53
 
54
  def generate_search_query(profile):
@@ -62,7 +77,7 @@ def get_top_matching_schemes(query_text, top_k=5):
62
  matches = []
63
  for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
64
  for chunk in doc["rag_chunks"]:
65
- if "embedding" in chunk and chunk["embedding"]:
66
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
67
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
68
  matches.append({
@@ -88,29 +103,21 @@ def fetch_scheme_field_llm(scheme_id, field_input):
88
  "documents": "required_documents_list"
89
  }
90
  matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
91
- if not matched_field:
92
- return "❌ Try asking about eligibility, benefits, how to apply, or documents."
93
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
94
- if doc and matched_field in doc:
95
  raw_text = "\n".join(doc[matched_field][:5])
96
- prompt = f"""
97
- Summarize the following information into a clear and professional explanation for business owners:
98
-
99
- Scheme: {doc['scheme_name']}
100
- Section: {matched_field.replace('_list','').title()}
101
-
102
- {raw_text}
103
- """
104
  return llm.invoke(prompt).strip()
105
- return "⚠️ Couldn’t find that information for the selected scheme."
106
 
107
- # === Chatbot ===
108
- chat_state = {"stage": 0, "profile": {}, "scheme_id": None}
109
 
 
110
  def chatbot(msg, history):
111
  if chat_state["stage"] == 0:
112
  chat_state["stage"] = 1
113
- return "πŸ‘‹ Hello! Please enter your Udyam ID or say 'manual' to fill in details yourself."
114
 
115
  if chat_state["stage"] == 1:
116
  if msg.lower().startswith("udyam-"):
@@ -118,49 +125,79 @@ def chatbot(msg, history):
118
  if profile:
119
  chat_state["profile"] = profile
120
  chat_state["stage"] = 3
121
- return "βœ… Profile found! Generating recommendations..."
122
- return "❌ Invalid or unregistered Udyam ID. Try again or say 'manual'."
 
 
123
  elif "manual" in msg.lower():
124
  chat_state["stage"] = 2
125
- return "πŸ“ Great! What's your enterprise name?"
126
- return "Please enter a valid Udyam ID or type 'manual'."
 
127
 
128
  if chat_state["stage"] == 2:
129
  steps = [
130
- "Enterprise Name", "Gender", "Enterprise Type", "Organisation Type",
131
- "Major Activity", "State", "Investment Cost (In Rs.)", "Net Turnover (In Rs.)", "Employment"
 
 
 
 
 
 
 
132
  ]
133
- curr_index = len(chat_state["profile"])
134
- key = steps[curr_index]
135
- if "Cost" in key or "Turnover" in key or "Employment" in key:
136
- chat_state["profile"][key] = int(msg)
137
- else:
138
- chat_state["profile"][key] = msg
139
- if len(chat_state["profile"]) == len(steps):
140
  chat_state["stage"] = 3
141
- return "βœ… Thanks! Now generating recommendations..."
142
- return f"{steps[curr_index + 1]}?"
 
 
 
143
 
144
  if chat_state["stage"] == 3:
145
- query, summary = generate_search_query(chat_state["profile"])
146
- top_schemes = get_top_matching_schemes(query)
147
- if not top_schemes:
148
- return "⚠️ No matching schemes found."
149
- chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
150
- query_logs_coll.insert_one({
151
- "timestamp": datetime.utcnow(),
152
- "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
153
- "profile_summary": summary,
154
- "query": query,
155
- "top_schemes": top_schemes,
156
- "selected_scheme": top_schemes[0]["scheme_name"]
157
- })
158
- names = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'],4)})" for i, s in enumerate(top_schemes)])
159
- chat_state["stage"] = 4
160
- return f"πŸ” Based on your profile: {summary}\n\nπŸ“ˆ Recommended Schemes:\n{names}\n\nYou can now ask about this scheme using keywords like 'eligibility', 'apply', or 'documents'."
 
 
161
 
162
  if chat_state["stage"] == 4:
163
- return fetch_scheme_field_llm(chat_state["scheme_id"], msg)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
165
- demo = gr.ChatInterface(fn=chatbot, title="πŸ€– MSME Chatbot Assistant", textbox=gr.Textbox(placeholder="Type your message here..."))
166
  demo.launch()
 
1
  import gradio as gr
2
+ import torch, re
3
  from pymongo import MongoClient
4
+ from datetime import datetime
5
+ from transformers import (
6
+ AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline
7
+ )
8
  from sentence_transformers import SentenceTransformer, util
9
  from langchain_core.prompts import PromptTemplate
10
  from langchain_community.llms import HuggingFacePipeline
11
+ from IndicTransToolkit.processor import IndicProcessor
 
12
 
13
# === MongoDB ===
# The connection string embeds the database user name and password, so it is
# read from the environment (e.g. a Space secret named MONGO_URI) instead of
# being committed to the repository.
# NOTE: any password that was previously committed in this file should be
# treated as leaked and rotated.
import os

_mongo_uri = os.environ.get("MONGO_URI")
if not _mongo_uri:
    raise RuntimeError(
        "MONGO_URI is not set; configure it with the MongoDB connection string."
    )

client = MongoClient(_mongo_uri)
db = client["msme_schemes_db"]

# Collections used by the app.
udyam_coll = db["udyam_profiles"]            # looked up by "Udyam_ID"
schemes_chunk_coll = db["schemes_chunks_only"]
schemes_info_coll = db["schemes_embedded"]
query_logs_coll = db["query_logs"]           # one document per recommendation request
20
 
21
# === LLM Setup ===
MODEL_ID = "Vipplav/gemma-finetuned-faq"

# Half precision is only requested when a CUDA device is available.
_llm_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=_llm_dtype,
)

# Text-generation pipeline: at most 128 new tokens, sampling disabled.
generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=128,
    do_sample=False,
)

# LangChain wrapper around the pipeline; the rest of the file calls `llm`.
llm = HuggingFacePipeline(pipeline=generator)
29
+
30
# === Embedding Model ===
# Sentence encoder placed on the GPU when one is available, otherwise on CPU.
_embed_device = "cuda" if torch.cuda.is_available() else "cpu"
embed_model = SentenceTransformer("BAAI/bge-small-en-v1.5", device=_embed_device)
32
 
33
# === IndicTrans2 ===
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

ip = IndicProcessor(inference=True)


def initialize_translator(ckpt_dir):
    """Load the tokenizer and seq2seq model stored at *ckpt_dir*.

    Both are loaded with ``trust_remote_code=True``; the model is moved to
    ``DEVICE`` and switched to eval mode.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    translator_tok = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    seq2seq = AutoModelForSeq2SeqLM.from_pretrained(ckpt_dir, trust_remote_code=True)
    seq2seq = seq2seq.to(DEVICE)
    seq2seq.eval()
    return translator_tok, seq2seq


translator_tokenizer, translator_model = initialize_translator(
    "ai4bharat/indictrans2-en-indic-1B"
)
44
+
45
+ # === Prompt Template ===
46
  rephrase_template = PromptTemplate.from_template("""
47
  You're a helpful assistant guiding Indian MSMEs to the best-matching government schemes.
48
  Based on the enterprise profile, generate a clear, short one-line search query with keywords like state, sector, size, gender, and investment.
 
51
  {profile_summary}
52
  """)
53
 
54
# === Utilities ===
# ASCII-only digits on purpose: in a str pattern ``\d`` would also accept
# digits from other scripts.
_UDYAM_ID_RE = re.compile(r"UDYAM-[A-Z]{2}-[0-9]{2}-[0-9]{6,7}")


def normalize_udyam(uid):
    """Return *uid* upper-cased with all whitespace removed.

    ``str.split()`` without arguments splits on any whitespace run, so tabs and
    newlines from pasted input are dropped as well as plain spaces.
    """
    return "".join(uid.split()).upper()


def is_valid_udyam(uid):
    """Return True when *uid* has the form ``UDYAM-XX-00-000000[0]``.

    ``fullmatch`` is used so that a trailing newline (which ``$`` tolerates)
    does not pass validation.
    """
    return _UDYAM_ID_RE.fullmatch(uid) is not None


def get_profile_by_uid(uid):
    """Look up the profile document for a Udyam ID.

    The ID is normalized first. Returns ``None`` when the normalized ID is
    malformed; otherwise returns the result of ``find_one`` on ``udyam_coll``
    (with ``_id`` excluded), which is ``None`` when no document matches.
    """
    uid = normalize_udyam(uid)
    if not is_valid_udyam(uid):
        return None
    return udyam_coll.find_one({"Udyam_ID": uid}, {"_id": 0})


def summarize_profile(profile):
    """Build the English profile summary sentence pair from *profile*.

    Reads the keys 'Enterprise Name', 'Enterprise Type', 'State',
    'Major Activity', 'Gender', 'Investment Cost (In Rs.)',
    'Net Turnover (In Rs.)', 'Employment' and 'Organisation Type'.
    A missing key raises ``KeyError``. The two money fields are formatted with
    a thousands separator, so they must be numeric.
    """
    investment = profile['Investment Cost (In Rs.)']
    turnover = profile['Net Turnover (In Rs.)']
    organisation = profile['Organisation Type'].lower()
    return (
        f"The user represents '{profile['Enterprise Name']}', a {profile['Enterprise Type']} enterprise from {profile['State']}, in the {profile['Major Activity']} sector. "
        f"They are a {profile['Gender']} entrepreneur with an investment of ₹{investment:,}, a turnover of ₹{turnover:,}, "
        f"and {profile['Employment']} employees running a {organisation}."
    )
68
 
69
  def generate_search_query(profile):
 
77
  matches = []
78
  for doc in schemes_chunk_coll.find({"rag_chunks": {"$exists": True}}):
79
  for chunk in doc["rag_chunks"]:
80
+ if "embedding" in chunk:
81
  chunk_tensor = torch.tensor(chunk["embedding"]).to(query_embedding.device)
82
  score = util.cos_sim(query_embedding, chunk_tensor)[0][0].item()
83
  matches.append({
 
103
  "documents": "required_documents_list"
104
  }
105
  matched_field = next((v for k, v in field_map.items() if k in field_input.lower()), None)
 
 
106
  doc = schemes_info_coll.find_one({"scheme_id": scheme_id})
107
+ if doc and matched_field and matched_field in doc:
108
  raw_text = "\n".join(doc[matched_field][:5])
109
+ prompt = f"""Summarize this section professionally for MSME users:\n\nScheme: {doc['scheme_name']}\nSection: {matched_field.replace('_list','').title()}\n\n{raw_text}"""
 
 
 
 
 
 
 
110
  return llm.invoke(prompt).strip()
111
+ return "⚠️ Please ask about: eligibility, benefits, apply process, or required documents."
112
 
113
+ # === Chat State ===
114
+ chat_state = {"stage": 0, "profile": {}, "scheme_id": None, "last_bot_msg": "", "summary": ""}
115
 
116
+ # === Chatbot Logic ===
117
  def chatbot(msg, history):
118
  if chat_state["stage"] == 0:
119
  chat_state["stage"] = 1
120
+ return "πŸ‘‹ Welcome! Enter your Udyam ID (e.g., `UDYAM-TS-12-1234567`) or type `manual` to fill in your profile."
121
 
122
  if chat_state["stage"] == 1:
123
  if msg.lower().startswith("udyam-"):
 
125
  if profile:
126
  chat_state["profile"] = profile
127
  chat_state["stage"] = 3
128
+ summary = summarize_profile(profile)
129
+ chat_state["summary"] = summary
130
+ return f"βœ… Found your profile!\n\nπŸ” {summary}\n\nType `show related schemes` to get recommendations."
131
+ return "❌ Invalid or unregistered Udyam ID. Try again or say `manual` to fill manually."
132
  elif "manual" in msg.lower():
133
  chat_state["stage"] = 2
134
+ chat_state["profile"] = {}
135
+ return "πŸ“ Let's build your profile.\n\nStep 1: What's your enterprise name? (e.g., `Sri Laxmi Industries`)"
136
+ return "πŸ”„ Please enter a valid Udyam ID or say `manual`."
137
 
138
  if chat_state["stage"] == 2:
139
  steps = [
140
+ ("Enterprise Name", "e.g., `Sri Laxmi Textiles`"),
141
+ ("Gender", "e.g., `Female`"),
142
+ ("Enterprise Type", "e.g., `Micro`"),
143
+ ("Organisation Type", "e.g., `Sole Proprietorship`"),
144
+ ("Major Activity", "e.g., `Manufacturing`"),
145
+ ("State", "e.g., `Telangana`"),
146
+ ("Investment Cost (In Rs.)", "e.g., `5000000`"),
147
+ ("Net Turnover (In Rs.)", "e.g., `12000000`"),
148
+ ("Employment", "e.g., `23`")
149
  ]
150
+ idx = len(chat_state["profile"])
151
+ key, example = steps[idx]
152
+ value = int(msg) if "Cost" in key or "Turnover" in key or "Employment" in key else msg
153
+ chat_state["profile"][key] = value
154
+ if idx + 1 == len(steps):
 
 
155
  chat_state["stage"] = 3
156
+ summary = summarize_profile(chat_state["profile"])
157
+ chat_state["summary"] = summary
158
+ return f"βœ… Thanks! Here's your profile:\n\nπŸ” {summary}\n\nType `show related schemes` to get recommendations."
159
+ next_key, next_example = steps[idx + 1]
160
+ return f"Step {idx + 2}: {next_key}? ({next_example})"
161
 
162
  if chat_state["stage"] == 3:
163
+ if "show" in msg.lower() and "scheme" in msg.lower():
164
+ query, summary = generate_search_query(chat_state["profile"])
165
+ top_schemes = get_top_matching_schemes(query)
166
+ if not top_schemes:
167
+ return "⚠️ No schemes found. Please refine your profile."
168
+ chat_state["scheme_id"] = top_schemes[0]["scheme_id"]
169
+ chat_state["stage"] = 4
170
+ query_logs_coll.insert_one({
171
+ "timestamp": datetime.utcnow(),
172
+ "udyam_id": chat_state["profile"].get("Udyam_ID", "manual_entry"),
173
+ "profile_summary": summary,
174
+ "query": query,
175
+ "top_schemes": top_schemes,
176
+ "selected_scheme": top_schemes[0]["scheme_name"]
177
+ })
178
+ schemes_text = "\n".join([f"{i+1}. {s['scheme_name']} (Score: {round(s['score'], 4)})" for i, s in enumerate(top_schemes)])
179
+ return f"πŸ“ˆ Top Matches:\n{schemes_text}\n\nAsk: `eligibility`, `how to apply`, `documents needed`, etc."
180
+ return "πŸ“’ Type `show related schemes` to get scheme suggestions."
181
 
182
  if chat_state["stage"] == 4:
183
+ response = fetch_scheme_field_llm(chat_state["scheme_id"], msg)
184
+ return f"{response}\n\nπŸ’¬ Try asking: `What are the eligibility criteria?`, `What documents are required?`, `How to apply?`"
185
+
186
def translate_to_telugu(text, tokenizer, model):
    """Translate English *text* to Telugu with an IndicTrans2 en->indic model.

    The text is translated line by line (each non-blank line is one batch
    item) and blank lines are kept, so the layout of a multi-line chat reply
    survives translation.

    Args:
        text: English text to translate.
        tokenizer: tokenizer returned by ``initialize_translator``.
        model: seq2seq model returned by ``initialize_translator``.

    Returns:
        The translated text, or *text* unchanged when it has no non-blank line.
    """
    from contextlib import nullcontext

    lines = text.split("\n")
    sentences = [ln.strip() for ln in lines if ln.strip()]
    if not sentences:
        return text

    # NOTE(review): the calls below follow the usage published on the
    # IndicTrans2 model card; confirm against the installed toolkit version.
    batch = ip.preprocess_batch(sentences, src_lang="eng_Latn", tgt_lang="tel_Telu")
    inputs = tokenizer(
        batch,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        return_attention_mask=True,
    ).to(model.device)

    with torch.no_grad():
        generated = model.generate(
            **inputs,
            use_cache=True,
            min_length=0,
            max_length=256,
            num_beams=5,
            num_return_sequences=1,
        )

    # Older tokenizer releases need target mode for decoding; fall back to a
    # no-op context when the method is not available.
    target_mode = getattr(tokenizer, "as_target_tokenizer", None)
    with (target_mode() if callable(target_mode) else nullcontext()):
        decoded = tokenizer.batch_decode(
            generated.detach().cpu().tolist(),
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True,
        )

    translated = iter(ip.postprocess_batch(decoded, lang="tel_Telu"))
    return "\n".join(next(translated) if ln.strip() else ln for ln in lines)


def translate_last_response():
    """Return the Telugu translation of ``chat_state["last_bot_msg"]``.

    Returns a warning string when that entry is empty.
    NOTE(review): no visible code assigns ``chat_state["last_bot_msg"]``;
    confirm the chatbot stores its replies there, otherwise this always
    returns the warning.
    """
    last_reply = chat_state["last_bot_msg"]
    if not last_reply:
        return "⚠️ No message to translate."
    return "📄 Telugu Translation:\n\n" + translate_to_telugu(
        last_reply, translator_tokenizer, translator_model
    )
190
+
191
# === Gradio UI ===
# NOTE(review): the nesting below is reconstructed; the original indentation is
# not recoverable from this view. Confirm whether the translation textbox
# belongs inside the row or below it.
with gr.Blocks(title="MSME Chatbot with Telugu Support") as demo:
    chatbot_ui = gr.ChatInterface(
        fn=chatbot,
        title="🤖 MSME Scheme Assistant",
        textbox=gr.Textbox(placeholder="Type your message here..."),
    )
    with gr.Row():
        translate_btn = gr.Button("🌐 Translate Last Response to Telugu")
        translation_output = gr.Textbox(label="🗣️ Telugu Translation", lines=5)
    # The handler takes no inputs; its return value fills the textbox.
    translate_btn.click(fn=translate_last_response, outputs=translation_output)

demo.launch()