VJnCode committed on
Commit
2e108ec
·
1 Parent(s): aaa9a08

FEAT : added recommendation route

Browse files
.gitignore CHANGED
@@ -1,4 +1,4 @@
1
- chathur/
2
  **/__pycache__/
3
  .env
4
  api/rag/translator-en-kn-merged/
 
1
+ chatur/
2
  **/__pycache__/
3
  .env
4
  api/rag/translator-en-kn-merged/
api/main.py CHANGED
@@ -6,9 +6,9 @@ import logging
6
  from api.routes import endpoints
7
  from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
8
 
9
- # Central scheme imports
 
10
  from api.routes import central_endpoints
11
- # MODIFIED IMPORT: Added _central_schemes_cache to get more stats
12
  from api.services.central_services import (
13
  load_central_schemes_into_cache,
14
  get_central_cache_loading_status,
@@ -16,9 +16,9 @@ from api.services.central_services import (
16
  _central_schemes_cache
17
  )
18
 
19
- # Other imports
20
  from api.core.firebase_utils import db, initialize_firebase
21
- from api.routes import rag_route
22
  from fastapi.middleware.cors import CORSMiddleware
23
 
24
  # Configure logging
@@ -55,7 +55,7 @@ app.include_router(
55
  prefix="/{lang}/central",
56
  tags=["Central Schemes"]
57
  )
58
- app.include_router(rag_route.router, prefix="/api", tags=["RAG Chatbot"])
59
 
60
  @app.get("/")
61
  def root():
@@ -63,10 +63,6 @@ def root():
63
  return {"message": "Welcome to Chathur API"}
64
 
65
  # --- Cache Status and Refresh Endpoints ---
66
-
67
- # REMOVED: Combined /cache_status endpoint
68
-
69
- # NEW: Separate endpoint for state scheme cache status
70
  @app.get("/state_cache_status")
71
  def get_state_cache_status():
72
  """Returns the current status of the state scheme cache."""
 
6
  from api.routes import endpoints
7
  from api.services.scheme_service import load_all_schemes_into_cache, is_cache_loading, cached_all_schemes
8
 
9
+ from api.routes import recommend_route
10
+
11
  from api.routes import central_endpoints
 
12
  from api.services.central_services import (
13
  load_central_schemes_into_cache,
14
  get_central_cache_loading_status,
 
16
  _central_schemes_cache
17
  )
18
 
19
+
20
  from api.core.firebase_utils import db, initialize_firebase
21
+
22
  from fastapi.middleware.cors import CORSMiddleware
23
 
24
  # Configure logging
 
55
  prefix="/{lang}/central",
56
  tags=["Central Schemes"]
57
  )
58
+ app.include_router(recommend_route.router)
59
 
60
  @app.get("/")
61
  def root():
 
63
  return {"message": "Welcome to Chathur API"}
64
 
65
  # --- Cache Status and Refresh Endpoints ---
 
 
 
 
66
  @app.get("/state_cache_status")
67
  def get_state_cache_status():
68
  """Returns the current status of the state scheme cache."""
api/rag/figures/eval_bleu.pdf DELETED
Binary file (12.5 kB)
 
api/rag/figures/eval_bleu.png DELETED
Binary file (47.4 kB)
 
api/rag/figures/training_logs.json DELETED
@@ -1,306 +0,0 @@
1
- [
2
- {
3
- "loss": 8.1255,
4
- "grad_norm": 5.886991024017334,
5
- "learning_rate": 4.755e-05,
6
- "epoch": 0.1,
7
- "step": 50
8
- },
9
- {
10
- "loss": 2.1223,
11
- "grad_norm": 1.780342936515808,
12
- "learning_rate": 4.5050000000000004e-05,
13
- "epoch": 0.2,
14
- "step": 100
15
- },
16
- {
17
- "loss": 1.4172,
18
- "grad_norm": 1.2484183311462402,
19
- "learning_rate": 4.2550000000000004e-05,
20
- "epoch": 0.3,
21
- "step": 150
22
- },
23
- {
24
- "loss": 1.0609,
25
- "grad_norm": 1.4256188869476318,
26
- "learning_rate": 4.0050000000000004e-05,
27
- "epoch": 0.4,
28
- "step": 200
29
- },
30
- {
31
- "eval_loss": 0.8911033868789673,
32
- "eval_score": 0.06293457344434858,
33
- "eval_counts": [
34
- 163,
35
- 1,
36
- 0,
37
- 0
38
- ],
39
- "eval_totals": [
40
- 3683,
41
- 3483,
42
- 3283,
43
- 3084
44
- ],
45
- "eval_precisions": [
46
- 4.425739885962531,
47
- 0.02871088142405972,
48
- 0.015229972586049346,
49
- 0.008106355382619975
50
- ],
51
- "eval_bp": 1.0,
52
- "eval_sys_len": 3683,
53
- "eval_ref_len": 1623,
54
- "eval_bleu": 0.06293457344434858,
55
- "eval_runtime": 109.0083,
56
- "eval_samples_per_second": 1.835,
57
- "eval_steps_per_second": 0.459,
58
- "epoch": 0.4,
59
- "step": 200
60
- },
61
- {
62
- "loss": 0.938,
63
- "grad_norm": 0.9899176955223083,
64
- "learning_rate": 3.7550000000000005e-05,
65
- "epoch": 0.5,
66
- "step": 250
67
- },
68
- {
69
- "loss": 0.8151,
70
- "grad_norm": 0.8253363966941833,
71
- "learning_rate": 3.505e-05,
72
- "epoch": 0.6,
73
- "step": 300
74
- },
75
- {
76
- "loss": 0.8122,
77
- "grad_norm": 1.7979626655578613,
78
- "learning_rate": 3.2550000000000005e-05,
79
- "epoch": 0.7,
80
- "step": 350
81
- },
82
- {
83
- "loss": 0.8516,
84
- "grad_norm": 0.5633005499839783,
85
- "learning_rate": 3.0050000000000002e-05,
86
- "epoch": 0.8,
87
- "step": 400
88
- },
89
- {
90
- "eval_loss": 0.7273606657981873,
91
- "eval_score": 0.45057594789546845,
92
- "eval_counts": [
93
- 208,
94
- 5,
95
- 2,
96
- 0
97
- ],
98
- "eval_totals": [
99
- 1368,
100
- 1168,
101
- 968,
102
- 774
103
- ],
104
- "eval_precisions": [
105
- 15.2046783625731,
106
- 0.4280821917808219,
107
- 0.2066115702479339,
108
- 0.06459948320413436
109
- ],
110
- "eval_bp": 0.8299386398864602,
111
- "eval_sys_len": 1368,
112
- "eval_ref_len": 1623,
113
- "eval_bleu": 0.45057594789546845,
114
- "eval_runtime": 77.3509,
115
- "eval_samples_per_second": 2.586,
116
- "eval_steps_per_second": 0.646,
117
- "epoch": 0.8,
118
- "step": 400
119
- },
120
- {
121
- "loss": 0.9177,
122
- "grad_norm": 0.6352578997612,
123
- "learning_rate": 2.7550000000000002e-05,
124
- "epoch": 0.9,
125
- "step": 450
126
- },
127
- {
128
- "loss": 0.7974,
129
- "grad_norm": 0.8983929753303528,
130
- "learning_rate": 2.5050000000000002e-05,
131
- "epoch": 1.0,
132
- "step": 500
133
- },
134
- {
135
- "loss": 0.7734,
136
- "grad_norm": 0.6885063648223877,
137
- "learning_rate": 2.2550000000000003e-05,
138
- "epoch": 1.1,
139
- "step": 550
140
- },
141
- {
142
- "loss": 0.8068,
143
- "grad_norm": 0.9066347479820251,
144
- "learning_rate": 2.0050000000000003e-05,
145
- "epoch": 1.2,
146
- "step": 600
147
- },
148
- {
149
- "eval_loss": 0.6409754157066345,
150
- "eval_score": 2.2308463972371086,
151
- "eval_counts": [
152
- 281,
153
- 33,
154
- 11,
155
- 6
156
- ],
157
- "eval_totals": [
158
- 1269,
159
- 1069,
160
- 870,
161
- 686
162
- ],
163
- "eval_precisions": [
164
- 22.14342001576044,
165
- 3.086997193638915,
166
- 1.264367816091954,
167
- 0.8746355685131195
168
- ],
169
- "eval_bp": 0.7565703085029857,
170
- "eval_sys_len": 1269,
171
- "eval_ref_len": 1623,
172
- "eval_bleu": 2.2308463972371086,
173
- "eval_runtime": 53.7294,
174
- "eval_samples_per_second": 3.722,
175
- "eval_steps_per_second": 0.931,
176
- "epoch": 1.2,
177
- "step": 600
178
- },
179
- {
180
- "loss": 0.6715,
181
- "grad_norm": 0.945395290851593,
182
- "learning_rate": 1.755e-05,
183
- "epoch": 1.3,
184
- "step": 650
185
- },
186
- {
187
- "loss": 0.7764,
188
- "grad_norm": 2.0758280754089355,
189
- "learning_rate": 1.505e-05,
190
- "epoch": 1.4,
191
- "step": 700
192
- },
193
- {
194
- "loss": 0.6834,
195
- "grad_norm": 0.43225401639938354,
196
- "learning_rate": 1.255e-05,
197
- "epoch": 1.5,
198
- "step": 750
199
- },
200
- {
201
- "loss": 0.7715,
202
- "grad_norm": 0.982354998588562,
203
- "learning_rate": 1.005e-05,
204
- "epoch": 1.6,
205
- "step": 800
206
- },
207
- {
208
- "eval_loss": 0.6118303537368774,
209
- "eval_score": 2.2446563832557205,
210
- "eval_counts": [
211
- 312,
212
- 37,
213
- 11,
214
- 5
215
- ],
216
- "eval_totals": [
217
- 1298,
218
- 1098,
219
- 899,
220
- 717
221
- ],
222
- "eval_precisions": [
223
- 24.03697996918336,
224
- 3.3697632058287796,
225
- 1.2235817575083425,
226
- 0.697350069735007
227
- ],
228
- "eval_bp": 0.7785008405436009,
229
- "eval_sys_len": 1298,
230
- "eval_ref_len": 1623,
231
- "eval_bleu": 2.2446563832557205,
232
- "eval_runtime": 50.8519,
233
- "eval_samples_per_second": 3.933,
234
- "eval_steps_per_second": 0.983,
235
- "epoch": 1.6,
236
- "step": 800
237
- },
238
- {
239
- "loss": 0.7415,
240
- "grad_norm": 0.5001242160797119,
241
- "learning_rate": 7.55e-06,
242
- "epoch": 1.7,
243
- "step": 850
244
- },
245
- {
246
- "loss": 0.6018,
247
- "grad_norm": 0.6771586537361145,
248
- "learning_rate": 5.050000000000001e-06,
249
- "epoch": 1.8,
250
- "step": 900
251
- },
252
- {
253
- "loss": 0.6488,
254
- "grad_norm": 0.7276270389556885,
255
- "learning_rate": 2.55e-06,
256
- "epoch": 1.9,
257
- "step": 950
258
- },
259
- {
260
- "loss": 0.6508,
261
- "grad_norm": 0.5777331590652466,
262
- "learning_rate": 5.0000000000000004e-08,
263
- "epoch": 2.0,
264
- "step": 1000
265
- },
266
- {
267
- "eval_loss": 0.6058484315872192,
268
- "eval_score": 2.256370766803717,
269
- "eval_counts": [
270
- 319,
271
- 37,
272
- 11,
273
- 5
274
- ],
275
- "eval_totals": [
276
- 1310,
277
- 1110,
278
- 911,
279
- 727
280
- ],
281
- "eval_precisions": [
282
- 24.35114503816794,
283
- 3.3333333333333335,
284
- 1.2074643249176729,
285
- 0.687757909215956
286
- ],
287
- "eval_bp": 0.7874689814366906,
288
- "eval_sys_len": 1310,
289
- "eval_ref_len": 1623,
290
- "eval_bleu": 2.256370766803717,
291
- "eval_runtime": 50.885,
292
- "eval_samples_per_second": 3.93,
293
- "eval_steps_per_second": 0.983,
294
- "epoch": 2.0,
295
- "step": 1000
296
- },
297
- {
298
- "train_runtime": 493.5783,
299
- "train_samples_per_second": 8.104,
300
- "train_steps_per_second": 2.026,
301
- "total_flos": 136952414208000.0,
302
- "train_loss": 1.2491823387145997,
303
- "epoch": 2.0,
304
- "step": 1000
305
- }
306
- ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
api/rag/figures/training_loss.pdf DELETED
Binary file (11.4 kB)
 
api/rag/figures/training_loss.png DELETED
Binary file (43.2 kB)
 
api/rag/rag.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
api/rag/translated_schemes_kn.json DELETED
The diff for this file is too large to render. See raw diff
 
api/rag/translator.ipynb DELETED
The diff for this file is too large to render. See raw diff
 
api/routes/rag_route.py DELETED
@@ -1,16 +0,0 @@
1
- from fastapi import APIRouter, HTTPException
2
- from pydantic import BaseModel
3
- from api.services.rag_service import get_answer_from_vectorstore
4
-
5
- router = APIRouter()
6
-
7
- class QueryInput(BaseModel):
8
- question: str
9
-
10
- @router.post("/rag/query")
11
- async def rag_query(query: QueryInput):
12
- try:
13
- result = get_answer_from_vectorstore(query.question)
14
- return result
15
- except Exception as e:
16
- raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
api/routes/recommend_route.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import logging

from fastapi import APIRouter, HTTPException, Path, status
from pydantic import BaseModel
from typing import List

# Import the recommendation service
from api.services.recommend_service import get_recommendations

logger = logging.getLogger(__name__)

router = APIRouter()

# --- Pydantic Request Model ---

class RecommendationRequest(BaseModel):
    """
    Payload for the recommendation endpoint.
    Expects a list of tags.
    """
    tags: List[str]

# --- API Endpoint ---

@router.post(
    "/{lang}/recommend",
    tags=["Recommendations"],
    summary="Get Hybrid Scheme Recommendations"
)
async def recommend_schemes(
    request: RecommendationRequest,
    lang: str = Path(..., title="Language Code", description="ISO 639-1 language code (e.g., 'en', 'hi')")
):
    """
    Get a list of recommended schemes from both State and Central governments
    based on a list of input tags.

    This endpoint uses a hybrid model that considers:
    1. **Tag Matching:** How well the user's tags match the scheme's tags.
    2. **Popularity:** The general popularity score of the scheme.

    Raises:
        HTTPException 400: if the 'tags' list is empty.
        HTTPException 500: on unexpected failures in the service layer.
    """
    if not request.tags:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail="The 'tags' list cannot be empty."
        )

    try:
        # Call the service layer to get recommendations
        return get_recommendations(user_tags=request.tags, lang=lang)
    except Exception as e:
        # BUG FIX: 'logger' was used without ever being defined in this module,
        # so any service failure raised a NameError instead of returning a 500.
        # A module-level logger is now configured above.
        logger.error("Recommendation endpoint failed: %s", e)
        # FIX: the detail string had a pointless f-prefix (no placeholders);
        # chain the original exception for easier debugging.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail="An error occurred while generating recommendations."
        ) from e
api/services/rag_service.py DELETED
@@ -1,93 +0,0 @@
1
- import os
2
- from dotenv import load_dotenv
3
- from langchain_pinecone import PineconeVectorStore
4
- from langchain_huggingface import HuggingFaceEmbeddings
5
- from langchain_groq import ChatGroq
6
- from langchain_core.messages import HumanMessage
7
- from pinecone import Pinecone
8
-
9
- # --- Load environment variables ---
10
- load_dotenv()
11
-
12
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
13
- GROQ_API_KEY = os.getenv("GROQ_API_KEY")
14
-
15
- if not PINECONE_API_KEY or not GROQ_API_KEY:
16
- raise ValueError("❌ Missing PINECONE_API_KEY or GROQ_API_KEY")
17
-
18
- # --- Configurations ---
19
- PINECONE_INDEX_NAME = "scheme-index"
20
- PINECONE_NAMESPACE = "schemes"
21
- EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
22
- GROQ_MODEL_NAME = "llama-3.1-8b-instant"
23
-
24
- # --- Initialize Services ---
25
- print("🚀 Initializing embeddings and LLM...")
26
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
27
- llm = ChatGroq(model_name=GROQ_MODEL_NAME)
28
-
29
- print("🔗 Connecting to Pinecone...")
30
- try:
31
- pc = Pinecone(api_key=PINECONE_API_KEY)
32
- indexes = pc.list_indexes()
33
- print(f"✅ Pinecone reachable. Indexes: {indexes}")
34
- except Exception as e:
35
- print(f"❌ Pinecone connection failed: {e}")
36
-
37
- # --- Vector Store ---
38
- vectorstore = PineconeVectorStore.from_existing_index(
39
- index_name=PINECONE_INDEX_NAME,
40
- embedding=embeddings,
41
- namespace=PINECONE_NAMESPACE
42
- )
43
-
44
- # --- Main RAG Function ---
45
- def get_answer_from_vectorstore(question: str) -> dict:
46
- print(f"🧠 Query received: {question}")
47
- try:
48
- docs_with_scores = vectorstore.similarity_search_with_score(question, k=5)
49
- print(f"📄 Retrieved {len(docs_with_scores)} docs")
50
-
51
- for doc, score in docs_with_scores:
52
- print(f"→ Score: {score:.4f} | Snippet: {doc.page_content[:80]}")
53
-
54
- threshold = 0.75
55
- filtered_docs = [doc for doc, score in docs_with_scores if score < threshold]
56
- print(f"✅ Filtered {len(filtered_docs)} docs below threshold {threshold}")
57
-
58
- if not filtered_docs:
59
- print("⚠️ No matching documents found.")
60
- return {
61
- "answer": "This question seems to be outside my knowledge of government schemes. Please ask about a specific scheme or benefit.",
62
- "sources": []
63
- }
64
-
65
- context = "\n\n".join([doc.page_content for doc in filtered_docs])
66
- prompt = f"""
67
- You are a helpful assistant for rural users regarding Indian government schemes.
68
- Answer the following question using only the context provided below.
69
- If the answer cannot be found in the context, say:
70
- "I'm sorry, I couldn't find information about that in my current knowledge base."
71
-
72
- Context:
73
- {context}
74
-
75
- Question: {question}
76
-
77
- Answer:
78
- """
79
-
80
- answer_message = llm.invoke([HumanMessage(content=prompt)])
81
- answer = answer_message.content.strip()
82
-
83
- return {
84
- "answer": answer,
85
- "sources": [doc.metadata for doc in filtered_docs]
86
- }
87
-
88
- except Exception as e:
89
- print(f"❌ Error in get_answer_from_vectorstore: {e}")
90
- return {
91
- "answer": f"An error occurred while fetching the answer: {str(e)}",
92
- "sources": []
93
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
api/services/recommend_service.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ # MODIFIED IMPORTS: Import the modules themselves, not the variables
3
+ from api.services import scheme_service
4
+ from api.services import central_services
5
+
6
+ logger = logging.getLogger(__name__)
7
+
8
+ # --- NEW: Helper function for dynamic tag generation ---
9
+ def _generate_tags_from_scheme(scheme: dict, user_tags_set: set) -> list[str]:
10
+ """
11
+ Searches a scheme's Title and Description for any of the user's tags.
12
+ Returns a list of tags that were found.
13
+ """
14
+ # Combine Title and Description into a single searchable text
15
+ search_text = (
16
+ scheme.get("Title", "") + " " +
17
+ scheme.get("Description", "")
18
+ ).lower()
19
+
20
+ if not search_text:
21
+ return []
22
+
23
+ found_tags = []
24
+ # Check each of the user's original tags
25
+ for tag in user_tags_set:
26
+ # Use ' in ' for simple substring matching
27
+ if tag in search_text:
28
+ found_tags.append(tag)
29
+ return found_tags
30
+ # --- END NEW HELPER ---
31
+
32
+
33
+ # --- Hybrid Recommendation Logic ---
34
+
35
+ def _calculate_hybrid_score(scheme: dict, user_tags_set: set) -> float:
36
+ """
37
+ Calculates a hybrid recommendation score for a single scheme.
38
+
39
+ ASSUMPTIONS:
40
+ - scheme (dict): A scheme object.
41
+ - 'tags' (list[str]): Assumes scheme has a 'tags' key with a list of strings.
42
+ - 'popularity' (float): Assumes scheme has a 'popularity' key with a float (0.0 to 1.0).
43
+ If not present, defaults to 0.5.
44
+ """
45
+ # Define weights for each part of the hybrid model
46
+ WEIGHT_TAG_MATCH = 0.7 # 70% importance
47
+ WEIGHT_POPULARITY = 0.3 # 30% importance
48
+
49
+ # 1. Content-Based Score (Jaccard Similarity)
50
+ # Jaccard Similarity = (Intersection of tags) / (Union of tags)
51
+
52
+ # --- Assumption Handling ---
53
+ # Safely get tags, default to empty list if not present or wrong type
54
+ scheme_tags = scheme.get("tags", [])
55
+ if not isinstance(scheme_tags, list):
56
+ # FIX: Use 'Title' for logging, as 'id' may not exist
57
+ logger.warning(f"Scheme {scheme.get('Title', 'Unknown')} has invalid 'tags' format. Skipping.")
58
+ scheme_tags = []
59
+
60
+ scheme_tags_set = set(tag.lower() for tag in scheme_tags)
61
+ # --- End Assumption Handling ---
62
+
63
+ intersection = user_tags_set.intersection(scheme_tags_set)
64
+ union = user_tags_set.union(scheme_tags_set)
65
+
66
+ if not union:
67
+ tag_score = 0.0
68
+ else:
69
+ tag_score = len(intersection) / len(union)
70
+
71
+ # 2. Popularity-Based Score
72
+ # --- Assumption Handling ---
73
+ # Safely get popularity, default to 0.5 if not present or wrong type
74
+ popularity_score = scheme.get("popularity", 0.5)
75
+ if not isinstance(popularity_score, (int, float)):
76
+ # FIX: Use 'Title' for logging
77
+ logger.warning(f"Scheme {scheme.get('Title', 'Unknown')} has invalid 'popularity' format. Defaulting to 0.5.")
78
+ popularity_score = 0.5
79
+ # --- End Assumption Handling ---
80
+
81
+
82
+ # 3. Final Hybrid Score
83
+ final_score = (WEIGHT_TAG_MATCH * tag_score) + (WEIGHT_POPULARITY * popularity_score)
84
+
85
+ return final_score
86
+
87
def _prepare_candidate(scheme: dict, user_tags_set: set, source: str,
                       source_name: str, lang_found: str) -> dict:
    """Copy a cached scheme and stamp it with its origin metadata.

    If the scheme carries no 'tags', derive them dynamically from its
    Title/Description so tag matching still works on untagged data.
    """
    candidate = scheme.copy()
    if not candidate.get("tags"):
        candidate["tags"] = _generate_tags_from_scheme(candidate, user_tags_set)
    candidate["source"] = source              # 'state' or 'central'
    candidate["source_name"] = source_name    # state or ministry name
    candidate["lang_found"] = lang_found
    return candidate


def get_recommendations(user_tags: list[str], lang: str) -> list[dict]:
    """
    Generate a ranked list of scheme recommendations from both the state and
    central caches based on user tags.

    Args:
        user_tags: Tags describing the user's interests (matched case-insensitively).
        lang: ISO language code. NOTE: currently ignored — the search spans
            ALL languages present in the caches.

    Returns:
        Recommendation dicts sorted by 'final_score' (descending); an empty
        list when nothing matches or the caches are empty.
    """
    logger.info(f"Generating recommendations with tags={user_tags}. (NOTE: Ignoring lang='{lang}' and searching all languages)")

    # Read the cache variables at RUN-TIME *through* their modules so we see
    # the populated data, not a stale import-time snapshot.
    cached_all_schemes = scheme_service.cached_all_schemes
    _central_schemes_cache = central_services._central_schemes_cache

    all_schemes = []
    user_tags_set = set(tag.lower() for tag in user_tags)

    # Diagnostic logging: record what this function *sees* in the caches.
    logger.info(f"DIAGNOSTIC: State cache size: {len(cached_all_schemes)}")
    logger.info(f"DIAGNOSTIC: State cache keys: {list(cached_all_schemes.keys())}")
    logger.info(f"DIAGNOSTIC: Central cache size: {len(_central_schemes_cache)}")
    logger.info(f"DIAGNOSTIC: Central cache keys: {list(_central_schemes_cache.keys())}")

    # 1. Aggregate state schemes (the 'lang' parameter is ignored).
    try:
        # Cache shape: Dict[StateName, List[Scheme]].
        for state_name, state_schemes in cached_all_schemes.items():
            logger.info(f"DIAGNOSTIC: Processing state: {state_name}, found {len(state_schemes)} schemes.")

            # No definitive language key is stored per state; based on logs
            # ('Kannada schemes loaded') we assume Karnataka data is Kannada.
            lang_key = "unknown"
            if state_name.lower() == "karnataka":
                lang_key = "ka"  # HACK: based on user log

            if not isinstance(state_schemes, list):
                logger.warning(f"DIAGNOSTIC: Expected list of schemes for state '{state_name}', but got {type(state_schemes)}. Skipping.")
                continue

            for scheme in state_schemes:
                # Shared helper removes the copy/tag-generation duplication
                # that previously existed between this loop and the central one.
                all_schemes.append(
                    _prepare_candidate(scheme, user_tags_set, "state", state_name, lang_key)
                )
    except Exception as e:
        logger.error(f"Error processing state schemes cache: {e}")

    # 2. Aggregate central schemes (the 'lang' parameter is ignored).
    try:
        # Cache shape: Dict[LangKey, Dict[MinistryName, List[Scheme]]].
        for lang_key, central_lang_cache in _central_schemes_cache.items():
            # Per user request: skip Hindi entries entirely.
            if lang_key == "hi":
                continue

            logger.info(f"DIAGNOSTIC: Processing central lang: {lang_key}, found ministries: {len(central_lang_cache)}")

            if not isinstance(central_lang_cache, dict):
                logger.warning(f"DIAGNOSTIC: Expected dict of ministries for lang '{lang_key}', but got {type(central_lang_cache)}. Skipping.")
                continue

            for ministry_name, ministry_schemes in central_lang_cache.items():
                for scheme in ministry_schemes:
                    all_schemes.append(
                        _prepare_candidate(scheme, user_tags_set, "central", ministry_name, lang_key)
                    )
    except Exception as e:
        logger.error(f"Error processing central schemes cache: {e}")

    if not all_schemes:
        # FIX: dropped the pointless f-string prefix (message has no placeholders).
        logger.warning("No schemes found in cache across ANY language. Caches might be empty.")
        return []

    # 3. Score every aggregated scheme; keep only those with a tag overlap.
    recommendations = []
    for scheme in all_schemes:
        score = _calculate_hybrid_score(scheme, user_tags_set)

        # Dynamic tag generation above guarantees 'tags' exists when matchable.
        scheme_tags_set = set(tag.lower() for tag in scheme.get("tags", []))
        matched = user_tags_set.intersection(scheme_tags_set)
        if matched:
            recommendations.append({
                # 'Title'/'Description' match the cached scheme data shape.
                "name": scheme.get("Title", "Unnamed Scheme"),
                "description": scheme.get("Description", ""),
                "tags": scheme.get("tags", []),  # includes generated tags
                "source": scheme["source"],            # 'state' or 'central'
                "source_name": scheme["source_name"],  # state or ministry name
                "lang_found": scheme.get("lang_found", "unknown"),
                "matched_tags": list(matched),
                "final_score": round(score, 4),
            })

    # 4. Rank by final score, best first.
    sorted_recommendations = sorted(recommendations, key=lambda x: x["final_score"], reverse=True)

    logger.info(f"Found {len(sorted_recommendations)} matching recommendations.")
    return sorted_recommendations
requirements.txt CHANGED
@@ -1,22 +1,3 @@
1
- # SLIM requirements.txt
2
-
3
- # Core web framework
4
  fastapi
5
  uvicorn[standard]
6
-
7
- # Database & Cloud Services
8
- firebase-admin
9
- pinecone-client>=4.0.0
10
-
11
- # LLM & AI Libraries
12
- python-dotenv
13
- groq
14
- sentence-transformers
15
-
16
- # LangChain - with minimum versions to fix import errors
17
- langchain>=0.2.0
18
- langchain-core>=0.2.0
19
- langchain-community>=0.2.0
20
- langchain-groq>=0.1.5
21
- langchain-pinecone>=0.1.1
22
- langchain-huggingface>=0.0.3
 
 
 
 
1
  fastapi
2
  uvicorn[standard]
3
+ firebase-admin