Anmol4521 commited on
Commit
388aa42
·
verified ·
1 Parent(s): 359cedf

Upload 95 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +31 -0
  2. .env.example +8 -0
  3. .gitattributes +6 -0
  4. .gitignore +29 -0
  5. ARCHITECTURE.txt +330 -0
  6. Dockerfile +28 -0
  7. PROJECT_STRUCTURE.txt +387 -0
  8. README.md +238 -11
  9. agent_io/__init__.py +3 -0
  10. agent_io/benefit_io.py +117 -0
  11. agent_io/exam_io.py +115 -0
  12. agent_io/profiling_io.py +111 -0
  13. agent_io/scheme_io.py +116 -0
  14. agents/__init__.py +3 -0
  15. agents/benefit_agent.py +213 -0
  16. agents/document_agent.py +165 -0
  17. agents/exam_agent.py +138 -0
  18. agents/profiling_agent.py +149 -0
  19. agents/rag_agent.py +91 -0
  20. agents/scheme_agent.py +142 -0
  21. agents/search_agent.py +71 -0
  22. app.py +599 -0
  23. config.py +8 -0
  24. data/exams_pdfs/README.txt +13 -0
  25. data/exams_pdfs/exam.pdf +3 -0
  26. data/schemes_pdfs/Government Welfare Schemes & Policies - Disha Experts.pdf +3 -0
  27. data/schemes_pdfs/Government of India Welfare Schemes & Policies For Competitive Exams.pdf +3 -0
  28. data/schemes_pdfs/README.txt +12 -0
  29. data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf +3 -0
  30. graph/__init__.py +3 -0
  31. graph/workflow.py +319 -0
  32. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  33. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/58d4a9a45664eb9e12de9549c548c09b6134c17f.lock +0 -0
  34. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  35. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  36. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  37. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  38. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  39. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  40. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  41. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  42. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  43. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json +0 -0
  44. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json +0 -0
  45. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja +0 -0
  46. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db +3 -0
  47. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f +173 -0
  48. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/59d594003bf59880a884c574bf88ef7555bb0202 +4 -0
  49. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/72b987fd805cfa2b58c4c8c952b274a11bfd5a00 +24 -0
  50. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/952a9b81c0bfd99800fabf352f69c7ccd46c5e43 +20 -0
.dockerignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment
2
+ .env
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ # Python cache
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+
13
+ # Git
14
+ .git/
15
+ .gitignore
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+
21
+ # Documentation
22
+ *.md
23
+ ARCHITECTURE.txt
24
+ PROJECT_STRUCTURE.txt
25
+
26
+ # Outputs (will be generated)
27
+ outputs/*.json
28
+
29
+ # RAG indexes (build during deployment)
30
+ rag/scheme_index/
31
+ rag/exam_index/
.env.example ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ GROQ_API_KEY="your_groq_api_key_here"
2
+ TAVILY_API_KEY="your_tavily_api_key_here"
3
+ HF_TOKEN="your_huggingface_token_here"
4
+
5
+ # Skip vectorstores on memory-constrained platforms
6
+ # Set to "true" to use only web search (saves ~300MB RAM)
7
+ # Set to "false" to use FAISS vectorstores (for Hugging Face Spaces)
8
+ SKIP_VECTORSTORES="false"
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/exams_pdfs/exam.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/schemes_pdfs/Government[[:space:]]of[[:space:]]India[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]For[[:space:]]Competitive[[:space:]]Exams.pdf filter=lfs diff=lfs merge=lfs -text
39
+ data/schemes_pdfs/Government[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]-[[:space:]]Disha[[:space:]]Experts.pdf filter=lfs diff=lfs merge=lfs -text
40
+ hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db filter=lfs diff=lfs merge=lfs -text
41
+ rag/scheme_index/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment
2
+ .env
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+
13
+ # HuggingFace Cache (downloaded models)
14
+ hf_cache/
15
+
16
+ # RAG Indexes (now included for production)
17
+ # rag/scheme_index/
18
+ # rag/exam_index/
19
+
20
+ # Outputs
21
+ outputs/*.json
22
+
23
+ # IDE
24
+ .vscode/
25
+ .idea/
26
+
27
+ # Data files (optional - uncomment if PDFs are large)
28
+ # data/schemes_pdfs/*.pdf
29
+ # data/exams_pdfs/*.pdf
ARCHITECTURE.txt ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JanSahayak Architecture Overview
3
+ ================================
4
+
5
+ SYSTEM COMPONENTS
6
+ -----------------
7
+
8
+ 1. AGENTS (agents/)
9
+ - profiling_agent.py → User Profile Extraction
10
+ - scheme_agent.py → Government Scheme Recommendations
11
+ - exam_agent.py → Competitive Exam Recommendations
12
+ - search_agent.py → Live Web Search (Tavily)
13
+ - rag_agent.py → Vector Database Retrieval
14
+ - document_agent.py → PDF/Image Text Extraction
15
+ - benefit_agent.py → Missed Benefits Calculator
16
+
17
+ 2. PROMPTS (prompts/)
18
+ - profiling_prompt.py → User profiling instructions
19
+ - scheme_prompt.py → Scheme recommendation template
20
+ - exam_prompt.py → Exam recommendation template
21
+ - rag_prompt.py → RAG retrieval instructions
22
+
23
+ 3. RAG SYSTEM (rag/)
24
+ - embeddings.py → HuggingFace embeddings (CPU)
25
+ - scheme_vectorstore.py → FAISS store for schemes
26
+ - exam_vectorstore.py → FAISS store for exams
27
+
28
+ 4. TOOLS (tools/)
29
+ - tavily_tool.py → Live government website search
30
+
31
+ 5. WORKFLOW (graph/)
32
+ - workflow.py → LangGraph orchestration
33
+
34
+ 6. I/O HANDLERS (agent_io/)
35
+ - profiling_io.py → Profiling agent I/O
36
+ - scheme_io.py → Scheme agent I/O
37
+ - exam_io.py → Exam agent I/O
38
+ - benefit_io.py → Benefit agent I/O
39
+
40
+ 7. DATA (data/)
41
+ - schemes_pdfs/ → Government scheme PDFs
42
+ - exams_pdfs/ → Competitive exam PDFs
43
+
44
+ 8. OUTPUTS (outputs/)
45
+ - results_*.json → Generated analysis results
46
+
47
+ 9. CONFIGURATION
48
+ - config.py → Configuration loader
49
+ - .env → API keys (user creates)
50
+ - requirements.txt → Python dependencies
51
+
52
+ 10. ENTRY POINTS
53
+ - main.py → Main application
54
+ - setup.py → Setup wizard
55
+
56
+
57
+ WORKFLOW EXECUTION
58
+ ------------------
59
+
60
+ User Input
61
+
62
+ [Profiling Agent]
63
+
64
+ ├─→ [Scheme Agent] ──→ [Benefit Agent] ──┐
65
+ │ ↓ │
66
+ │ [RAG Search] │
67
+ │ ↓ │
68
+ │ [Tavily Search] │
69
+ │ │
70
+ └─→ [Exam Agent] ────────────────────────┤
71
+ ↓ │
72
+ [RAG Search] │
73
+ ↓ │
74
+ [Tavily Search] │
75
+
76
+ [Final Output]
77
+
78
+ [JSON Results File]
79
+
80
+
81
+ TECHNOLOGY STACK
82
+ ----------------
83
+
84
+ LLM & AI:
85
+ - Groq API (llama-3.3-70b-versatile) → Fast inference
86
+ - LangChain → Agent framework
87
+ - LangGraph → Workflow orchestration
88
+
89
+ Embeddings & Search:
90
+ - HuggingFace Transformers → sentence-transformers/all-MiniLM-L6-v2
91
+ - FAISS (CPU) → Vector similarity search
92
+
93
+ Web Search:
94
+ - Tavily API → Government website search
95
+
96
+ Document Processing:
97
+ - PyPDF → PDF text extraction
98
+ - Pytesseract → OCR for images
99
+ - Pillow → Image processing
100
+
101
+ Infrastructure:
102
+ - Python 3.8+
103
+ - CPU-only deployment (no GPU needed)
104
+ - PyTorch CPU version
105
+
106
+
107
+ DATA FLOW
108
+ ---------
109
+
110
+ 1. User Input Processing:
111
+ Raw Text → Profiling Agent → Structured JSON Profile
112
+
113
+ 2. Scheme Recommendation:
114
+ Profile → RAG Query → Vectorstore Search → Top-K Documents
115
+ Profile + Documents → Tavily Search (optional) → Web Results
116
+ Profile + Documents + Web Results → LLM → Recommendations
117
+
118
+ 3. Exam Recommendation:
119
+ Profile → RAG Query → Vectorstore Search → Top-K Documents
120
+ Profile + Documents → Tavily Search (optional) → Web Results
121
+ Profile + Documents + Web Results → LLM → Recommendations
122
+
123
+ 4. Benefit Calculation:
124
+ Profile + Scheme Recommendations → LLM → Missed Benefits Analysis
125
+
126
+ 5. Final Output:
127
+ All Results → JSON Compilation → File Save → User Display
128
+
129
+
130
+ API INTERACTIONS
131
+ ----------------
132
+
133
+ 1. Groq API:
134
+ - Used by: All LLM-powered agents
135
+ - Model: llama-3.3-70b-versatile
136
+ - Purpose: Natural language understanding & generation
137
+ - Rate: Per-request basis
138
+
139
+ 2. Tavily API:
140
+ - Used by: search_agent, scheme_agent, exam_agent
141
+ - Purpose: Live government website search
142
+ - Filter: .gov.in domains preferred
143
+ - Depth: Advanced search mode
144
+
145
+ 3. HuggingFace:
146
+ - Used by: embeddings module
147
+ - Model: sentence-transformers/all-MiniLM-L6-v2
148
+ - Purpose: Document embeddings for RAG
149
+ - Local: Runs on CPU, cached after first download
150
+
151
+
152
+ VECTORSTORE ARCHITECTURE
153
+ ------------------------
154
+
155
+ Scheme Vectorstore (rag/scheme_index/):
156
+ ├── index.faiss → FAISS index file
157
+ ├── index.pkl → Metadata pickle
158
+ └── [Embedded chunks from schemes_pdfs/]
159
+
160
+ Exam Vectorstore (rag/exam_index/):
161
+ ├── index.faiss → FAISS index file
162
+ ├── index.pkl → Metadata pickle
163
+ └── [Embedded chunks from exams_pdfs/]
164
+
165
+ Embedding Dimension: 384
166
+ Similarity Metric: Cosine similarity
167
+ Chunk Size: Auto (from PyPDF)
168
+
169
+
170
+ AGENT SPECIALIZATIONS
171
+ ---------------------
172
+
173
+ 1. Profiling Agent:
174
+ - Extraction-focused
175
+ - Low temperature (0.1)
176
+ - JSON output required
177
+ - No external tools
178
+
179
+ 2. Scheme Agent:
180
+ - RAG + Web search
181
+ - Temperature: 0.3
182
+ - Tools: Vectorstore, Tavily
183
+ - Output: Detailed scheme info
184
+
185
+ 3. Exam Agent:
186
+ - RAG + Web search
187
+ - Temperature: 0.3
188
+ - Tools: Vectorstore, Tavily
189
+ - Output: Detailed exam info
190
+
191
+ 4. Benefit Agent:
192
+ - Calculation-focused
193
+ - Temperature: 0.2
194
+ - No external tools
195
+ - Output: Financial analysis
196
+
197
+ 5. Search Agent:
198
+ - Web search only
199
+ - Tool: Tavily API
200
+ - Focus: .gov.in domains
201
+ - Output: Live search results
202
+
203
+ 6. RAG Agent:
204
+ - Vectorstore query only
205
+ - Tool: FAISS
206
+ - Similarity search
207
+ - Output: Relevant documents
208
+
209
+ 7. Document Agent:
210
+ - File processing
211
+ - Tools: PyPDF, Pytesseract
212
+ - Supports: PDF, Images
213
+ - Output: Extracted text
214
+
215
+
216
+ SECURITY & PRIVACY
217
+ ------------------
218
+
219
+ - API keys stored in .env (not committed to git)
220
+ - User data processed locally except LLM calls
221
+ - No data stored on external servers (except API providers)
222
+ - PDF data remains local
223
+ - Vectorstores are local
224
+ - Output files saved locally
225
+
226
+
227
+ SCALABILITY NOTES
228
+ -----------------
229
+
230
+ Current Setup (Single User):
231
+ - Synchronous workflow
232
+ - Local vectorstores
233
+ - CPU processing
234
+
235
+ Potential Scaling:
236
+ - Add Redis for caching
237
+ - Use cloud vectorstore (Pinecone, Weaviate)
238
+ - Parallel agent execution
239
+ - GPU acceleration for embeddings
240
+ - Database for user profiles
241
+ - API service deployment
242
+
243
+
244
+ ERROR HANDLING
245
+ --------------
246
+
247
+ Each agent includes:
248
+ - Try-catch blocks
249
+ - Error state tracking
250
+ - Graceful degradation
251
+ - Partial results on failure
252
+ - Error reporting in final output
253
+
254
+
255
+ MONITORING & LOGGING
256
+ --------------------
257
+
258
+ Current:
259
+ - Console print statements
260
+ - Agent start/completion messages
261
+ - Error messages
262
+ - Final output summary
263
+
264
+ Future Enhancement:
265
+ - Structured logging (logging module)
266
+ - Performance metrics
267
+ - API usage tracking
268
+ - User feedback collection
269
+
270
+
271
+ EXTENSIBILITY
272
+ -------------
273
+
274
+ Adding New Agent:
275
+ 1. Create agent file in agents/
276
+ 2. Add prompt template in prompts/
277
+ 3. Create node function in workflow.py
278
+ 4. Add node to graph
279
+ 5. Define edges (connections)
280
+ 6. Optional: Create I/O handler
281
+
282
+ Adding New Data Source:
283
+ 1. Create vectorstore module in rag/
284
+ 2. Add PDFs to data/ subdirectory
285
+ 3. Build vectorstore
286
+ 4. Create agent or modify existing
287
+
288
+ Adding New Tool:
289
+ 1. Create tool in tools/
290
+ 2. Import in agent
291
+ 3. Use in agent logic
292
+
293
+
294
+ PERFORMANCE BENCHMARKS (Typical)
295
+ ---------------------------------
296
+
297
+ Vectorstore Building:
298
+ - 10 PDFs: ~2-5 minutes
299
+ - 100 PDFs: ~20-30 minutes
300
+
301
+ Query Performance:
302
+ - Profiling: ~1-2 seconds
303
+ - RAG Search: ~0.5-1 second
304
+ - LLM Call: ~1-3 seconds
305
+ - Web Search: ~2-4 seconds
306
+ - Full Workflow: ~10-20 seconds
307
+
308
+ Memory Usage:
309
+ - Base: ~500 MB
310
+ - With models: ~2-3 GB
311
+ - With large PDFs: +500 MB per 100 PDFs
312
+
313
+
314
+ FUTURE ENHANCEMENTS
315
+ -------------------
316
+
317
+ 1. Multilingual Support (Hindi, regional languages)
318
+ 2. Voice input/output
319
+ 3. Mobile app integration
320
+ 4. Database for user history
321
+ 5. Notification system for deadlines
322
+ 6. Document upload interface
323
+ 7. Real-time scheme updates
324
+ 8. Community feedback integration
325
+ 9. State-specific customization
326
+ 10. Integration with government portals
327
+
328
+
329
+ END OF ARCHITECTURE DOCUMENT
330
+ """
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Spaces Dockerfile
2
+ FROM python:3.12-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ build-essential \
9
+ curl \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy requirements first for better caching
13
+ COPY requirements.txt .
14
+
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Copy application files
19
+ COPY . .
20
+
21
+ # Expose port 7860 (HuggingFace Spaces default)
22
+ EXPOSE 7860
23
+
24
+ # Set environment variable for port
25
+ ENV PORT=7860
26
+
27
+ # Run the application
28
+ CMD ["python", "app.py"]
PROJECT_STRUCTURE.txt ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ JanSahayak - Multi-Agent Government Intelligence System
2
+ ========================================================
3
+
4
+ 📦 JanSahayak/
5
+
6
+ ├── 📄 main.py # Main entry point
7
+ ├── 📄 setup.py # Setup wizard & utilities
8
+ ├── 📄 config.py # Configuration loader
9
+ ├── 📄 requirements.txt # Python dependencies
10
+
11
+ ├── 📄 README.md # Project overview
12
+ ├── 📄 USAGE_GUIDE.md # Comprehensive usage guide
13
+ ├── 📄 ARCHITECTURE.txt # System architecture
14
+
15
+ ├── 📄 .env.example # Example environment file
16
+ ├── 📄 .gitignore # Git ignore rules
17
+
18
+ ├── 📁 agents/ # Agent modules
19
+ │ ├── __init__.py
20
+ │ ├── profiling_agent.py # 🧾 User profiling
21
+ │ ├── scheme_agent.py # 🏛️ Scheme recommendations
22
+ │ ├── exam_agent.py # 🎓 Exam recommendations
23
+ │ ├── search_agent.py # 🔎 Web search (Tavily)
24
+ │ ├── rag_agent.py # 📚 RAG retrieval
25
+ │ ├── document_agent.py # 📂 Document processing
26
+ │ └── benefit_agent.py # 💰 Benefit calculator
27
+
28
+ ├── 📁 prompts/ # Prompt templates
29
+ │ ├── __init__.py
30
+ │ ├── profiling_prompt.py # Profiling instructions
31
+ │ ├── scheme_prompt.py # Scheme recommendation template
32
+ │ ├── exam_prompt.py # Exam recommendation template
33
+ │ └── rag_prompt.py # RAG retrieval template
34
+
35
+ ├── 📁 rag/ # RAG system
36
+ │ ├── __init__.py
37
+ │ ├── embeddings.py # HuggingFace embeddings
38
+ │ ├── scheme_vectorstore.py # Scheme FAISS store
39
+ │ ├── exam_vectorstore.py # Exam FAISS store
40
+ │ ├── scheme_index/ # Generated vectorstore
41
+ │ │ ├── index.faiss
42
+ │ │ └── index.pkl
43
+ │ └── exam_index/ # Generated vectorstore
44
+ │ ├── index.faiss
45
+ │ └── index.pkl
46
+
47
+ ├── 📁 tools/ # External tools
48
+ │ ├── __init__.py
49
+ │ └── tavily_tool.py # Tavily search integration
50
+
51
+ ├── 📁 graph/ # Workflow orchestration
52
+ │ ├── __init__.py
53
+ │ └── workflow.py # LangGraph workflow
54
+
55
+ ├── 📁 agent_io/ # Agent I/O handlers
56
+ │ ├── __init__.py
57
+ │ ├── profiling_io.py # Profiling I/O
58
+ │ ├── scheme_io.py # Scheme I/O
59
+ │ ├── exam_io.py # Exam I/O
60
+ │ └── benefit_io.py # Benefit I/O
61
+
62
+ ├── 📁 data/ # PDF data
63
+ │ ├── schemes_pdfs/ # Government scheme PDFs
64
+ │ │ └── README.txt
65
+ │ └── exams_pdfs/ # Competitive exam PDFs
66
+ │ └── README.txt
67
+
68
+ └── 📁 outputs/ # Generated results
69
+ ├── README.txt
70
+ └── results_*.json # Analysis results
71
+
72
+
73
+ KEY FILES DESCRIPTION
74
+ =====================
75
+
76
+ 📄 main.py
77
+ ----------
78
+ Main application entry point with:
79
+ - Interactive mode for user input
80
+ - File mode for batch processing
81
+ - Result saving and formatting
82
+ - Summary display
83
+
84
+ 📄 setup.py
85
+ -----------
86
+ Setup wizard that:
87
+ - Checks dependencies
88
+ - Verifies API keys
89
+ - Validates PDF data
90
+ - Builds vectorstores
91
+
92
+ 📄 config.py
93
+ ------------
94
+ Loads configuration from .env:
95
+ - GROQ_API_KEY
96
+ - TAVILY_API_KEY
97
+ - HF_TOKEN
98
+
99
+ 📁 agents/
100
+ ----------
101
+ 7 specialized agents:
102
+ 1. profiling_agent.py → Extract user profile
103
+ 2. scheme_agent.py → Recommend schemes
104
+ 3. exam_agent.py → Recommend exams
105
+ 4. search_agent.py → Live web search
106
+ 5. rag_agent.py → Vector search
107
+ 6. document_agent.py → Process PDFs/images
108
+ 7. benefit_agent.py → Calculate missed benefits
109
+
110
+ 📁 prompts/
111
+ -----------
112
+ Prompt engineering templates for:
113
+ - User profiling instructions
114
+ - Scheme recommendation format
115
+ - Exam recommendation format
116
+ - RAG retrieval guidance
117
+
118
+ 📁 rag/
119
+ -------
120
+ RAG (Retrieval Augmented Generation) system:
121
+ - embeddings.py → HuggingFace embeddings
122
+ - scheme_vectorstore.py → Scheme database
123
+ - exam_vectorstore.py → Exam database
124
+ - *_index/ → Generated FAISS indexes
125
+
126
+ 📁 tools/
127
+ ---------
128
+ External tool integrations:
129
+ - tavily_tool.py → Tavily API for government website search
130
+
131
+ 📁 graph/
132
+ ---------
133
+ LangGraph workflow orchestration:
134
+ - workflow.py → Defines agent connections and execution flow
135
+
136
+ 📁 agent_io/
137
+ ------------
138
+ Input/Output handlers for each agent:
139
+ - Separate I/O files for tracking
140
+ - JSON-based data exchange
141
+ - Timestamp tracking
142
+
143
+ 📁 data/
144
+ --------
145
+ Training data for RAG:
146
+ - schemes_pdfs/ → Government scheme documents
147
+ - exams_pdfs/ → Competitive exam documents
148
+
149
+ 📁 outputs/
150
+ -----------
151
+ Generated analysis results:
152
+ - results_YYYYMMDD_HHMMSS.json
153
+ - Contains all agent outputs
154
+
155
+
156
+ WORKFLOW VISUALIZATION
157
+ ======================
158
+
159
+ User Input (Text)
160
+
161
+ ┌───────────────┐
162
+ │ Profiling │
163
+ │ Agent │
164
+ └───────┬───────┘
165
+
166
+ Structured Profile
167
+
168
+ ┌───────────────┼───────────────┐
169
+ ↓ ↓
170
+ ┌───────────────┐ ┌───────────────┐
171
+ │ Scheme │ │ Exam │
172
+ │ Agent │ │ Agent │
173
+ └───────┬───────┘ └───────┬───────┘
174
+ │ │
175
+ ├─→ RAG Search ├─→ RAG Search
176
+ ├─→ Web Search └─→ Web Search
177
+ ↓ │
178
+ ┌───────────────┐ │
179
+ │ Benefit │ │
180
+ │ Agent │ │
181
+ └───────┬───────┘ │
182
+ │ │
183
+ └───────────────┬───────────────┘
184
+
185
+ ┌───────────────┐
186
+ │ Final │
187
+ │ Output │
188
+ └───────────────┘
189
+
190
+ JSON File
191
+
192
+
193
+ TECHNOLOGY COMPONENTS
194
+ =====================
195
+
196
+ 🧠 Brain (LLM)
197
+ - Groq API (llama-3.3-70b-versatile)
198
+ - Fast inference (<2s per call)
199
+ - Powers all agents
200
+
201
+ 📚 Memory (RAG)
202
+ - HuggingFace embeddings (all-MiniLM-L6-v2)
203
+ - FAISS vectorstore (CPU)
204
+ - Semantic search
205
+
206
+ 🔍 Live Search
207
+ - Tavily API
208
+ - Government website focus
209
+ - Real-time information
210
+
211
+ 🔗 Orchestration
212
+ - LangChain (agent framework)
213
+ - LangGraph (workflow)
214
+ - State management
215
+
216
+ 📄 Document Processing
217
+ - PyPDF (PDF extraction)
218
+ - Pytesseract (OCR)
219
+ - Pillow (image handling)
220
+
221
+
222
+ QUICK START CHECKLIST
223
+ ======================
224
+
225
+ □ 1. Install dependencies
226
+ pip install -r requirements.txt
227
+
228
+ □ 2. Create .env file
229
+ Copy .env.example to .env
230
+ Add GROQ_API_KEY and TAVILY_API_KEY
231
+
232
+ □ 3. Add PDF data
233
+ Place PDFs in data/schemes_pdfs/
234
+ Place PDFs in data/exams_pdfs/
235
+
236
+ □ 4. Run setup
237
+ python setup.py
238
+
239
+ □ 5. Build vectorstores
240
+ Automatic during setup, or:
241
+ python setup.py --build-vectorstores
242
+
243
+ □ 6. Run the system
244
+ python main.py
245
+
246
+
247
+ USAGE EXAMPLES
248
+ ==============
249
+
250
+ Interactive Mode:
251
+ -----------------
252
+ $ python main.py
253
+
254
+ Enter your details:
255
+ I am 25 years old, male, from Maharashtra.
256
+ My family income is 3 lakh per year.
257
+ I belong to OBC category.
258
+ I completed Bachelor's in Engineering.
259
+ I am unemployed and looking for government jobs.
260
+ I am interested in technical and banking sectors.
261
+
262
+ [Press Enter twice to submit]
263
+
264
+
265
+ File Mode:
266
+ ----------
267
+ $ python main.py user_input.txt
268
+
269
+
270
+ Testing Individual Agents:
271
+ ---------------------------
272
+ # Test profiling
273
+ python -m agents.profiling_agent
274
+
275
+ # Test scheme agent
276
+ python -m agents.scheme_agent
277
+
278
+ # Test exam agent
279
+ python -m agents.exam_agent
280
+
281
+
282
+ Building Vectorstores:
283
+ -----------------------
284
+ python setup.py --build-vectorstores
285
+
286
+ Or in Python:
287
+ from rag.scheme_vectorstore import build_scheme_vectorstore
288
+ from rag.exam_vectorstore import build_exam_vectorstore
289
+
290
+ build_scheme_vectorstore()
291
+ build_exam_vectorstore()
292
+
293
+
294
+ OUTPUT FORMAT
295
+ =============
296
+
297
+ Generated file: outputs/results_20260302_143022.json
298
+
299
+ {
300
+ "user_profile": {
301
+ "age": 25,
302
+ "gender": "Male",
303
+ "state": "Maharashtra",
304
+ "income": "300000",
305
+ "caste": "OBC",
306
+ "education": "Bachelor's in Engineering",
307
+ "employment_status": "Unemployed",
308
+ "interests": "Technical, Banking"
309
+ },
310
+ "scheme_recommendations": "...",
311
+ "exam_recommendations": "...",
312
+ "missed_benefits_analysis": "...",
313
+ "errors": []
314
+ }
315
+
316
+
317
+ SYSTEM REQUIREMENTS
318
+ ===================
319
+
320
+ ✅ Python 3.8 or higher
321
+ ✅ 4GB RAM minimum (8GB recommended)
322
+ ✅ 2GB storage for dependencies
323
+ ✅ Internet connection (for APIs)
324
+ ✅ CPU only (no GPU needed)
325
+
326
+
327
+ API KEYS REQUIRED
328
+ =================
329
+
330
+ 🔑 GROQ_API_KEY
331
+ Get from: https://console.groq.com/
332
+ Purpose: LLM inference
333
+ Cost: Free tier available
334
+
335
+ 🔑 TAVILY_API_KEY
336
+ Get from: https://tavily.com/
337
+ Purpose: Web search
338
+ Cost: Free tier available
339
+
340
+ 🔑 HF_TOKEN (Optional)
341
+ Get from: https://huggingface.co/settings/tokens
342
+ Purpose: Model downloads
343
+ Cost: Free
344
+
345
+
346
+ SUPPORT & DOCUMENTATION
347
+ ========================
348
+
349
+ 📖 Full Usage Guide: USAGE_GUIDE.md
350
+ 🏗️ Architecture Details: ARCHITECTURE.txt
351
+ ❓ Quick Start: README.md
352
+ 🐛 Troubleshooting: See USAGE_GUIDE.md
353
+
354
+ For issues:
355
+ 1. Check setup: python setup.py --check
356
+ 2. Verify .env file has correct API keys
357
+ 3. Ensure PDFs are in data/ directories
358
+ 4. Rebuild vectorstores if needed
359
+
360
+
361
+ PROJECT STATUS
362
+ ==============
363
+
364
+ ✅ Core System: Complete
365
+ ✅ All 7 Agents: Implemented
366
+ ✅ RAG System: Functional
367
+ ✅ Web Search: Integrated
368
+ ✅ Workflow: Orchestrated
369
+ ✅ I/O Handlers: Created
370
+ ✅ Documentation: Comprehensive
371
+
372
+ Ready for deployment and testing!
373
+
374
+
375
+ NEXT STEPS
376
+ ==========
377
+
378
+ 1. Add your API keys to .env
379
+ 2. Add government scheme and exam PDFs
380
+ 3. Run setup wizard
381
+ 4. Test the system
382
+ 5. Customize prompts as needed
383
+ 6. Add more PDF data over time
384
+ 7. Monitor and improve
385
+
386
+
387
+ Happy Analyzing! 🎉
README.md CHANGED
@@ -1,11 +1,238 @@
1
- ---
2
- title: Jansahayak
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: JanSahayak
3
+ emoji: 🙏
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # 🙏 JanSahayak - AI-Powered Government Schemes & Exams Assistant
11
+
12
+ > Your personal AI assistant for discovering government schemes and competitive exam opportunities in India
13
+
14
+ [![Hugging Face Spaces](https://img.shields.io/badge/🤗-Hugging%20Face-yellow)](https://huggingface.co/spaces)
15
+ [![Flask](https://img.shields.io/badge/Flask-2.3+-green)](https://flask.palletsprojects.com/)
16
+ [![LangChain](https://img.shields.io/badge/LangChain-Latest-blue)](https://www.langchain.com/)
17
+
18
+ ---
19
+
20
+ ## 🌟 Features
21
+
22
+ ### 🤖 Multi-Agent AI System
23
+ - **Profiling Agent**: Extracts structured user information
24
+ - **Scheme Agent**: Recommends relevant government schemes
25
+ - **Exam Agent**: Suggests competitive exams based on qualifications
26
+ - **RAG Agent**: Retrieves information from curated document database
27
+
28
+ ### 💡 Intelligent Capabilities
29
+ - ✅ Natural language understanding of user profiles
30
+ - ✅ Smart recommendations based on eligibility criteria
31
+ - ✅ RAG (Retrieval-Augmented Generation) with FAISS vectorstore
32
+ - ✅ Real-time web search via Tavily API
33
+ - ✅ PDF generation for saving recommendations
34
+ - ✅ Beautiful web interface with modern UI
35
+
36
+ ---
37
+
38
+ ## 🚀 Deploy to Hugging Face Spaces (Recommended)
39
+
40
+ ### Why Hugging Face Spaces?
41
+ - ✅ **16GB RAM for FREE** (perfect for RAG apps!)
42
+ - ✅ Built for ML/AI applications
43
+ - ✅ Git-based deployment
44
+ - ✅ Public URL instantly
45
+ - ✅ Persistent storage
46
+
47
+ ### Quick Deploy Steps:
48
+
49
+ **Method 1: Using HF CLI (Easiest)**
50
+
51
+ ```bash
52
+ # Install HF CLI
53
+ pip install huggingface_hub[cli]
54
+
55
+ # Login
56
+ huggingface-cli login
57
+
58
+ # Create Space and push
59
+ huggingface-cli repo create jansahayak --type space --space_sdk gradio
60
+ git remote add hf https://huggingface.co/spaces/YOUR_USERNAME/jansahayak
61
+ git push hf main
62
+ ```
63
+
64
+ **Method 2: Manual Setup**
65
+
66
+ 1. **Create Space** on [huggingface.co/spaces](https://huggingface.co/spaces)
67
+ - Click "Create new Space"
68
+ - Name: `jansahayak`
69
+ - SDK: Select "Gradio" (works with Flask)
70
+ - Hardware: CPU basic (Free - 16GB RAM!)
71
+ - License: MIT
72
+
73
+ 2. **Clone YOUR Space repo** (not GitHub!)
74
+ ```bash
75
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/jansahayak
76
+ cd jansahayak
77
+ ```
78
+
79
+ 3. **Copy your project files**
80
+ ```bash
81
+ # Copy all files from your JanSahayak folder to the cloned space folder
82
+ cp -r /path/to/JanSahayak/* .
83
+ ```
84
+
85
+ 4. **Add Environment Variables** (Space Settings → Variables and secrets)
86
+ ```
87
+ GROQ_API_KEY=your_groq_key
88
+ TAVILY_API_KEY=your_tavily_key
89
+ HF_TOKEN=your_hf_token (optional)
90
+ SKIP_VECTORSTORES=false
91
+ ```
92
+
93
+ 5. **Push to Space**
94
+ ```bash
95
+ git add .
96
+ git commit -m "Initial commit"
97
+ git push
98
+ ```
99
+
100
+ Your app will be live at: `https://huggingface.co/spaces/YOUR_USERNAME/jansahayak`
101
+
102
+ ### Important Notes:
103
+ - HF Spaces uses its own Git repo (not GitHub directly)
104
+ - App runs on port 7860 by default (Flask uses 5000, update if needed)
105
+ - First deployment may take 5-10 minutes to install dependencies
106
+ - Check Space logs if deployment fails
107
+
108
+ ---
109
+
110
+ ## 🛠️ Local Development
111
+
112
+ ```bash
113
+ # Clone and setup
114
+ git clone https://github.com/YOUR_USERNAME/JanSahayak.git
115
+ cd JanSahayak
116
+
117
+ # Create virtual environment
118
+ python -m venv .venv
119
+ source .venv/bin/activate # Linux/Mac
120
+ .venv\Scripts\activate # Windows
121
+
122
+ # Install dependencies
123
+ pip install -r requirements.txt
124
+
125
+ # Configure API keys
126
+ cp .env.example .env
127
+ # Edit .env with your keys
128
+
129
+ # Build vectorstores (optional - if you have PDFs)
130
+ python init_embeddings.py
131
+
132
+ # Run app
133
+ python app.py
134
+ # or use launcher scripts: start_web.bat (Windows) / ./start_web.sh (Linux/Mac)
135
+ ```
136
+
137
+ Visit `http://localhost:5000`
138
+
139
+ ---
140
+
141
+ ## 🔑 Get API Keys
142
+
143
+ | Service | URL | Free Tier | Used For |
144
+ |---------|-----|-----------|----------|
145
+ | **Groq** | [console.groq.com](https://console.groq.com) | ✅ Yes | LLM Inference |
146
+ | **Tavily** | [tavily.com](https://tavily.com) | 1000 searches/mo | Web Search |
147
+ | **HuggingFace** | [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) | ✅ Yes | Model Downloads |
148
+
149
+ ---
150
+
151
+ ## 💾 Adding Custom Documents
152
+
153
+ ### Government Schemes PDFs
154
+ 1. Place PDFs in `data/schemes_pdfs/`
155
+ 2. Run `python init_embeddings.py`
156
+ 3. Restart app
157
+
158
+ ### Exam Information PDFs
159
+ 1. Place PDFs in `data/exams_pdfs/`
160
+ 2. Run `python init_embeddings.py`
161
+ 3. Restart app
162
+
163
+ Automatically indexed and searchable via RAG!
164
+
165
+ ---
166
+
167
+ ## 🧪 Technology Stack
168
+
169
+ - **Backend**: Flask
170
+ - **AI**: LangChain + LangGraph
171
+ - **LLM**: Groq (Llama 3.3 70B)
172
+ - **Embeddings**: sentence-transformers/all-MiniLM-L6-v2
173
+ - **Vector DB**: FAISS (local)
174
+ - **Search**: Tavily API
175
+ - **Frontend**: HTML5 + CSS3 + JavaScript
176
+
177
+ ---
178
+
179
+ ## 📁 Project Structure
180
+
181
+ ```
182
+ JanSahayak/
183
+ ├── app.py # Flask web app
184
+ ├── main.py # CLI interface
185
+ ├── agents/ # AI agents
186
+ │ ├── profiling_agent.py
187
+ │ ├── scheme_agent.py
188
+ │ ├── exam_agent.py
189
+ │ └── rag_agent.py
190
+ ├── rag/ # RAG components
191
+ │ ├── embeddings.py
192
+ │ ├── scheme_vectorstore.py
193
+ │ └── exam_vectorstore.py
194
+ ├── data/ # Documents
195
+ │ ├── schemes_pdfs/
196
+ │ └── exams_pdfs/
197
+ ├── templates/ # HTML templates
198
+ └── static/ # CSS/JS
199
+ ```
200
+
201
+ ---
202
+
203
+ ## 🐛 Troubleshooting
204
+
205
+ **Memory issues on local machine?**
206
+ ```env
207
+ # Set in .env
208
+ SKIP_VECTORSTORES=true
209
+ ```
210
+ Uses web search only (no embeddings needed)
211
+
212
+ **Vectorstore errors?**
213
+ ```bash
214
+ rm -rf rag/scheme_index rag/exam_index
215
+ python init_embeddings.py
216
+ ```
217
+
218
+ ---
219
+
220
+ ## 🤝 Contributing
221
+
222
+ Contributions welcome! Fork → Create branch → Submit PR
223
+
224
+ ---
225
+
226
+ ## 📜 License
227
+
228
+ MIT License
229
+
230
+ ---
231
+
232
+ ## 🙏 Acknowledgments
233
+
234
+ Built with [LangChain](https://www.langchain.com/), [Groq](https://groq.com/), [Tavily](https://tavily.com/), and ❤️
235
+
236
+ ---
237
+
238
+ Made for the people of India 🇮🇳
agent_io/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Agent I/O Module Init
3
+ """
agent_io/benefit_io.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benefit Agent I/O Handler
3
+ Manages input/output for missed benefits calculator agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class BenefitIO:
12
+ """Handles input/output operations for benefit calculator agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/benefit_input.json",
15
+ output_file: str = "agent_io/benefit_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read benefit calculator input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, scheme_recommendations: str, years: int = 5):
41
+ """
42
+ Write input for benefit calculator
43
+
44
+ Args:
45
+ profile_data: User profile dictionary
46
+ scheme_recommendations: Eligible schemes text
47
+ years: Number of years to calculate (default: 5)
48
+ """
49
+ input_data = {
50
+ "timestamp": datetime.now().isoformat(),
51
+ "profile": profile_data,
52
+ "scheme_recommendations": scheme_recommendations,
53
+ "calculation_years": years,
54
+ "agent": "benefit_calculator"
55
+ }
56
+
57
+ with open(self.input_file, 'w', encoding='utf-8') as f:
58
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
59
+
60
+ def write_output(self, calculation: dict, metadata: dict = None):
61
+ """
62
+ Write benefit calculation to output file
63
+
64
+ Args:
65
+ calculation: Missed benefits calculation
66
+ metadata: Optional metadata about calculation
67
+ """
68
+ output_data = {
69
+ "timestamp": datetime.now().isoformat(),
70
+ "calculation": calculation,
71
+ "metadata": metadata or {},
72
+ "agent": "benefit_calculator"
73
+ }
74
+
75
+ with open(self.output_file, 'w', encoding='utf-8') as f:
76
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
77
+
78
+ def read_output(self) -> dict:
79
+ """
80
+ Read previous benefit calculations
81
+
82
+ Returns:
83
+ Previous calculations dictionary
84
+ """
85
+ try:
86
+ if os.path.exists(self.output_file):
87
+ with open(self.output_file, 'r', encoding='utf-8') as f:
88
+ return json.load(f)
89
+ else:
90
+ return {"error": "Output file not found"}
91
+ except Exception as e:
92
+ return {"error": str(e)}
93
+
94
+
95
+ if __name__ == "__main__":
96
+ # Test BenefitIO
97
+ io = BenefitIO()
98
+
99
+ # Sample input
100
+ profile = {
101
+ "age": 25,
102
+ "income": "300000"
103
+ }
104
+
105
+ schemes = "PM Kisan: ₹6000/year"
106
+
107
+ io.write_input(profile, schemes, years=5)
108
+ print("Input written successfully")
109
+
110
+ # Sample output
111
+ calculation = {
112
+ "total_missed": "₹30,000",
113
+ "breakdown": {"2022": "₹6000", "2023": "₹6000"}
114
+ }
115
+
116
+ io.write_output(calculation)
117
+ print("Output written successfully")
agent_io/exam_io.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Exam Agent I/O Handler
3
+ Manages input/output for exam recommendation agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class ExamIO:
12
+ """Handles input/output operations for exam agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/exam_input.json",
15
+ output_file: str = "agent_io/exam_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read exam agent input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, preferences: dict = None):
41
+ """
42
+ Write input for exam agent
43
+
44
+ Args:
45
+ profile_data: Student profile dictionary
46
+ preferences: Optional student preferences
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "profile": profile_data,
51
+ "preferences": preferences or {},
52
+ "agent": "exam_recommendation"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, recommendations: dict, metadata: dict = None):
59
+ """
60
+ Write exam recommendations to output file
61
+
62
+ Args:
63
+ recommendations: Exam recommendations from agent
64
+ metadata: Optional metadata about the recommendation process
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "recommendations": recommendations,
69
+ "metadata": metadata or {},
70
+ "agent": "exam_recommendation"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read previous exam recommendations
79
+
80
+ Returns:
81
+ Previous recommendations dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test ExamIO
95
+ io = ExamIO()
96
+
97
+ # Sample input
98
+ profile = {
99
+ "age": 25,
100
+ "education": "Bachelor's in Engineering",
101
+ "interests": "Technical jobs"
102
+ }
103
+
104
+ io.write_input(profile, {"exam_type": "government"})
105
+ print("Input written successfully")
106
+
107
+ # Sample output
108
+ recommendations = {
109
+ "exams": [
110
+ {"name": "SSC CGL", "eligibility": "Graduate"}
111
+ ]
112
+ }
113
+
114
+ io.write_output(recommendations, {"sources": 5})
115
+ print("Output written successfully")
agent_io/profiling_io.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Profiling Agent I/O Handler
3
+ Manages input/output for user profiling agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class ProfilingIO:
12
+ """Handles input/output operations for profiling agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/profiling_input.json",
15
+ output_file: str = "agent_io/profiling_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read profiling agent input from file
27
+
28
+ Returns:
29
+ Raw user input dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, user_input: str, documents: list = None):
41
+ """
42
+ Write raw user input for profiling
43
+
44
+ Args:
45
+ user_input: Raw text input from user
46
+ documents: Optional list of uploaded documents
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "user_input": user_input,
51
+ "documents": documents or [],
52
+ "agent": "user_profiling"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, profile_data: dict, confidence: dict = None):
59
+ """
60
+ Write extracted profile to output file
61
+
62
+ Args:
63
+ profile_data: Structured profile data
64
+ confidence: Optional confidence scores for extracted fields
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "profile": profile_data,
69
+ "confidence": confidence or {},
70
+ "agent": "user_profiling"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read extracted profile
79
+
80
+ Returns:
81
+ Structured profile dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test ProfilingIO
95
+ io = ProfilingIO()
96
+
97
+ # Sample input
98
+ user_text = "I am 25 years old from Maharashtra, OBC category, income 3 lakh."
99
+ io.write_input(user_text, documents=["resume.pdf"])
100
+ print("Input written successfully")
101
+
102
+ # Sample output
103
+ profile = {
104
+ "age": 25,
105
+ "state": "Maharashtra",
106
+ "caste": "OBC",
107
+ "income": "300000"
108
+ }
109
+
110
+ io.write_output(profile, confidence={"age": 1.0, "state": 1.0})
111
+ print("Output written successfully")
agent_io/scheme_io.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scheme Agent I/O Handler
3
+ Manages input/output for scheme recommendation agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class SchemeIO:
12
+ """Handles input/output operations for scheme agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/scheme_input.json",
15
+ output_file: str = "agent_io/scheme_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read scheme agent input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, preferences: dict = None):
41
+ """
42
+ Write input for scheme agent
43
+
44
+ Args:
45
+ profile_data: User profile dictionary
46
+ preferences: Optional user preferences
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "profile": profile_data,
51
+ "preferences": preferences or {},
52
+ "agent": "scheme_recommendation"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, recommendations: dict, metadata: dict = None):
59
+ """
60
+ Write scheme recommendations to output file
61
+
62
+ Args:
63
+ recommendations: Scheme recommendations from agent
64
+ metadata: Optional metadata about the recommendation process
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "recommendations": recommendations,
69
+ "metadata": metadata or {},
70
+ "agent": "scheme_recommendation"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read previous scheme recommendations
79
+
80
+ Returns:
81
+ Previous recommendations dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test SchemeIO
95
+ io = SchemeIO()
96
+
97
+ # Sample input
98
+ profile = {
99
+ "age": 25,
100
+ "income": "300000",
101
+ "state": "Maharashtra",
102
+ "caste": "OBC"
103
+ }
104
+
105
+ io.write_input(profile, {"priority": "high_benefit"})
106
+ print("Input written successfully")
107
+
108
+ # Sample output
109
+ recommendations = {
110
+ "schemes": [
111
+ {"name": "PM Kisan", "benefit": "₹6000/year"}
112
+ ]
113
+ }
114
+
115
+ io.write_output(recommendations, {"sources": 5})
116
+ print("Output written successfully")
agents/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Agents Module Init
3
+ """
agents/benefit_agent.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Missed Benefits Calculator Agent
3
+ Estimates potential benefits user might have missed
4
+ """
5
+
6
+ import json
7
+ from langchain_groq import ChatGroq
8
+ from langchain_core.messages import HumanMessage, SystemMessage
9
+ from config import GROQ_API_KEY
10
+
11
+
12
def get_llm():
    """Build the Groq chat model used for benefit estimation.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Low temperature keeps financial estimates conservative and stable
        temperature=0.2,
    )
    return llm
22
+
23
+
24
def calculate_missed_benefits(profile_data: dict, scheme_recommendations: str) -> dict:
    """
    Calculates potential benefits the user might have missed in the past

    Builds a detailed analyst prompt from the profile and the scheme
    recommendations, sends it to the Groq LLM, and returns the model's
    free-text calculation.

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text

    Returns:
        Dictionary with missed benefits calculation; on failure, a dict
        with an "error" key and a placeholder "calculation" string
    """
    try:
        llm = get_llm()

        # Serialize the profile verbatim so the LLM sees every field
        profile_str = json.dumps(profile_data, indent=2)

        prompt = f"""
        You are a financial analyst specializing in Indian government welfare schemes.

        Based on the user's profile and recommended schemes, calculate how much money/benefits
        they might have missed in the past 5 years by not applying to eligible schemes.

        **USER PROFILE:**
        {profile_str}

        **RECOMMENDED SCHEMES:**
        {scheme_recommendations}

        **ANALYSIS REQUIREMENTS:**

        1. **Identify Eligible Schemes:**
           - List schemes user was eligible for in past 5 years
           - Consider age, income, education criteria over time

        2. **Calculate Monetary Benefits:**
           - One-time payments missed
           - Annual recurring benefits missed
           - Subsidies or discounts not availed
           - Total missed amount (conservative estimate)

        3. **Non-Monetary Benefits:**
           - Training opportunities missed
           - Healthcare benefits not utilized
           - Educational scholarships lost
           - Employment opportunities missed

        4. **Year-wise Breakdown:**
           - Provide year-wise missed benefit estimate
           - Account for scheme start dates
           - Consider eligibility changes over time

        5. **Actionable Insights:**
           - Can any benefits be claimed retroactively?
           - Which schemes should be applied immediately?
           - Priority ranking for current applications

        **OUTPUT FORMAT:**

        ### Total Missed Benefits (Past 5 Years)
        - **Monetary Loss:** ₹[Amount]
        - **Non-Monetary Loss:** [Description]

        ### Year-wise Breakdown
        **2022:**
        - Scheme Name: ₹[Amount] | [Benefit Description]

        **2023:**
        - Scheme Name: ₹[Amount] | [Benefit Description]

        [Continue for all years]

        ### Retroactive Claims Possible
        - List schemes that allow backdated applications
        - Required documentation for backdated claims

        ### Immediate Action Items
        1. [Highest priority scheme to apply now]
        2. [Second priority scheme]
        3. [Third priority scheme]

        ### Future Projections
        If user applies now, estimated benefits over next 5 years: ₹[Amount]

        ---

        **IMPORTANT NOTES:**
        - Provide conservative estimates (lower bound)
        - Mark assumptions clearly
        - Only include verified government schemes
        - Consider state-specific schemes based on user's state
        - Factor in income bracket changes over time

        Proceed with calculation:
        """

        messages = [
            SystemMessage(content="You are a financial analyst for government welfare schemes. Provide realistic, conservative estimates."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        # NOTE(review): "calculation" is unstructured model text, not a
        # parsed breakdown — downstream consumers must treat it as prose.
        return {
            "calculation": response.content,
            "profile_considered": profile_data.get('age', 'N/A'),
            "schemes_analyzed": "Available in recommendations"
        }

    except Exception as e:
        return {
            "error": str(e),
            "calculation": "Unable to calculate missed benefits"
        }
137
+
138
+
139
def estimate_future_benefits(profile_data: dict, scheme_recommendations: str, years: int = 5) -> dict:
    """
    Estimates potential benefits over the next N years if user applies now

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text
        years: Number of years to project (default: 5)

    Returns:
        Dictionary with future benefits projection; on failure, a dict
        with an "error" key and a placeholder "projection" string
    """
    try:
        llm = get_llm()

        # Serialize the profile verbatim so the LLM sees every field
        profile_str = json.dumps(profile_data, indent=2)

        prompt = f"""
        Based on the user's current profile and eligible schemes, estimate the total benefits
        they can receive over the next {years} years if they apply immediately.

        **USER PROFILE:**
        {profile_str}

        **ELIGIBLE SCHEMES:**
        {scheme_recommendations}

        Provide:
        1. Year-wise projected benefits
        2. Total estimated benefits over {years} years
        3. Required actions to maximize benefits
        4. Key deadlines to watch

        Return structured calculation with conservative estimates.
        """

        messages = [
            SystemMessage(content="You are a financial projection analyst for government schemes."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        # "projection" is unstructured model text describing the estimate
        return {
            "projection": response.content,
            "years_projected": years,
            "profile_age": profile_data.get('age', 'N/A')
        }

    except Exception as e:
        return {
            "error": str(e),
            "projection": "Unable to estimate future benefits"
        }
193
+
194
+
195
if __name__ == "__main__":
    # Smoke test: runs a real LLM call, so GROQ_API_KEY must be configured
    test_profile = {
        "age": 25,
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "education": "Bachelor's in Engineering",
        "employment_status": "Unemployed"
    }

    test_schemes = """
    1. PM Kisan Samman Nidhi: ₹6000 per year
    2. Post Matric Scholarship (OBC): ₹5000-10000 per year
    3. Skill Development Scheme: Free training worth ₹20000
    """

    result = calculate_missed_benefits(test_profile, test_schemes)
    print(json.dumps(result, indent=2))
agents/document_agent.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Processing Agent
3
+ Handles PDF and image text extraction
4
+ """
5
+
6
+ import os
7
+ import pytesseract
8
+ from PIL import Image
9
+ from pypdf import PdfReader
10
+
11
+
12
def process_pdf(file_path: str) -> dict:
    """Extract the text of every page in a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with file_path/pages/text/success on success; on a missing
        file, {"error": ..., "text": ""}; on any other failure, a dict
        with error/file_path/text/success=False.
    """
    if not os.path.exists(file_path):
        return {"error": f"File not found: {file_path}", "text": ""}

    try:
        reader = PdfReader(file_path)

        # Collect each page's text with a page-separator header
        chunks = []
        for page_num, page in enumerate(reader.pages):
            chunks.append(f"\n--- Page {page_num + 1} ---\n{page.extract_text()}")

        return {
            "file_path": file_path,
            "pages": len(reader.pages),
            "text": "".join(chunks),
            "success": True,
        }

    except Exception as e:
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
47
+
48
+
49
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """Run Tesseract OCR over an image and return the recognized text.

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        Dict with file_path/image_size/text/success on success; on a
        missing file, {"error": ..., "text": ""}; on any other failure,
        a dict with error/file_path/text/success=False.
    """
    if not os.path.exists(file_path):
        return {"error": f"File not found: {file_path}", "text": ""}

    try:
        image = Image.open(file_path)
        recognized = pytesseract.image_to_string(image, lang=language)

        return {
            "file_path": file_path,
            "image_size": image.size,
            "text": recognized,
            "success": True,
        }

    except Exception as e:
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
81
+
82
+
83
def process_resume(file_path: str) -> dict:
    """Extract text from a resume (PDF or image) and tag simple features.

    Args:
        file_path: Path to resume file

    Returns:
        The extraction result dict; on success it additionally carries
        document_type, contains_email, and contains_phone flags.
    """
    extension = os.path.splitext(file_path)[1].lower()
    image_extensions = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}

    # Dispatch to the right extractor based on the file extension
    if extension == '.pdf':
        result = process_pdf(file_path)
    elif extension in image_extensions:
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {extension}",
            "text": "",
            "success": False,
        }

    if result.get("success"):
        # Basic resume parsing (can be enhanced)
        extracted = result["text"]
        result["document_type"] = "resume"
        result["contains_email"] = "@" in extracted
        # NOTE(review): any digit triggers this flag, not just phone numbers
        result["contains_phone"] = any(ch.isdigit() for ch in extracted)

    return result
114
+
115
+
116
def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """Process every matching document in a folder.

    Args:
        folder_path: Path to folder containing documents
        file_type: Type of files to process ("pdf" or "image")

    Returns:
        List of per-document result dicts; a single-element error list
        when the folder does not exist.
    """
    if not os.path.exists(folder_path):
        return [{"error": f"Folder not found: {folder_path}"}]

    # Unknown file_type values fall back to PDF handling
    valid_extensions = {
        "pdf": [".pdf"],
        "image": [".jpg", ".jpeg", ".png", ".tiff", ".bmp"],
    }.get(file_type, [".pdf"])

    handler = process_pdf if file_type == "pdf" else process_image

    results = []
    for filename in os.listdir(folder_path):
        if os.path.splitext(filename)[1].lower() in valid_extensions:
            results.append(handler(os.path.join(folder_path, filename)))

    return results
152
+
153
+
154
if __name__ == "__main__":
    # No runnable demo here: extraction needs real files, so this just
    # lists the public API of the module.
    # Note: You'll need to provide actual file paths to test

    # Example usage
    print("Document Processing Agent")
    print("=" * 50)
    print("Available functions:")
    print("1. process_pdf(file_path)")
    print("2. process_image(file_path)")
    print("3. process_resume(file_path)")
    print("4. batch_process_documents(folder_path, file_type)")
agents/exam_agent.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Exam Recommendation Agent
3
+ Provides competitive exam recommendations based on student profile
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from rag.exam_vectorstore import load_exam_vectorstore
11
+ from prompts.exam_prompt import EXAM_PROMPT
12
+ from tools.tavily_tool import government_focused_search
13
+ from config import GROQ_API_KEY
14
+
15
+
16
def get_llm():
    """Build the Groq chat model used for exam recommendations.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Slightly higher temperature for more varied recommendations
        temperature=0.3,
    )
    return llm
26
+
27
+
28
def run_exam_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends competitive exams based on student profile

    Combines two evidence sources — a FAISS vectorstore of exam PDFs
    (when supplied) and a live Tavily web search — and feeds both into
    the LLM via EXAM_PROMPT.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        Exam recommendations dictionary; on success "recommendations" is
        a text string, on exception it is an empty list.
    """
    try:
        # Use provided vectorstore or try to load it
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                Student Profile:
                Education: {profile_data.get('education', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Interests: {profile_data.get('interests', 'N/A')}
                Skills: {profile_data.get('skills', 'N/A')}
                Occupation: {profile_data.get('occupation', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} exam documents from vectorstore")
            except Exception as e:
                # Vectorstore failure is non-fatal — fall back to web search
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local exam database available. Using live web search."

        # Create profile string
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                education = profile_data.get('education', 'graduate')
                interests = profile_data.get('interests', 'government jobs')
                web_query = f"competitive exams India {education} {interests} eligibility 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                # Search failure is reported inside the prompt context itself
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve exam information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = EXAM_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert competitive exam advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        # NOTE(review): error path returns a list for "recommendations"
        # while the success path returns a string — callers must handle
        # both; consider unifying the types.
        return {
            "error": str(e),
            "recommendations": []
        }
125
+
126
+
127
if __name__ == "__main__":
    # Smoke test: requires GROQ_API_KEY; web search is disabled so only
    # the optional local vectorstore path would be exercised.
    test_profile = {
        "education": "Bachelor's in Engineering",
        "age": 25,
        "interests": "Technical jobs, government sector",
        "skills": "Programming, problem solving",
        "occupation": "Student"
    }

    result = run_exam_agent(test_profile, use_web_search=False)
    print(json.dumps(result, indent=2))
agents/profiling_agent.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User Profiling Agent
3
+ Extracts structured user information for eligibility matching
4
+ """
5
+
6
+ import json
7
+ from langchain_groq import ChatGroq
8
+ from langchain_core.messages import HumanMessage, SystemMessage
9
+ from prompts.profiling_prompt import PROFILING_PROMPT
10
+ from config import GROQ_API_KEY
11
+
12
+
13
def get_llm():
    """Build the Groq chat model used for structured profile extraction.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Low temperature keeps the JSON extraction deterministic
        temperature=0.1,
    )
    return llm
23
+
24
+
25
def extract_json_from_text(text: str) -> dict:
    """Best-effort extraction of a JSON object from raw LLM output.

    Tries, in order: a direct parse of the whole text, a fenced
    ```json code block, the widest {...} span, and finally any
    brace-balanced fragment. Returns the parsed object, or None when
    nothing parses.
    """
    import re

    def _try_parse(candidate: str):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    # 1) The whole text may already be valid JSON.
    parsed = _try_parse(text.strip())
    if parsed is not None:
        return parsed

    # 2) JSON wrapped in a markdown code fence.
    fenced = re.findall(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        parsed = _try_parse(fenced[0])
        if parsed is not None:
            return parsed

    # 3) Widest span from the first '{' to the last '}'.
    first = text.find('{')
    last = text.rfind('}')
    if first != -1 and last != -1 and last > first:
        parsed = _try_parse(text[first:last + 1])
        if parsed is not None:
            return parsed

    # 4) Any brace-balanced fragment (one nesting level deep).
    for fragment in re.findall(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL):
        parsed = _try_parse(fragment)
        if parsed is not None:
            return parsed

    return None
66
+
67
+
68
def run_profiling_agent(user_input: str) -> dict:
    """Turn free-form user text into a structured profile dict.

    Sends the text through the profiling prompt, parses the JSON the
    LLM returns, and normalizes the keys to snake_case. Falls back to
    a raw-text profile when no JSON can be recovered; returns an
    error dict on any exception.
    """
    try:
        llm = get_llm()

        messages = [
            SystemMessage(content="You are an expert user profiling agent. Return ONLY a valid JSON object, nothing else."),
            HumanMessage(content=PROFILING_PROMPT.format(user_input=user_input)),
        ]
        response = llm.invoke(messages)

        print(f"\n🤖 LLM Response (first 200 chars): {response.content[:200]}...")

        parsed = extract_json_from_text(response.content)

        if parsed:
            # Normalize keys: lowercase, spaces/hyphens -> underscores
            normalized = {
                key.lower().replace(' ', '_').replace('-', '_'): value
                for key, value in parsed.items()
            }
            print(f"✅ Profile extracted: {list(normalized.keys())}")
            return normalized

        # Fallback: no parseable JSON — keep the raw model output
        print("⚠️ Could not parse JSON, creating basic profile")
        return {
            "user_input": user_input,
            "raw_profile": response.content,
            "note": "Profile extraction incomplete. Using raw input.",
        }

    except Exception as e:
        print(f"❌ Profiling error: {str(e)}")
        return {
            "error": str(e),
            "user_input": user_input,
        }
119
+
120
+
121
def validate_profile(profile_data: dict) -> bool:
    """
    Validates that profile has minimum required information

    A required field counts as missing when it is absent, None, an
    empty string, or the placeholder "Not Provided" (case-insensitive).

    Args:
        profile_data: Profile dictionary

    Returns:
        True if valid, False otherwise
    """
    required_fields = ['age', 'state', 'education']

    for field in required_fields:
        value = profile_data.get(field)
        # Absent / None / "" are all treated as missing
        if value is None or value == "":
            return False
        # Tolerate casing and whitespace variants of the placeholder
        if isinstance(value, str) and value.strip().lower() == "not provided":
            return False

    return True
138
+
139
+
140
if __name__ == "__main__":
    # Smoke test: runs a real LLM call, so GROQ_API_KEY must be configured
    test_input = """
    I am a 25-year-old male from Maharashtra. I completed my Bachelor's in Engineering.
    My family income is around 3 lakh per year. I belong to the OBC category.
    I am currently unemployed and looking for government job opportunities.
    """

    result = run_profiling_agent(test_input)
    print(json.dumps(result, indent=2))
agents/rag_agent.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Retrieval Agent
3
+ Dedicated agent for vector database queries
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from rag.scheme_vectorstore import load_scheme_vectorstore
9
+ from rag.exam_vectorstore import load_exam_vectorstore
10
+
11
+
12
def run_rag_agent(query: str, database: str = "schemes", k: int = 5, vectorstore=None) -> dict:
    """
    Performs RAG retrieval from the specified vectorstore.

    Args:
        query: Search query
        database: "schemes" or "exams"
        k: Number of documents to retrieve
        vectorstore: Optional pre-loaded vectorstore. When provided it is
            used directly and nothing is loaded from disk (mirrors the
            optional-vectorstore pattern used by the scheme agent).

    Returns:
        On success: dict with "query", "database", "documents_found" and
        "documents" (each a dict with id/content/metadata/source).
        On failure: dict with "error" and an empty "documents" list.
    """
    try:
        if vectorstore is None:
            if database == "schemes":
                vectorstore = load_scheme_vectorstore()
            elif database == "exams":
                vectorstore = load_exam_vectorstore()
            else:
                return {
                    "error": f"Invalid database: {database}. Use 'schemes' or 'exams'",
                    "documents": []
                }

        # Similarity search against the chosen store
        docs = vectorstore.similarity_search(query, k=k)

        # Normalize LangChain Document objects into plain dicts
        formatted_docs = []
        for i, doc in enumerate(docs):
            formatted_docs.append({
                "id": i + 1,
                "content": doc.page_content,
                "metadata": doc.metadata,
                "source": doc.metadata.get('source', 'Unknown')
            })

        return {
            "query": query,
            "database": database,
            "documents_found": len(formatted_docs),
            "documents": formatted_docs
        }

    except FileNotFoundError:
        # Raised by the loaders when the on-disk index is missing
        return {
            "error": f"Vectorstore not found for {database}. Please build it first.",
            "documents": []
        }
    except Exception as e:
        return {
            "error": str(e),
            "documents": []
        }
65
+
66
+
67
def hybrid_rag_search(query: str, k: int = 3) -> dict:
    """
    Searches both the scheme and the exam databases.

    Args:
        query: Search query
        k: Number of documents per database

    Returns:
        Combined results from both databases
    """
    return {
        "query": query,
        "scheme_results": run_rag_agent(query, database="schemes", k=k),
        "exam_results": run_rag_agent(query, database="exams", k=k),
    }
86
+
87
+
88
if __name__ == "__main__":
    # Manual smoke test against the scheme vectorstore
    output = run_rag_agent("agricultural schemes for farmers", database="schemes", k=3)
    print(json.dumps(output, indent=2))
agents/scheme_agent.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scheme Recommendation Agent
3
+ Provides RAG-based government scheme recommendations
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from rag.scheme_vectorstore import load_scheme_vectorstore
11
+ from prompts.scheme_prompt import SCHEME_PROMPT
12
+ from tools.tavily_tool import government_focused_search
13
+ from config import GROQ_API_KEY
14
+
15
+
16
def get_llm():
    """Initialize Groq LLM.

    Raises:
        ValueError: when GROQ_API_KEY is missing from the environment.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        temperature=0.3,
    )
    return llm
26
+
27
+
28
def run_scheme_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends government schemes based on user profile.

    Combines (optional) RAG retrieval from a pre-loaded FAISS vectorstore
    with (optional) live Tavily web search, then asks the Groq LLM to
    produce recommendations from the combined context.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        On success: dict with "recommendations" (LLM text), "sources_used"
        (number of retrieved documents) and "web_search_used".
        On failure: dict with "error" and an empty "recommendations" list.
    """
    try:
        # Context assembled from the local vectorstore, if available
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                User Profile:
                Income: {profile_data.get('income', 'N/A')}
                Caste: {profile_data.get('caste', 'N/A')}
                State: {profile_data.get('state', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Gender: {profile_data.get('gender', 'N/A')}
                Employment: {profile_data.get('employment_status', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} scheme documents from vectorstore")
            except Exception as e:
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local scheme database available. Using live web search."

        # Create profile string for the prompt
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                state = profile_data.get('state', 'India')
                caste = profile_data.get('caste', '')
                web_query = f"government schemes India {state} {caste} eligibility benefits 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                # Best-effort: web search failure should not abort the agent
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve scheme information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = SCHEME_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert government scheme advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        return {
            "error": str(e),
            "recommendations": []
        }
127
+
128
+
129
if __name__ == "__main__":
    # Manual smoke test with a representative profile (no web search)
    sample_profile = {
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "age": 25,
        "gender": "Male",
        "employment_status": "Unemployed",
        "education": "Bachelor's in Engineering",
    }

    print(json.dumps(run_scheme_agent(sample_profile, use_web_search=False), indent=2))
agents/search_agent.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Search Agent
3
+ Uses Tavily to search government websites for real-time information
4
+ """
5
+
6
+ from tools.tavily_tool import tavily_search, government_focused_search
7
+
8
+
9
def run_search_agent(query: str, government_only: bool = True) -> dict:
    """
    Performs web search for government information.

    Args:
        query: Search query
        government_only: If True, restricts to .gov.in domains

    Returns:
        Search results dictionary; on failure includes an "error" key and
        an empty "results" list.
    """
    try:
        # Pick the search backend based on the domain restriction flag
        search_fn = government_focused_search if government_only else tavily_search

        return {
            "query": query,
            "results": search_fn(query),
            "government_only": government_only
        }

    except Exception as e:
        return {
            "query": query,
            "error": str(e),
            "results": []
        }
38
+
39
+
40
def search_scheme_details(scheme_name: str) -> dict:
    """
    Search for specific scheme details.

    Args:
        scheme_name: Name of the government scheme

    Returns:
        Scheme details from official sources
    """
    detail_query = f"{scheme_name} official website application process eligibility"
    return run_search_agent(detail_query, government_only=True)
52
+
53
+
54
def search_exam_details(exam_name: str) -> dict:
    """
    Search for specific exam details.

    Args:
        exam_name: Name of the competitive exam

    Returns:
        Exam details from official sources
    """
    detail_query = f"{exam_name} official notification eligibility exam pattern 2026"
    return run_search_agent(detail_query, government_only=True)
66
+
67
+
68
if __name__ == "__main__":
    # Manual smoke test against official (.gov.in) sources
    output = run_search_agent("pradhan mantri kisan samman nidhi yojana", government_only=True)
    print(output)
app.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JanSahayak Flask Web Application
3
+ Beautiful UI for Multi-Agent Government Intelligence System
4
+ """
5
+
6
+ from flask import Flask, render_template, request, jsonify, session, send_file
7
+ import json
8
+ import os
9
+ from datetime import datetime
10
+ from graph.workflow import run_workflow
11
+ import uuid
12
+ import io
13
+ import re
14
+ from reportlab.lib.pagesizes import letter, A4
15
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
16
+ from reportlab.lib.units import inch
17
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
18
+ from reportlab.lib import colors
19
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
20
+
21
+ app = Flask(__name__)
22
+ app.secret_key = os.urandom(24) # For session management
23
+
24
+ # Store active sessions
25
+ sessions = {}
26
+
27
+ # Global vectorstores (loaded on first use for faster startup)
28
+ SCHEME_VECTORSTORE = None
29
+ EXAM_VECTORSTORE = None
30
+ VECTORSTORES_INITIALIZED = False
31
+
32
+ # Check if running on a memory-constrained platform
33
+ SKIP_VECTORSTORES = os.environ.get('SKIP_VECTORSTORES', 'false').lower() == 'true'
34
+
35
+
36
def initialize_vectorstores():
    """Load vectorstores lazily on first use to avoid blocking port binding.

    Mutates the module-level globals SCHEME_VECTORSTORE / EXAM_VECTORSTORE
    and sets VECTORSTORES_INITIALIZED so subsequent calls are no-ops.
    When SKIP_VECTORSTORES is set (memory-constrained deployments), both
    stores stay None and the agents fall back to web search only.
    Load failures are tolerated: the corresponding store stays None.
    """
    global SCHEME_VECTORSTORE, EXAM_VECTORSTORE, VECTORSTORES_INITIALIZED

    # Idempotence guard: only the first caller does the work.
    if VECTORSTORES_INITIALIZED:
        return  # Already initialized

    # Skip vectorstore loading on memory-constrained platforms (use web search only)
    if SKIP_VECTORSTORES:
        print("\n" + "="*70)
        print("⚡ LIGHTWEIGHT MODE: Skipping vectorstore loading")
        print("="*70)
        print("✅ Using Tavily web search only (no embeddings model)")
        print("✅ Low memory usage (<200MB)")
        print("✅ Real-time, up-to-date information")
        print("="*70 + "\n")
        SCHEME_VECTORSTORE = None
        EXAM_VECTORSTORE = None
        VECTORSTORES_INITIALIZED = True
        return

    print("\n" + "="*70)
    print("📚 Initializing Vector Stores (lazy loading)")
    print("="*70)

    # Load scheme vectorstore (import deferred so startup stays fast)
    try:
        from rag.scheme_vectorstore import load_scheme_vectorstore
        SCHEME_VECTORSTORE = load_scheme_vectorstore()
        print("✅ Scheme vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Scheme vectorstore not available: {str(e)}")
        print("   Will use web search only for schemes")
        SCHEME_VECTORSTORE = None

    # Load exam vectorstore (same deferred-import pattern)
    try:
        from rag.exam_vectorstore import load_exam_vectorstore
        EXAM_VECTORSTORE = load_exam_vectorstore()
        print("✅ Exam vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Exam vectorstore not available: {str(e)}")
        print("   Will use web search only for exams")
        EXAM_VECTORSTORE = None

    # Mark done even if one or both loads failed — we never retry.
    VECTORSTORES_INITIALIZED = True
    print("="*70 + "\n")
+
84
+
85
def format_markdown(text):
    """Convert markdown-style text to HTML (headers, emphasis, lists, paragraphs)."""
    if not text or not isinstance(text, str):
        return text

    import re

    # Regex conversions applied in order: headers first, bold before
    # italic (so ** is consumed before single *), then bullet items.
    conversions = [
        (r'###\s+(.+?)(?=\n|$)', r'<h4>\1</h4>', 0),
        (r'##\s+(.+?)(?=\n|$)', r'<h3>\1</h3>', 0),
        (r'\*\*(.+?)\*\*', r'<strong>\1</strong>', 0),
        (r'\*(.+?)\*', r'<em>\1</em>', 0),
        (r'^[\-\*]\s+(.+)$', r'<li>\1</li>', re.MULTILINE),
        (r'(<li>.*?</li>)', r'<ul>\1</ul>', re.DOTALL),
    ]
    for pattern, replacement, flags in conversions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Merge consecutive single-item lists produced by the <ul> wrap above
    text = text.replace('</ul>\n<ul>', '\n')

    # Blank lines separate paragraphs; remaining newlines become soft breaks
    text = text.replace('\n\n', '</p><p>')
    text = text.replace('\n', '<br>')

    # Wrap bare text in a paragraph when it doesn't already start with a tag
    if not text.startswith('<'):
        text = f'<p>{text}</p>'

    return text
116
+
117
+
118
+ # Register Jinja filter
119
+ app.jinja_env.filters['format_markdown'] = format_markdown
120
+
121
+
122
@app.route('/')
def index():
    """Render the landing page with the user-details input form."""
    return render_template('index.html')
126
+
127
+
128
@app.route('/about')
def about():
    """Render the static About page."""
    return render_template('about.html')
132
+
133
+
134
@app.route('/health')
def health():
    """Health check endpoint for monitoring.

    Reports service status plus which API keys are present (booleans only,
    never the key values themselves).
    """
    from config import GROQ_API_KEY, TAVILY_API_KEY, HF_TOKEN

    key_status = {
        'groq': bool(GROQ_API_KEY),
        'tavily': bool(TAVILY_API_KEY),
        'hf_token': bool(HF_TOKEN)
    }

    return jsonify({
        'status': 'ok',
        'service': 'JanSahayak',
        'api_keys_configured': key_status
    })
148
+
149
+
150
@app.route('/analyze', methods=['POST'])
def analyze():
    """Process user input and run the multi-agent workflow.

    Expects a JSON body with "user_input" (free text) and optionally
    "structured_data" (form fields). Returns the workflow result plus a
    session id, and persists the result to outputs/.
    """
    try:
        # First check if API keys are configured
        from config import GROQ_API_KEY, TAVILY_API_KEY

        if not GROQ_API_KEY or GROQ_API_KEY == "":
            return jsonify({
                'success': False,
                'error': 'GROQ_API_KEY is not configured. Please set environment variables on Render.'
            }), 500

        if not TAVILY_API_KEY or TAVILY_API_KEY == "":
            return jsonify({
                'success': False,
                'error': 'TAVILY_API_KEY is not configured. Please set environment variables on Render.'
            }), 500

        # Initialize vectorstores lazily on first request
        initialize_vectorstores()

        # Get user input. request.get_json(silent=True) returns None instead
        # of raising when the body is missing or not JSON, so a bad request
        # gets a clean 400 below rather than a 500.
        payload = request.get_json(silent=True) or {}
        user_input = payload.get('user_input', '')
        structured_data = payload.get('structured_data', None)

        # Guard against non-string input as well as empty input
        if not isinstance(user_input, str) or not user_input.strip():
            return jsonify({
                'success': False,
                'error': 'Please provide your details'
            }), 400

        # Generate session ID
        session_id = str(uuid.uuid4())

        # Store in session (including structured data if available)
        sessions[session_id] = {
            'status': 'processing',
            'input': user_input,
            'structured_data': structured_data,
            'started_at': datetime.now().isoformat()
        }

        # Extract user interests from structured data
        user_interests = structured_data.get('interests', ['schemes', 'exams']) if structured_data else ['schemes', 'exams']

        # Prepare structured profile if available
        structured_profile = None
        if structured_data:
            structured_profile = {
                'name': structured_data.get('name', 'Not Provided'),
                'age': structured_data.get('age', 'Not Provided'),
                'gender': structured_data.get('gender', 'Not Provided'),
                'state': structured_data.get('state', 'Not Provided'),
                'education': structured_data.get('education', 'Not Provided'),
                'employment_status': structured_data.get('employment', 'Not Provided'),
                'income': structured_data.get('income', 'Not Provided'),
                'caste': structured_data.get('category', 'Not Provided'),
                'specialization': structured_data.get('specialization', 'Not Provided'),
                'career_interest': structured_data.get('career_interest', 'Not Provided'),
                'interests': structured_data.get('interests', [])
            }

        # Run workflow with interests, structured profile, and pre-loaded vectorstores
        result = run_workflow(
            user_input,
            user_interests,
            structured_profile,
            scheme_vectorstore=SCHEME_VECTORSTORE,
            exam_vectorstore=EXAM_VECTORSTORE
        )

        # Ensure user_profile key exists in result
        if 'user_profile' not in result and 'profile' in result:
            result['user_profile'] = result['profile']

        # Update session
        sessions[session_id]['status'] = 'completed'
        sessions[session_id]['result'] = result
        sessions[session_id]['completed_at'] = datetime.now().isoformat()

        # Save to file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"outputs/results_{timestamp}.json"
        os.makedirs('outputs', exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return jsonify({
            'success': True,
            'session_id': session_id,
            'result': result,
            'filename': filename
        })

    except ImportError as e:
        print(f"Import Error in /analyze: {str(e)}")
        return jsonify({
            'success': False,
            'error': f'Configuration error: {str(e)}. Please ensure all dependencies are installed.'
        }), 500
    except TimeoutError as e:
        print(f"Timeout Error in /analyze: {str(e)}")
        return jsonify({
            'success': False,
            'error': 'Request timed out. The analysis is taking longer than expected. Please try again.'
        }), 504
    except Exception as e:
        print(f"Error in /analyze: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'error': f'An error occurred during analysis: {str(e)}'
        }), 500
266
+
267
+
268
@app.route('/result/<session_id>')
def result(session_id):
    """Display results page for a completed analysis session."""
    session_data = sessions.get(session_id)

    if session_data is None:
        return render_template('error.html',
                               error='Session not found'), 404

    if session_data['status'] != 'completed':
        return render_template('error.html',
                               error='Analysis still in progress'), 400

    return render_template('results.html',
                           session_id=session_id,
                           session_data=session_data,
                           result=session_data['result'])
285
+
286
+
287
@app.route('/api/status/<session_id>')
def status(session_id):
    """Check analysis status for a session id."""
    session_data = sessions.get(session_id)
    if session_data is None:
        return jsonify({'error': 'Session not found'}), 404

    return jsonify(session_data)
294
+
295
+
296
@app.route('/history')
def history():
    """View analysis history (up to the 10 most recent saved result files).

    Unreadable or corrupt JSON files are skipped instead of failing the
    whole page.
    """
    output_files = []

    if os.path.exists('outputs'):
        files = [f for f in os.listdir('outputs') if f.endswith('.json')]
        files.sort(reverse=True)

        for filename in files[:10]:  # Show last 10
            filepath = os.path.join('outputs', filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                # One bad file should not break the history listing
                print(f"⚠️ Skipping unreadable history file {filename}: {e}")
                continue
            output_files.append({
                'filename': filename,
                'timestamp': filename.replace('results_', '').replace('.json', ''),
                'profile': data.get('user_profile', {}),
                'errors': data.get('errors', [])
            })

    return render_template('history.html', files=output_files)
317
+
318
+
319
@app.route('/api/file/<filename>')
def get_file(filename):
    """Return a saved result file as JSON.

    The filename comes from the URL, so it is restricted to a bare name
    inside outputs/ to prevent path traversal.
    """
    try:
        # Reject anything containing directory components (e.g. "../config.py")
        if os.path.basename(filename) != filename or filename in ('.', '..'):
            return jsonify({'error': 'Invalid filename'}), 400

        filepath = os.path.join('outputs', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return jsonify(data)
    except Exception as e:
        return jsonify({'error': str(e)}), 404
329
+
330
+
331
@app.route('/download/pdf/<session_id>')
def download_pdf(session_id):
    """Generate and download PDF report.

    Builds an in-memory reportlab PDF from the stored session result:
    title/greeting, user-profile table, then the scheme, exam and
    missed-benefits sections (markdown stripped to plain text), plus any
    workflow error notices and a footer disclaimer. Returns the PDF as an
    attachment named after the user; any failure returns a 500 JSON error.
    """
    try:
        # Unknown session ids get a JSON 404, matching the status endpoint.
        if session_id not in sessions:
            return jsonify({'error': 'Session not found'}), 404

        session_data = sessions[session_id]
        result = session_data.get('result', {})

        # Create PDF in memory
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter,
                                rightMargin=72, leftMargin=72,
                                topMargin=72, bottomMargin=18)

        # Container for PDF elements
        elements = []

        # Define styles
        styles = getSampleStyleSheet()
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            textColor=colors.HexColor('#5B21B6'),
            spaceAfter=30,
            alignment=TA_CENTER
        )
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading2'],
            fontSize=16,
            textColor=colors.HexColor('#7C3AED'),
            spaceAfter=12,
            spaceBefore=12
        )
        # NOTE(review): this mutates the shared 'BodyText' style from the
        # sample stylesheet rather than copying it — confirm this is intended.
        normal_style = styles['BodyText']
        normal_style.alignment = TA_JUSTIFY

        # Get user name for personalization; fall back to 'Citizen' when
        # the profiling agent left the placeholder.
        profile = result.get('user_profile', {})
        user_name = profile.get('name', 'Citizen')
        if user_name and user_name != 'Not Provided':
            user_name = user_name.strip()
        else:
            user_name = 'Citizen'

        # Title with logo-like header
        elements.append(Paragraph("🇮🇳 JanSahayak", title_style))
        elements.append(Paragraph("Government Benefits Analysis Report", styles['Heading3']))
        elements.append(Spacer(1, 0.2*inch))

        # Personalized greeting
        greeting = ParagraphStyle('Greeting', parent=styles['Normal'], fontSize=14,
                                  textColor=colors.HexColor('#374151'), spaceBefore=6, spaceAfter=12)
        elements.append(Paragraph(f"<b>Prepared for: {user_name}</b>", greeting))

        # Timestamp
        timestamp = datetime.now().strftime("%B %d, %Y at %I:%M %p")
        elements.append(Paragraph(f"<i>Generated: {timestamp}</i>", styles['Normal']))

        # Separator line
        elements.append(Spacer(1, 0.2*inch))
        elements.append(Table([['_'*100]], colWidths=[6.5*inch]))
        elements.append(Spacer(1, 0.4*inch))

        # User Profile Section
        elements.append(Paragraph("Your Profile", heading_style))
        profile = result.get('user_profile', {})

        if profile:
            # Build [label, value] rows, skipping internal keys and
            # unfilled placeholder values.
            profile_data = []
            for key, value in profile.items():
                if key not in ['raw_profile', 'user_input', 'error', 'note'] and value != 'Not Provided':
                    label = key.replace('_', ' ').title()
                    # Format interests list properly
                    if key == 'interests' and isinstance(value, list):
                        value = ', '.join([v.title() for v in value])
                    profile_data.append([Paragraph(f"<b>{label}:</b>", normal_style),
                                         Paragraph(str(value), normal_style)])

            if profile_data:
                profile_table = Table(profile_data, colWidths=[2.2*inch, 4.3*inch])
                profile_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#EEF2FF')),  # Label column
                    ('BACKGROUND', (1, 0), (1, -1), colors.white),  # Value column
                    ('TEXTCOLOR', (0, 0), (-1, -1), colors.HexColor('#1F2937')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                    ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),  # Bold labels
                    ('FONTNAME', (1, 0), (1, -1), 'Helvetica'),
                    ('FONTSIZE', (0, 0), (-1, -1), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, -1), 10),
                    ('TOPPADDING', (0, 0), (-1, -1), 10),
                    ('LEFTPADDING', (0, 0), (-1, -1), 12),
                    ('RIGHTPADDING', (0, 0), (-1, -1), 12),
                    ('GRID', (0, 0), (-1, -1), 1, colors.HexColor('#D1D5DB')),
                    ('ROWBACKGROUNDS', (0, 0), (-1, -1), [colors.white, colors.HexColor('#F9FAFB')]),
                ]))
                elements.append(profile_table)

        elements.append(Spacer(1, 0.4*inch))

        # Helper function to clean and format text:
        # strips HTML tags and markdown markers; returns None when the
        # section was not requested (caller then omits the section).
        def clean_text(text):
            if not text or not isinstance(text, str):
                return "No information available"
            # Skip if "Not requested by user"
            if "Not requested by user" in text:
                return None
            # Remove HTML tags
            text = re.sub(r'<[^>]+>', '', text)
            # Convert markdown headers to regular text with proper spacing
            text = re.sub(r'###\s+(.+)', r'\n\1\n', text)
            text = re.sub(r'##\s+(.+)', r'\n\1\n', text)
            # Clean up bold markers
            text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
            # Clean up bullet points
            text = re.sub(r'^\*\s+', '\u2022 ', text, flags=re.MULTILINE)
            text = re.sub(r'^-\s+', '\u2022 ', text, flags=re.MULTILINE)
            return text.strip()

        # Section style for better visual separation
        # NOTE(review): section_box_style is defined but never applied below —
        # confirm whether it should style the section paragraphs.
        section_box_style = ParagraphStyle(
            'SectionBox',
            parent=normal_style,
            leftIndent=20,
            rightIndent=20,
            spaceBefore=6,
            spaceAfter=6,
            borderColor=colors.HexColor('#E5E7EB'),
            borderWidth=1,
            borderPadding=10,
            backColor=colors.HexColor('#F9FAFB')
        )

        # Government Schemes Section
        schemes_text = clean_text(result.get('scheme_recommendations', 'No recommendations available'))
        if schemes_text:
            # NOTE(review): "\ud83c\udfdb\ufe0f" is a lone-surrogate escape pair in
            # Python source, not a joined emoji; if the real file does not contain
            # an actual emoji here, UTF-8 encoding of this string will fail — confirm.
            elements.append(Paragraph("\ud83c\udfdb\ufe0f Government Schemes for You", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            # Split into paragraphs and add with better formatting
            paragraphs = [p.strip() for p in schemes_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

            elements.append(Spacer(1, 0.2*inch))

        # Competitive Exams Section
        exams_text = clean_text(result.get('exam_recommendations', 'No recommendations available'))
        if exams_text:
            elements.append(Paragraph("\ud83c\udf93 Competitive Exams for You", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            paragraphs = [p.strip() for p in exams_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

            elements.append(Spacer(1, 0.2*inch))

        # Missed Benefits Section
        benefits_text = clean_text(result.get('missed_benefits_analysis', 'No analysis available'))
        if benefits_text:
            elements.append(Paragraph("\ud83d\udcca Missed Benefits Analysis", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            paragraphs = [p.strip() for p in benefits_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

        # Errors (if any)
        errors = result.get('errors', [])
        if errors:
            elements.append(Spacer(1, 0.3*inch))
            elements.append(Paragraph("Notices", heading_style))
            for error in errors:
                elements.append(Paragraph(f"• {error}", normal_style))

        # Footer with disclaimer
        elements.append(Spacer(1, 0.5*inch))

        # Add separator before footer
        elements.append(Table([['_'*100]], colWidths=[6.5*inch]))
        elements.append(Spacer(1, 0.2*inch))

        footer_style = ParagraphStyle('Footer', parent=styles['Normal'],
                                      fontSize=9, textColor=colors.HexColor('#6B7280'),
                                      alignment=TA_CENTER)
        elements.append(Paragraph(
            "<i>This report is generated by JanSahayak AI system. "
            "For official information and application procedures, "
            "please visit the respective government ministry websites or contact local government offices.</i>",
            footer_style
        ))
        elements.append(Spacer(1, 0.1*inch))
        elements.append(Paragraph(
            "<i>Generated by JanSahayak - Your Government Benefits Assistant</i>",
            footer_style
        ))

        # Build PDF
        doc.build(elements)

        # Prepare response: rewind the in-memory buffer before sending
        buffer.seek(0)

        # Create filename with user's name (strip unsafe characters)
        safe_name = re.sub(r'[^a-zA-Z0-9\s]', '', user_name).replace(' ', '_')
        timestamp_str = datetime.now().strftime("%Y%m%d")
        filename = f'JanSahayak_{safe_name}_{timestamp_str}.pdf'

        return send_file(
            buffer,
            as_attachment=True,
            download_name=filename,
            mimetype='application/pdf'
        )

    except Exception as e:
        print(f"PDF Generation Error: {str(e)}")
        return jsonify({'error': str(e)}), 500
560
+
561
+
562
if __name__ == '__main__':
    # Entry point for running the Flask development server directly.
    # Vectorstores are deliberately NOT loaded here — initialize_vectorstores()
    # runs lazily on the first /analyze request so the port binds immediately.

    # Get port from environment variable (for deployment platforms)
    port = int(os.environ.get('PORT', 5000))

    # Check if running in production (anything except FLASK_ENV=development)
    is_production = os.environ.get('FLASK_ENV') != 'development'

    print("\n" + "="*70)
    print("🙏 JANSAHAYAK - Starting Web Server")
    print("="*70)

    # Check API keys on startup (warn only; the server still starts)
    from config import GROQ_API_KEY, TAVILY_API_KEY, HF_TOKEN

    if not GROQ_API_KEY or GROQ_API_KEY == "":
        print("⚠️ WARNING: GROQ_API_KEY is not set!")
        print("   The application will not work without this API key.")
    else:
        print("✅ GROQ_API_KEY is configured")

    if not TAVILY_API_KEY or TAVILY_API_KEY == "":
        print("⚠️ WARNING: TAVILY_API_KEY is not set!")
        print("   The application will not work without this API key.")
    else:
        print("✅ TAVILY_API_KEY is configured")

    if not HF_TOKEN or HF_TOKEN == "":
        print("⚠️ WARNING: HF_TOKEN is not set (optional but recommended)")
    else:
        print("✅ HF_TOKEN is configured")

    print(f"\n📱 Starting Flask server on port {port}...")
    print(f"🌍 Environment: {'Production' if is_production else 'Development'}")
    print("🔄 Vectorstores will be loaded on first request")
    print("🛑 Press CTRL+C to stop the server\n")

    # Start Flask FIRST to bind to port, then load vectorstores in background
    app.run(debug=not is_production, host='0.0.0.0', port=port, threaded=True)
config.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the environment
# (no-op when the file is absent, e.g. on hosted deployments).
load_dotenv()

# API credentials read from the environment; each is None when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")      # Groq LLM access
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")  # Tavily web search
HF_TOKEN = os.getenv("HF_TOKEN")              # Hugging Face hub (optional)
data/exams_pdfs/README.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Placeholder for competitive exam PDFs
2
+ # Add your competitive exam PDF files to this directory
3
+
4
+ # Examples of exams to add:
5
+ # - UPSC (Civil Services, NDA, CDS)
6
+ # - SSC (CGL, CHSL, MTS, JE)
7
+ # - Banking (IBPS, SBI PO/Clerk, RBI)
8
+ # - Railways (RRB NTPC, ALP, Group D)
9
+ # - State PSC exams
10
+ # - Defense exams (NDA, CDS, AFCAT)
11
+ # - Teaching exams (CTET, TET)
12
+
13
+ # Download official notifications and syllabi from exam conducting bodies
data/exams_pdfs/exam.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae2c52c533fe29a081d8fe477d079e1d2e610aa398be5a8324f63b583c5beacf
3
+ size 149005
data/schemes_pdfs/Government Welfare Schemes & Policies - Disha Experts.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad6e1fb3d26677250c6597ca9ed83f24000f8c062529f7188b693839f0c6ade9
3
+ size 2410388
data/schemes_pdfs/Government of India Welfare Schemes & Policies For Competitive Exams.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11f0608bcece884567ea3e98720c8e557d32d4fe203f3f1dde5356fcf39f7ee7
3
+ size 2387327
data/schemes_pdfs/README.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Placeholder for government scheme PDFs
2
+ # Add your government scheme PDF files to this directory
3
+
4
+ # Examples of schemes to add:
5
+ # - PM Kisan Samman Nidhi
6
+ # - Ayushman Bharat
7
+ # - PM Awas Yojana
8
+ # - Skill Development Schemes
9
+ # - Scholarships (SC/ST/OBC/Minority)
10
+ # - State-specific schemes
11
+
12
+ # Download official PDFs from government websites (.gov.in domains)
data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4f7683bac6e79e923ac9441191f073cdbb67c41fcf84d5b401b02ce51520648
3
+ size 511889
graph/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Graph Module Init
3
+ """
graph/workflow.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Workflow
3
+ Orchestrates multi-agent system using LangGraph
4
+ """
5
+
6
+ from typing import TypedDict, Annotated
7
+ from langgraph.graph import StateGraph, END
8
+ import operator
9
+
10
+
11
class AgentState(TypedDict):
    """
    State object that gets passed between agents.
    Contains all intermediate and final results of the workflow.
    """
    # --- Input ---
    user_input: str         # raw free-text description from the user
    user_interests: list    # subset of ['schemes', 'exams']

    # --- Pre-loaded vectorstores ---
    scheme_vectorstore: object  # FAISS vectorstore or None
    exam_vectorstore: object    # FAISS vectorstore or None

    # --- Profiling Agent output ---
    profile: dict           # structured user profile (may be pre-filled from a form)

    # --- Scheme Agent output ---
    scheme_recommendations: str

    # --- Exam Agent output ---
    exam_recommendations: str

    # --- Benefit Agent output ---
    missed_benefits: str

    # --- Final compiled output ---
    final_output: dict

    # Error tracking: operator.add lets parallel nodes append their error
    # lists instead of overwriting each other when the graph merges state.
    errors: Annotated[list, operator.add]
41
+
42
+
43
def profiling_node(state: AgentState) -> dict:
    """
    Node: User Profiling Agent.
    Extracts a structured profile from the raw user input, reusing any
    pre-extracted form data when it is rich enough to skip the LLM call.
    """
    from agents.profiling_agent import run_profiling_agent

    bookkeeping = ('raw_profile', 'user_input', 'error', 'note')
    placeholders = ['Not Provided', 'N/A', '', None]

    try:
        form_profile = state.get("profile", {})

        # Count fields that carry actual values (not placeholders/bookkeeping).
        filled = [
            key for key in form_profile.keys()
            if key not in bookkeeping and form_profile[key] not in placeholders
        ]

        # Rich enough form data -> no need to run the LLM profiler at all.
        if len(filled) >= 3:
            print("\n✅ Using pre-extracted profile data (skipping LLM profiling)")
            return {"profile": form_profile}

        print("\n🔍 Running Profiling Agent...")
        extracted = run_profiling_agent(state.get("user_input", ""))

        # On key collisions, the form-provided values win over LLM output.
        if form_profile:
            extracted = {**extracted, **form_profile}

        # Only "error" (plus at most one other key) means extraction failed.
        if "error" in extracted and len(extracted) <= 2:
            print("❌ Profile extraction failed, using fallback data")
            return {
                "profile": form_profile if form_profile else {},
                "errors": ["Profiling failed: " + extracted.get("error", "Unknown error")]
            }

        print("✅ Profile extracted successfully")
        return {"profile": extracted}

    except Exception as e:
        print(f"❌ Profiling Agent Error: {str(e)}")
        form_profile = state.get("profile", {})
        return {
            "profile": form_profile if form_profile else {},
            "errors": [f"Profiling: {str(e)}"]
        }
86
+
87
+
88
def scheme_node(state: AgentState) -> dict:
    """
    Node: Scheme Recommendation Agent.
    Produces government-scheme recommendations for the extracted profile.
    """
    from agents.scheme_agent import run_scheme_agent

    try:
        # Skip entirely when the user did not ask for schemes.
        wants = state.get("user_interests", ["schemes", "exams"])
        if "schemes" not in wants:
            print("\n⏭️ Skipping Scheme Agent (not requested)")
            return {"scheme_recommendations": "Not requested by user"}

        print("\n🏛️ Running Scheme Recommendation Agent...")
        profile = state.get("profile", {})
        store = state.get("scheme_vectorstore", None)

        # Count profile fields holding real values (ignore placeholders
        # and bookkeeping keys).
        meaningful = [
            field for field in profile.keys()
            if field not in ['raw_profile', 'user_input', 'error', 'note']
            and profile[field] not in ['Not Provided', 'N/A', '', None]
        ]

        if not profile or len(meaningful) < 2:
            print(f"⚠️ Limited profile data ({len(meaningful)} fields), will rely more on web search")
        else:
            print(f"✅ Profile has {len(meaningful)} useful fields")

        result = run_scheme_agent(profile, use_web_search=True, vectorstore=store)
        print("✅ Scheme recommendations generated")
        return {"scheme_recommendations": result.get("recommendations", "")}

    except Exception as e:
        print(f"❌ Scheme Agent Error: {str(e)}")
        return {
            "scheme_recommendations": f"Error generating recommendations: {str(e)}",
            "errors": [f"Scheme: {str(e)}"]
        }
126
+
127
+
128
def exam_node(state: AgentState) -> dict:
    """
    Node: Exam Recommendation Agent.
    Recommends competitive exams based on the extracted profile.
    """
    from agents.exam_agent import run_exam_agent

    try:
        # Check if user wants exam recommendations
        interests = state.get("user_interests", ["schemes", "exams"])
        if "exams" not in interests:
            print("\n⏭️ Skipping Exam Agent (not requested)")
            return {"exam_recommendations": "Not requested by user"}

        print("\n🎓 Running Exam Recommendation Agent...")
        profile = state.get("profile", {})
        exam_vectorstore = state.get("exam_vectorstore", None)

        # Check if profile has useful data.
        # FIX: mirror scheme_node/profiling_node and also exclude placeholder
        # values ('Not Provided', 'N/A', '', None) — previously a profile
        # consisting only of placeholders was counted as "useful".
        useful_fields = [k for k in profile.keys()
                         if k not in ['raw_profile', 'user_input', 'error', 'note']
                         and profile[k] not in ['Not Provided', 'N/A', '', None]]

        if not profile or len(useful_fields) < 2:
            print("⚠️ Insufficient profile data, using web search only")
            # Still try with whatever we have

        result = run_exam_agent(profile, use_web_search=True, vectorstore=exam_vectorstore)
        print("✅ Exam recommendations generated")
        return {"exam_recommendations": result.get("recommendations", "")}

    except Exception as e:
        print(f"❌ Exam Agent Error: {str(e)}")
        return {
            "exam_recommendations": f"Error generating recommendations: {str(e)}",
            "errors": [f"Exam: {str(e)}"]
        }
163
+
164
+
165
def benefit_node(state: AgentState) -> dict:
    """
    Node: Missed Benefits Calculator Agent.
    Estimates benefits the user may be missing, based on the profile and
    the scheme recommendations produced upstream.
    """
    from agents.benefit_agent import calculate_missed_benefits

    try:
        print("\n💰 Running Benefit Calculator Agent...")
        profile = state.get("profile", {})
        schemes_text = state.get("scheme_recommendations", "")

        # Both inputs are required; bail out early when either is empty.
        if not (profile and schemes_text):
            print("⚠️ Insufficient data for benefit calculation")
            return {"missed_benefits": "Insufficient data"}

        outcome = calculate_missed_benefits(profile, schemes_text)
        print("✅ Benefit calculation completed")
        return {"missed_benefits": outcome.get("calculation", "")}

    except Exception as e:
        print(f"❌ Benefit Agent Error: {str(e)}")
        return {
            "missed_benefits": "",
            "errors": [f"Benefit: {str(e)}"]
        }
191
+
192
+
193
def output_node(state: AgentState) -> dict:
    """
    Node: Final Output Compiler.
    Gathers every agent's result from the state into one response dict.
    """
    print("\n📊 Compiling Final Output...")

    # (output key, state key, default when missing)
    field_map = (
        ("user_profile", "profile", {}),
        ("scheme_recommendations", "scheme_recommendations", ""),
        ("exam_recommendations", "exam_recommendations", ""),
        ("missed_benefits_analysis", "missed_benefits", ""),
        ("errors", "errors", []),
    )
    final_output = {out: state.get(src, default) for out, src, default in field_map}

    print("✅ Final output ready")

    return {"final_output": final_output}
211
+
212
+
213
def build_workflow():
    """
    Builds the LangGraph workflow.

    Topology:
        profiling -> (scheme, exam) -> benefit -> output -> END

    Returns:
        Compiled workflow graph
    """
    graph = StateGraph(AgentState)

    # Register every agent node under its graph name.
    for node_name, node_fn in (
        ("profiling", profiling_node),
        ("scheme", scheme_node),
        ("exam", exam_node),
        ("benefit", benefit_node),
        ("output", output_node),
    ):
        graph.add_node(node_name, node_fn)

    # Profiling runs first, fans out to scheme and exam; both converge on
    # benefit (which runs after both complete), then output finishes.
    graph.set_entry_point("profiling")
    graph.add_edge("profiling", "scheme")
    graph.add_edge("profiling", "exam")
    graph.add_edge("scheme", "benefit")
    graph.add_edge("exam", "benefit")
    graph.add_edge("benefit", "output")
    graph.add_edge("output", END)

    return graph.compile()
250
+
251
+
252
def run_workflow(user_input: str, user_interests: list = None, structured_profile: dict = None,
                 scheme_vectorstore=None, exam_vectorstore=None) -> dict:
    """
    Runs the complete multi-agent workflow.

    Args:
        user_input: Raw user input text
        user_interests: List of interests ['schemes', 'exams']
        structured_profile: Pre-extracted profile data from form (optional)
        scheme_vectorstore: Pre-loaded scheme vectorstore (optional)
        exam_vectorstore: Pre-loaded exam vectorstore (optional)

    Returns:
        Final compiled output dictionary
    """
    banner = "=" * 60
    print(banner)
    print("🚀 Starting JanSahayak Multi-Agent System")
    print(banner)

    if user_interests:
        print(f"🎯 User Interests: {', '.join(user_interests)}")
    if structured_profile:
        print("📋 Using structured profile data from form")
    if scheme_vectorstore:
        print("📚 Using pre-loaded scheme vectorstore")
    if exam_vectorstore:
        print("📚 Using pre-loaded exam vectorstore")

    # Seed the graph state and run the compiled workflow end-to-end.
    initial_state = {
        "user_input": user_input,
        "user_interests": user_interests or ["schemes", "exams"],
        "profile": structured_profile if structured_profile else {},
        "scheme_vectorstore": scheme_vectorstore,
        "exam_vectorstore": exam_vectorstore,
        "errors": []
    }
    result = build_workflow().invoke(initial_state)

    print("\n" + banner)
    print("✅ Workflow Completed")
    print(banner)

    return result.get("final_output", {})
303
+
304
+
305
if __name__ == "__main__":
    import json

    # Quick manual smoke test for the full workflow.
    test_input = """
    I am a 25-year-old male from Maharashtra. I completed my Bachelor's in Engineering.
    My family income is around 3 lakh per year. I belong to the OBC category.
    I am currently unemployed and looking for government job opportunities.
    I am interested in technical positions and government jobs.
    """

    result = run_workflow(test_input)

    print("\n📄 Final Result:")
    print("=" * 60)
    print(json.dumps(result, indent=2, ensure_ascii=False))
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/58d4a9a45664eb9e12de9549c548c09b6134c17f.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
3
+ size 90868376
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ library_name: sentence-transformers
5
+ tags:
6
+ - sentence-transformers
7
+ - feature-extraction
8
+ - sentence-similarity
9
+ - transformers
10
+ datasets:
11
+ - s2orc
12
+ - flax-sentence-embeddings/stackexchange_xml
13
+ - ms_marco
14
+ - gooaq
15
+ - yahoo_answers_topics
16
+ - code_search_net
17
+ - search_qa
18
+ - eli5
19
+ - snli
20
+ - multi_nli
21
+ - wikihow
22
+ - natural_questions
23
+ - trivia_qa
24
+ - embedding-data/sentence-compression
25
+ - embedding-data/flickr30k-captions
26
+ - embedding-data/altlex
27
+ - embedding-data/simple-wiki
28
+ - embedding-data/QQP
29
+ - embedding-data/SPECTER
30
+ - embedding-data/PAQ_pairs
31
+ - embedding-data/WikiAnswers
32
+ pipeline_tag: sentence-similarity
33
+ ---
34
+
35
+
36
+ # all-MiniLM-L6-v2
37
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
38
+
39
+ ## Usage (Sentence-Transformers)
40
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
41
+
42
+ ```
43
+ pip install -U sentence-transformers
44
+ ```
45
+
46
+ Then you can use the model like this:
47
+ ```python
48
+ from sentence_transformers import SentenceTransformer
49
+ sentences = ["This is an example sentence", "Each sentence is converted"]
50
+
51
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
52
+ embeddings = model.encode(sentences)
53
+ print(embeddings)
54
+ ```
55
+
56
+ ## Usage (HuggingFace Transformers)
57
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
58
+
59
+ ```python
60
+ from transformers import AutoTokenizer, AutoModel
61
+ import torch
62
+ import torch.nn.functional as F
63
+
64
+ #Mean Pooling - Take attention mask into account for correct averaging
65
+ def mean_pooling(model_output, attention_mask):
66
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
67
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
68
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
69
+
70
+
71
+ # Sentences we want sentence embeddings for
72
+ sentences = ['This is an example sentence', 'Each sentence is converted']
73
+
74
+ # Load model from HuggingFace Hub
75
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
76
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
77
+
78
+ # Tokenize sentences
79
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
80
+
81
+ # Compute token embeddings
82
+ with torch.no_grad():
83
+ model_output = model(**encoded_input)
84
+
85
+ # Perform pooling
86
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
87
+
88
+ # Normalize embeddings
89
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
90
+
91
+ print("Sentence embeddings:")
92
+ print(sentence_embeddings)
93
+ ```
94
+
95
+ ------
96
+
97
+ ## Background
98
+
99
+ The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
100
+ contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
101
+ 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
102
+
103
+ We developed this model during the
104
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
105
+ organized by Hugging Face. We developed this model as part of the project:
106
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
107
+
108
+ ## Intended uses
109
+
110
+ Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
111
+ the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
112
+
113
+ By default, input text longer than 256 word pieces is truncated.
114
+
115
+
116
+ ## Training procedure
117
+
118
+ ### Pre-training
119
+
120
+ We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
121
+
122
+ ### Fine-tuning
123
+
124
+ We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
125
+ We then apply the cross entropy loss by comparing with true pairs.
126
+
127
+ #### Hyper parameters
128
+
129
+ We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
130
+ We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
131
+ a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
132
+
133
+ #### Training data
134
+
135
+ We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
136
+ We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
137
+
138
+
139
+ | Dataset | Paper | Number of training tuples |
140
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
141
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
142
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
143
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
144
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
145
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
146
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
147
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
148
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
149
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
150
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
151
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
152
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
153
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
154
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
155
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
156
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
157
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
158
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
159
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
160
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
161
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
162
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
163
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
164
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
165
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
166
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
167
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
168
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
169
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
170
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
171
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
172
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
173
+ | **Total** | | **1,170,060,424** |
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/59d594003bf59880a884c574bf88ef7555bb0202 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 256,
3
+ "do_lower_case": false
4
+ }
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/72b987fd805cfa2b58c4c8c952b274a11bfd5a00 ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 6,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.8.2",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 30522
24
+ }
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/952a9b81c0bfd99800fabf352f69c7ccd46c5e43 ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]