Anmol4521 commited on
Commit
388aa42
·
verified ·
1 Parent(s): 359cedf

Upload 95 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +31 -0
  2. .env.example +8 -0
  3. .gitattributes +6 -0
  4. .gitignore +29 -0
  5. ARCHITECTURE.txt +330 -0
  6. Dockerfile +28 -0
  7. PROJECT_STRUCTURE.txt +387 -0
  8. README.md +238 -11
  9. agent_io/__init__.py +3 -0
  10. agent_io/benefit_io.py +117 -0
  11. agent_io/exam_io.py +115 -0
  12. agent_io/profiling_io.py +111 -0
  13. agent_io/scheme_io.py +116 -0
  14. agents/__init__.py +3 -0
  15. agents/benefit_agent.py +213 -0
  16. agents/document_agent.py +165 -0
  17. agents/exam_agent.py +138 -0
  18. agents/profiling_agent.py +149 -0
  19. agents/rag_agent.py +91 -0
  20. agents/scheme_agent.py +142 -0
  21. agents/search_agent.py +71 -0
  22. app.py +599 -0
  23. config.py +8 -0
  24. data/exams_pdfs/README.txt +13 -0
  25. data/exams_pdfs/exam.pdf +3 -0
  26. data/schemes_pdfs/Government Welfare Schemes & Policies - Disha Experts.pdf +3 -0
  27. data/schemes_pdfs/Government of India Welfare Schemes & Policies For Competitive Exams.pdf +3 -0
  28. data/schemes_pdfs/README.txt +12 -0
  29. data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf +3 -0
  30. graph/__init__.py +3 -0
  31. graph/workflow.py +319 -0
  32. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock +0 -0
  33. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/58d4a9a45664eb9e12de9549c548c09b6134c17f.lock +0 -0
  34. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock +0 -0
  35. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock +0 -0
  36. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock +0 -0
  37. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock +0 -0
  38. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock +0 -0
  39. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock +0 -0
  40. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock +0 -0
  41. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock +0 -0
  42. hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock +0 -0
  43. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json +0 -0
  44. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json +0 -0
  45. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja +0 -0
  46. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db +3 -0
  47. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f +173 -0
  48. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/59d594003bf59880a884c574bf88ef7555bb0202 +4 -0
  49. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/72b987fd805cfa2b58c4c8c952b274a11bfd5a00 +24 -0
  50. hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/952a9b81c0bfd99800fabf352f69c7ccd46c5e43 +20 -0
.dockerignore ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment
2
+ .env
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ # Python cache
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+
13
+ # Git
14
+ .git/
15
+ .gitignore
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+
21
+ # Documentation
22
+ *.md
23
+ ARCHITECTURE.txt
24
+ PROJECT_STRUCTURE.txt
25
+
26
+ # Outputs (will be generated)
27
+ outputs/*.json
28
+
29
+ # RAG indexes (build during deployment)
30
+ rag/scheme_index/
31
+ rag/exam_index/
.env.example ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ GROQ_API_KEY="your_groq_api_key_here"
2
+ TAVILY_API_KEY="your_tavily_api_key_here"
3
+ HF_TOKEN="your_huggingface_token_here"
4
+
5
+ # Skip vectorstores on memory-constrained platforms
6
+ # Set to "true" to use only web search (saves ~300MB RAM)
7
+ # Set to "false" to use FAISS vectorstores (for Hugging Face Spaces)
8
+ SKIP_VECTORSTORES="false"
.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/exams_pdfs/exam.pdf filter=lfs diff=lfs merge=lfs -text
37
+ data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf filter=lfs diff=lfs merge=lfs -text
38
+ data/schemes_pdfs/Government[[:space:]]of[[:space:]]India[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]For[[:space:]]Competitive[[:space:]]Exams.pdf filter=lfs diff=lfs merge=lfs -text
39
+ data/schemes_pdfs/Government[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]-[[:space:]]Disha[[:space:]]Experts.pdf filter=lfs diff=lfs merge=lfs -text
40
+ hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db filter=lfs diff=lfs merge=lfs -text
41
+ rag/scheme_index/index.faiss filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment
2
+ .env
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ # Python
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+ *.so
12
+
13
+ # HuggingFace Cache (downloaded models)
14
+ hf_cache/
15
+
16
+ # RAG Indexes (now included for production)
17
+ # rag/scheme_index/
18
+ # rag/exam_index/
19
+
20
+ # Outputs
21
+ outputs/*.json
22
+
23
+ # IDE
24
+ .vscode/
25
+ .idea/
26
+
27
+ # Data files (optional - uncomment if PDFs are large)
28
+ # data/schemes_pdfs/*.pdf
29
+ # data/exams_pdfs/*.pdf
ARCHITECTURE.txt ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JanSahayak Architecture Overview
3
+ ================================
4
+
5
+ SYSTEM COMPONENTS
6
+ -----------------
7
+
8
+ 1. AGENTS (agents/)
9
+ - profiling_agent.py → User Profile Extraction
10
+ - scheme_agent.py → Government Scheme Recommendations
11
+ - exam_agent.py → Competitive Exam Recommendations
12
+ - search_agent.py → Live Web Search (Tavily)
13
+ - rag_agent.py → Vector Database Retrieval
14
+ - document_agent.py → PDF/Image Text Extraction
15
+ - benefit_agent.py → Missed Benefits Calculator
16
+
17
+ 2. PROMPTS (prompts/)
18
+ - profiling_prompt.py → User profiling instructions
19
+ - scheme_prompt.py → Scheme recommendation template
20
+ - exam_prompt.py → Exam recommendation template
21
+ - rag_prompt.py → RAG retrieval instructions
22
+
23
+ 3. RAG SYSTEM (rag/)
24
+ - embeddings.py → HuggingFace embeddings (CPU)
25
+ - scheme_vectorstore.py → FAISS store for schemes
26
+ - exam_vectorstore.py → FAISS store for exams
27
+
28
+ 4. TOOLS (tools/)
29
+ - tavily_tool.py → Live government website search
30
+
31
+ 5. WORKFLOW (graph/)
32
+ - workflow.py → LangGraph orchestration
33
+
34
+ 6. I/O HANDLERS (agent_io/)
35
+ - profiling_io.py → Profiling agent I/O
36
+ - scheme_io.py → Scheme agent I/O
37
+ - exam_io.py → Exam agent I/O
38
+ - benefit_io.py → Benefit agent I/O
39
+
40
+ 7. DATA (data/)
41
+ - schemes_pdfs/ → Government scheme PDFs
42
+ - exams_pdfs/ → Competitive exam PDFs
43
+
44
+ 8. OUTPUTS (outputs/)
45
+ - results_*.json → Generated analysis results
46
+
47
+ 9. CONFIGURATION
48
+ - config.py → Configuration loader
49
+ - .env → API keys (user creates)
50
+ - requirements.txt → Python dependencies
51
+
52
+ 10. ENTRY POINTS
53
+ - main.py → Main application
54
+ - setup.py → Setup wizard
55
+
56
+
57
+ WORKFLOW EXECUTION
58
+ ------------------
59
+
60
+ User Input
61
+
62
+ [Profiling Agent]
63
+
64
+ ├─→ [Scheme Agent] ──→ [Benefit Agent] ──┐
65
+ │ ↓ │
66
+ │ [RAG Search] │
67
+ │ ↓ │
68
+ │ [Tavily Search] │
69
+ │ │
70
+ └─→ [Exam Agent] ────────────────────────┤
71
+ ↓ │
72
+ [RAG Search] │
73
+ ↓ │
74
+ [Tavily Search] │
75
+
76
+ [Final Output]
77
+
78
+ [JSON Results File]
79
+
80
+
81
+ TECHNOLOGY STACK
82
+ ----------------
83
+
84
+ LLM & AI:
85
+ - Groq API (llama-3.3-70b-versatile) → Fast inference
86
+ - LangChain → Agent framework
87
+ - LangGraph → Workflow orchestration
88
+
89
+ Embeddings & Search:
90
+ - HuggingFace Transformers → sentence-transformers/all-MiniLM-L6-v2
91
+ - FAISS (CPU) → Vector similarity search
92
+
93
+ Web Search:
94
+ - Tavily API → Government website search
95
+
96
+ Document Processing:
97
+ - PyPDF → PDF text extraction
98
+ - Pytesseract → OCR for images
99
+ - Pillow → Image processing
100
+
101
+ Infrastructure:
102
+ - Python 3.8+
103
+ - CPU-only deployment (no GPU needed)
104
+ - PyTorch CPU version
105
+
106
+
107
+ DATA FLOW
108
+ ---------
109
+
110
+ 1. User Input Processing:
111
+ Raw Text → Profiling Agent → Structured JSON Profile
112
+
113
+ 2. Scheme Recommendation:
114
+ Profile → RAG Query → Vectorstore Search → Top-K Documents
115
+ Profile + Documents → Tavily Search (optional) → Web Results
116
+ Profile + Documents + Web Results → LLM → Recommendations
117
+
118
+ 3. Exam Recommendation:
119
+ Profile → RAG Query → Vectorstore Search → Top-K Documents
120
+ Profile + Documents → Tavily Search (optional) → Web Results
121
+ Profile + Documents + Web Results → LLM → Recommendations
122
+
123
+ 4. Benefit Calculation:
124
+ Profile + Scheme Recommendations → LLM → Missed Benefits Analysis
125
+
126
+ 5. Final Output:
127
+ All Results → JSON Compilation → File Save → User Display
128
+
129
+
130
+ API INTERACTIONS
131
+ ----------------
132
+
133
+ 1. Groq API:
134
+ - Used by: All LLM-powered agents
135
+ - Model: llama-3.3-70b-versatile
136
+ - Purpose: Natural language understanding & generation
137
+ - Rate: Per-request basis
138
+
139
+ 2. Tavily API:
140
+ - Used by: search_agent, scheme_agent, exam_agent
141
+ - Purpose: Live government website search
142
+ - Filter: .gov.in domains preferred
143
+ - Depth: Advanced search mode
144
+
145
+ 3. HuggingFace:
146
+ - Used by: embeddings module
147
+ - Model: sentence-transformers/all-MiniLM-L6-v2
148
+ - Purpose: Document embeddings for RAG
149
+ - Local: Runs on CPU, cached after first download
150
+
151
+
152
+ VECTORSTORE ARCHITECTURE
153
+ ------------------------
154
+
155
+ Scheme Vectorstore (rag/scheme_index/):
156
+ ├── index.faiss → FAISS index file
157
+ ├── index.pkl → Metadata pickle
158
+ └── [Embedded chunks from schemes_pdfs/]
159
+
160
+ Exam Vectorstore (rag/exam_index/):
161
+ ├── index.faiss → FAISS index file
162
+ ├── index.pkl → Metadata pickle
163
+ └── [Embedded chunks from exams_pdfs/]
164
+
165
+ Embedding Dimension: 384
166
+ Similarity Metric: Cosine similarity
167
+ Chunk Size: Auto (from PyPDF)
168
+
169
+
170
+ AGENT SPECIALIZATIONS
171
+ ---------------------
172
+
173
+ 1. Profiling Agent:
174
+ - Extraction-focused
175
+ - Low temperature (0.1)
176
+ - JSON output required
177
+ - No external tools
178
+
179
+ 2. Scheme Agent:
180
+ - RAG + Web search
181
+ - Temperature: 0.3
182
+ - Tools: Vectorstore, Tavily
183
+ - Output: Detailed scheme info
184
+
185
+ 3. Exam Agent:
186
+ - RAG + Web search
187
+ - Temperature: 0.3
188
+ - Tools: Vectorstore, Tavily
189
+ - Output: Detailed exam info
190
+
191
+ 4. Benefit Agent:
192
+ - Calculation-focused
193
+ - Temperature: 0.2
194
+ - No external tools
195
+ - Output: Financial analysis
196
+
197
+ 5. Search Agent:
198
+ - Web search only
199
+ - Tool: Tavily API
200
+ - Focus: .gov.in domains
201
+ - Output: Live search results
202
+
203
+ 6. RAG Agent:
204
+ - Vectorstore query only
205
+ - Tool: FAISS
206
+ - Similarity search
207
+ - Output: Relevant documents
208
+
209
+ 7. Document Agent:
210
+ - File processing
211
+ - Tools: PyPDF, Pytesseract
212
+ - Supports: PDF, Images
213
+ - Output: Extracted text
214
+
215
+
216
+ SECURITY & PRIVACY
217
+ ------------------
218
+
219
+ - API keys stored in .env (not committed to git)
220
+ - User data processed locally except LLM calls
221
+ - No data stored on external servers (except API providers)
222
+ - PDF data remains local
223
+ - Vectorstores are local
224
+ - Output files saved locally
225
+
226
+
227
+ SCALABILITY NOTES
228
+ -----------------
229
+
230
+ Current Setup (Single User):
231
+ - Synchronous workflow
232
+ - Local vectorstores
233
+ - CPU processing
234
+
235
+ Potential Scaling:
236
+ - Add Redis for caching
237
+ - Use cloud vectorstore (Pinecone, Weaviate)
238
+ - Parallel agent execution
239
+ - GPU acceleration for embeddings
240
+ - Database for user profiles
241
+ - API service deployment
242
+
243
+
244
+ ERROR HANDLING
245
+ --------------
246
+
247
+ Each agent includes:
248
+ - Try-catch blocks
249
+ - Error state tracking
250
+ - Graceful degradation
251
+ - Partial results on failure
252
+ - Error reporting in final output
253
+
254
+
255
+ MONITORING & LOGGING
256
+ --------------------
257
+
258
+ Current:
259
+ - Console print statements
260
+ - Agent start/completion messages
261
+ - Error messages
262
+ - Final output summary
263
+
264
+ Future Enhancement:
265
+ - Structured logging (logging module)
266
+ - Performance metrics
267
+ - API usage tracking
268
+ - User feedback collection
269
+
270
+
271
+ EXTENSIBILITY
272
+ -------------
273
+
274
+ Adding New Agent:
275
+ 1. Create agent file in agents/
276
+ 2. Add prompt template in prompts/
277
+ 3. Create node function in workflow.py
278
+ 4. Add node to graph
279
+ 5. Define edges (connections)
280
+ 6. Optional: Create I/O handler
281
+
282
+ Adding New Data Source:
283
+ 1. Create vectorstore module in rag/
284
+ 2. Add PDFs to data/ subdirectory
285
+ 3. Build vectorstore
286
+ 4. Create agent or modify existing
287
+
288
+ Adding New Tool:
289
+ 1. Create tool in tools/
290
+ 2. Import in agent
291
+ 3. Use in agent logic
292
+
293
+
294
+ PERFORMANCE BENCHMARKS (Typical)
295
+ ---------------------------------
296
+
297
+ Vectorstore Building:
298
+ - 10 PDFs: ~2-5 minutes
299
+ - 100 PDFs: ~20-30 minutes
300
+
301
+ Query Performance:
302
+ - Profiling: ~1-2 seconds
303
+ - RAG Search: ~0.5-1 second
304
+ - LLM Call: ~1-3 seconds
305
+ - Web Search: ~2-4 seconds
306
+ - Full Workflow: ~10-20 seconds
307
+
308
+ Memory Usage:
309
+ - Base: ~500 MB
310
+ - With models: ~2-3 GB
311
+ - With large PDFs: +500 MB per 100 PDFs
312
+
313
+
314
+ FUTURE ENHANCEMENTS
315
+ -------------------
316
+
317
+ 1. Multilingual Support (Hindi, regional languages)
318
+ 2. Voice input/output
319
+ 3. Mobile app integration
320
+ 4. Database for user history
321
+ 5. Notification system for deadlines
322
+ 6. Document upload interface
323
+ 7. Real-time scheme updates
324
+ 8. Community feedback integration
325
+ 9. State-specific customization
326
+ 10. Integration with government portals
327
+
328
+
329
+ END OF ARCHITECTURE DOCUMENT
330
+ """
Dockerfile ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HuggingFace Spaces Dockerfile
2
+ FROM python:3.12-slim
3
+
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ build-essential \
9
+ curl \
10
+ && rm -rf /var/lib/apt/lists/*
11
+
12
+ # Copy requirements first for better caching
13
+ COPY requirements.txt .
14
+
15
+ # Install Python dependencies
16
+ RUN pip install --no-cache-dir -r requirements.txt
17
+
18
+ # Copy application files
19
+ COPY . .
20
+
21
+ # Expose port 7860 (HuggingFace Spaces default)
22
+ EXPOSE 7860
23
+
24
+ # Set environment variable for port
25
+ ENV PORT=7860
26
+
27
+ # Run the application
28
+ CMD ["python", "app.py"]
PROJECT_STRUCTURE.txt ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ JanSahayak - Multi-Agent Government Intelligence System
2
+ ========================================================
3
+
4
+ 📦 JanSahayak/
5
+
6
+ ├── 📄 main.py # Main entry point
7
+ ├── 📄 setup.py # Setup wizard & utilities
8
+ ├── 📄 config.py # Configuration loader
9
+ ├── 📄 requirements.txt # Python dependencies
10
+
11
+ ├── 📄 README.md # Project overview
12
+ ├── 📄 USAGE_GUIDE.md # Comprehensive usage guide
13
+ ├── 📄 ARCHITECTURE.txt # System architecture
14
+
15
+ ├── 📄 .env.example # Example environment file
16
+ ├── 📄 .gitignore # Git ignore rules
17
+
18
+ ├── 📁 agents/ # Agent modules
19
+ │ ├── __init__.py
20
+ │ ├── profiling_agent.py # 🧾 User profiling
21
+ │ ├── scheme_agent.py # 🏛️ Scheme recommendations
22
+ │ ├── exam_agent.py # 🎓 Exam recommendations
23
+ │ ├── search_agent.py # 🔎 Web search (Tavily)
24
+ │ ├── rag_agent.py # 📚 RAG retrieval
25
+ │ ├── document_agent.py # 📂 Document processing
26
+ │ └── benefit_agent.py # 💰 Benefit calculator
27
+
28
+ ├── 📁 prompts/ # Prompt templates
29
+ │ ├── __init__.py
30
+ │ ├── profiling_prompt.py # Profiling instructions
31
+ │ ├── scheme_prompt.py # Scheme recommendation template
32
+ │ ├── exam_prompt.py # Exam recommendation template
33
+ │ └── rag_prompt.py # RAG retrieval template
34
+
35
+ ├── 📁 rag/ # RAG system
36
+ │ ├── __init__.py
37
+ │ ├── embeddings.py # HuggingFace embeddings
38
+ │ ├── scheme_vectorstore.py # Scheme FAISS store
39
+ │ ├── exam_vectorstore.py # Exam FAISS store
40
+ │ ├── scheme_index/ # Generated vectorstore
41
+ │ │ ├── index.faiss
42
+ │ │ └── index.pkl
43
+ │ └── exam_index/ # Generated vectorstore
44
+ │ ├── index.faiss
45
+ │ └── index.pkl
46
+
47
+ ├── 📁 tools/ # External tools
48
+ │ ├── __init__.py
49
+ │ └── tavily_tool.py # Tavily search integration
50
+
51
+ ├── 📁 graph/ # Workflow orchestration
52
+ │ ├── __init__.py
53
+ │ └── workflow.py # LangGraph workflow
54
+
55
+ ├── 📁 agent_io/ # Agent I/O handlers
56
+ │ ├── __init__.py
57
+ │ ├── profiling_io.py # Profiling I/O
58
+ │ ├── scheme_io.py # Scheme I/O
59
+ │ ├── exam_io.py # Exam I/O
60
+ │ └── benefit_io.py # Benefit I/O
61
+
62
+ ├── 📁 data/ # PDF data
63
+ │ ├── schemes_pdfs/ # Government scheme PDFs
64
+ │ │ └── README.txt
65
+ │ └── exams_pdfs/ # Competitive exam PDFs
66
+ │ └── README.txt
67
+
68
+ └── 📁 outputs/ # Generated results
69
+ ├── README.txt
70
+ └── results_*.json # Analysis results
71
+
72
+
73
+ KEY FILES DESCRIPTION
74
+ =====================
75
+
76
+ 📄 main.py
77
+ ----------
78
+ Main application entry point with:
79
+ - Interactive mode for user input
80
+ - File mode for batch processing
81
+ - Result saving and formatting
82
+ - Summary display
83
+
84
+ 📄 setup.py
85
+ -----------
86
+ Setup wizard that:
87
+ - Checks dependencies
88
+ - Verifies API keys
89
+ - Validates PDF data
90
+ - Builds vectorstores
91
+
92
+ 📄 config.py
93
+ ------------
94
+ Loads configuration from .env:
95
+ - GROQ_API_KEY
96
+ - TAVILY_API_KEY
97
+ - HF_TOKEN
98
+
99
+ 📁 agents/
100
+ ----------
101
+ 7 specialized agents:
102
+ 1. profiling_agent.py → Extract user profile
103
+ 2. scheme_agent.py → Recommend schemes
104
+ 3. exam_agent.py → Recommend exams
105
+ 4. search_agent.py → Live web search
106
+ 5. rag_agent.py → Vector search
107
+ 6. document_agent.py → Process PDFs/images
108
+ 7. benefit_agent.py → Calculate missed benefits
109
+
110
+ 📁 prompts/
111
+ -----------
112
+ Prompt engineering templates for:
113
+ - User profiling instructions
114
+ - Scheme recommendation format
115
+ - Exam recommendation format
116
+ - RAG retrieval guidance
117
+
118
+ 📁 rag/
119
+ -------
120
+ RAG (Retrieval Augmented Generation) system:
121
+ - embeddings.py → HuggingFace embeddings
122
+ - scheme_vectorstore.py → Scheme database
123
+ - exam_vectorstore.py → Exam database
124
+ - *_index/ → Generated FAISS indexes
125
+
126
+ 📁 tools/
127
+ ---------
128
+ External tool integrations:
129
+ - tavily_tool.py → Tavily API for government website search
130
+
131
+ 📁 graph/
132
+ ---------
133
+ LangGraph workflow orchestration:
134
+ - workflow.py → Defines agent connections and execution flow
135
+
136
+ 📁 agent_io/
137
+ ------------
138
+ Input/Output handlers for each agent:
139
+ - Separate I/O files for tracking
140
+ - JSON-based data exchange
141
+ - Timestamp tracking
142
+
143
+ 📁 data/
144
+ --------
145
+ Training data for RAG:
146
+ - schemes_pdfs/ → Government scheme documents
147
+ - exams_pdfs/ → Competitive exam documents
148
+
149
+ 📁 outputs/
150
+ -----------
151
+ Generated analysis results:
152
+ - results_YYYYMMDD_HHMMSS.json
153
+ - Contains all agent outputs
154
+
155
+
156
+ WORKFLOW VISUALIZATION
157
+ ======================
158
+
159
+ User Input (Text)
160
+
161
+ ┌───────────────┐
162
+ │ Profiling │
163
+ │ Agent │
164
+ └───────┬───────┘
165
+
166
+ Structured Profile
167
+
168
+ ┌───────────────┼───────────────┐
169
+ ↓ ↓
170
+ ┌───────────────┐ ┌───────────────┐
171
+ │ Scheme │ │ Exam │
172
+ │ Agent │ │ Agent │
173
+ └───────┬───────┘ └───────┬───────┘
174
+ │ │
175
+ ├─→ RAG Search ├─→ RAG Search
176
+ ├─→ Web Search └─→ Web Search
177
+ ↓ │
178
+ ┌───────────────┐ │
179
+ │ Benefit │ │
180
+ │ Agent │ │
181
+ └───────┬───────┘ │
182
+ │ │
183
+ └───────────────┬───────────────┘
184
+
185
+ ┌───────────────┐
186
+ │ Final │
187
+ │ Output │
188
+ └───────────────┘
189
+
190
+ JSON File
191
+
192
+
193
+ TECHNOLOGY COMPONENTS
194
+ =====================
195
+
196
+ 🧠 Brain (LLM)
197
+ - Groq API (llama-3.3-70b-versatile)
198
+ - Fast inference (<2s per call)
199
+ - Powers all agents
200
+
201
+ 📚 Memory (RAG)
202
+ - HuggingFace embeddings (all-MiniLM-L6-v2)
203
+ - FAISS vectorstore (CPU)
204
+ - Semantic search
205
+
206
+ 🔍 Live Search
207
+ - Tavily API
208
+ - Government website focus
209
+ - Real-time information
210
+
211
+ 🔗 Orchestration
212
+ - LangChain (agent framework)
213
+ - LangGraph (workflow)
214
+ - State management
215
+
216
+ 📄 Document Processing
217
+ - PyPDF (PDF extraction)
218
+ - Pytesseract (OCR)
219
+ - Pillow (image handling)
220
+
221
+
222
+ QUICK START CHECKLIST
223
+ ======================
224
+
225
+ □ 1. Install dependencies
226
+ pip install -r requirements.txt
227
+
228
+ □ 2. Create .env file
229
+ Copy .env.example to .env
230
+ Add GROQ_API_KEY and TAVILY_API_KEY
231
+
232
+ □ 3. Add PDF data
233
+ Place PDFs in data/schemes_pdfs/
234
+ Place PDFs in data/exams_pdfs/
235
+
236
+ □ 4. Run setup
237
+ python setup.py
238
+
239
+ □ 5. Build vectorstores
240
+ Automatic during setup, or:
241
+ python setup.py --build-vectorstores
242
+
243
+ □ 6. Run the system
244
+ python main.py
245
+
246
+
247
+ USAGE EXAMPLES
248
+ ==============
249
+
250
+ Interactive Mode:
251
+ -----------------
252
+ $ python main.py
253
+
254
+ Enter your details:
255
+ I am 25 years old, male, from Maharashtra.
256
+ My family income is 3 lakh per year.
257
+ I belong to OBC category.
258
+ I completed Bachelor's in Engineering.
259
+ I am unemployed and looking for government jobs.
260
+ I am interested in technical and banking sectors.
261
+
262
+ [Press Enter twice to submit]
263
+
264
+
265
+ File Mode:
266
+ ----------
267
+ $ python main.py user_input.txt
268
+
269
+
270
+ Testing Individual Agents:
271
+ ---------------------------
272
+ # Test profiling
273
+ python -m agents.profiling_agent
274
+
275
+ # Test scheme agent
276
+ python -m agents.scheme_agent
277
+
278
+ # Test exam agent
279
+ python -m agents.exam_agent
280
+
281
+
282
+ Building Vectorstores:
283
+ -----------------------
284
+ python setup.py --build-vectorstores
285
+
286
+ Or in Python:
287
+ from rag.scheme_vectorstore import build_scheme_vectorstore
288
+ from rag.exam_vectorstore import build_exam_vectorstore
289
+
290
+ build_scheme_vectorstore()
291
+ build_exam_vectorstore()
292
+
293
+
294
+ OUTPUT FORMAT
295
+ =============
296
+
297
+ Generated file: outputs/results_20260302_143022.json
298
+
299
+ {
300
+ "user_profile": {
301
+ "age": 25,
302
+ "gender": "Male",
303
+ "state": "Maharashtra",
304
+ "income": "300000",
305
+ "caste": "OBC",
306
+ "education": "Bachelor's in Engineering",
307
+ "employment_status": "Unemployed",
308
+ "interests": "Technical, Banking"
309
+ },
310
+ "scheme_recommendations": "...",
311
+ "exam_recommendations": "...",
312
+ "missed_benefits_analysis": "...",
313
+ "errors": []
314
+ }
315
+
316
+
317
+ SYSTEM REQUIREMENTS
318
+ ===================
319
+
320
+ ✅ Python 3.8 or higher
321
+ ✅ 4GB RAM minimum (8GB recommended)
322
+ ✅ 2GB storage for dependencies
323
+ ✅ Internet connection (for APIs)
324
+ ✅ CPU only (no GPU needed)
325
+
326
+
327
+ API KEYS REQUIRED
328
+ =================
329
+
330
+ 🔑 GROQ_API_KEY
331
+ Get from: https://console.groq.com/
332
+ Purpose: LLM inference
333
+ Cost: Free tier available
334
+
335
+ 🔑 TAVILY_API_KEY
336
+ Get from: https://tavily.com/
337
+ Purpose: Web search
338
+ Cost: Free tier available
339
+
340
+ 🔑 HF_TOKEN (Optional)
341
+ Get from: https://huggingface.co/settings/tokens
342
+ Purpose: Model downloads
343
+ Cost: Free
344
+
345
+
346
+ SUPPORT & DOCUMENTATION
347
+ ========================
348
+
349
+ 📖 Full Usage Guide: USAGE_GUIDE.md
350
+ 🏗️ Architecture Details: ARCHITECTURE.txt
351
+ ❓ Quick Start: README.md
352
+ 🐛 Troubleshooting: See USAGE_GUIDE.md
353
+
354
+ For issues:
355
+ 1. Check setup: python setup.py --check
356
+ 2. Verify .env file has correct API keys
357
+ 3. Ensure PDFs are in data/ directories
358
+ 4. Rebuild vectorstores if needed
359
+
360
+
361
+ PROJECT STATUS
362
+ ==============
363
+
364
+ ✅ Core System: Complete
365
+ ✅ All 7 Agents: Implemented
366
+ ✅ RAG System: Functional
367
+ ✅ Web Search: Integrated
368
+ ✅ Workflow: Orchestrated
369
+ ✅ I/O Handlers: Created
370
+ ✅ Documentation: Comprehensive
371
+
372
+ Ready for deployment and testing!
373
+
374
+
375
+ NEXT STEPS
376
+ ==========
377
+
378
+ 1. Add your API keys to .env
379
+ 2. Add government scheme and exam PDFs
380
+ 3. Run setup wizard
381
+ 4. Test the system
382
+ 5. Customize prompts as needed
383
+ 6. Add more PDF data over time
384
+ 7. Monitor and improve
385
+
386
+
387
+ Happy Analyzing! 🎉
README.md CHANGED
@@ -1,11 +1,238 @@
1
- ---
2
- title: Jansahayak
3
- emoji: 🐨
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: JanSahayak
3
+ emoji: 🙏
4
+ colorFrom: blue
5
+ colorTo: green
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # 🙏 JanSahayak - AI-Powered Government Schemes & Exams Assistant
11
+
12
+ > Your personal AI assistant for discovering government schemes and competitive exam opportunities in India
13
+
14
+ [![Hugging Face Spaces](https://img.shields.io/badge/🤗-Hugging%20Face-yellow)](https://huggingface.co/spaces)
15
+ [![Flask](https://img.shields.io/badge/Flask-2.3+-green)](https://flask.palletsprojects.com/)
16
+ [![LangChain](https://img.shields.io/badge/LangChain-Latest-blue)](https://www.langchain.com/)
17
+
18
+ ---
19
+
20
+ ## 🌟 Features
21
+
22
+ ### 🤖 Multi-Agent AI System
23
+ - **Profiling Agent**: Extracts structured user information
24
+ - **Scheme Agent**: Recommends relevant government schemes
25
+ - **Exam Agent**: Suggests competitive exams based on qualifications
26
+ - **RAG Agent**: Retrieves information from curated document database
27
+
28
+ ### 💡 Intelligent Capabilities
29
+ - ✅ Natural language understanding of user profiles
30
+ - ✅ Smart recommendations based on eligibility criteria
31
+ - ✅ RAG (Retrieval-Augmented Generation) with FAISS vectorstore
32
+ - ✅ Real-time web search via Tavily API
33
+ - ✅ PDF generation for saving recommendations
34
+ - ✅ Beautiful web interface with modern UI
35
+
36
+ ---
37
+
38
+ ## 🚀 Deploy to Hugging Face Spaces (Recommended)
39
+
40
+ ### Why Hugging Face Spaces?
41
+ - ✅ **16GB RAM for FREE** (perfect for RAG apps!)
42
+ - ✅ Built for ML/AI applications
43
+ - ✅ Git-based deployment
44
+ - ✅ Public URL instantly
45
+ - ✅ Persistent storage
46
+
47
+ ### Quick Deploy Steps:
48
+
49
+ **Method 1: Using HF CLI (Easiest)**
50
+
51
+ ```bash
52
+ # Install HF CLI
53
+ pip install huggingface_hub[cli]
54
+
55
+ # Login
56
+ huggingface-cli login
57
+
58
+ # Create Space and push
59
+ huggingface-cli repo create jansahayak --type space --space_sdk gradio
60
+ git remote add hf https://huggingface.co/spaces/YOUR_USERNAME/jansahayak
61
+ git push hf main
62
+ ```
63
+
64
+ **Method 2: Manual Setup**
65
+
66
+ 1. **Create Space** on [huggingface.co/spaces](https://huggingface.co/spaces)
67
+ - Click "Create new Space"
68
+ - Name: `jansahayak`
69
+ - SDK: Select "Gradio" (works with Flask)
70
+ - Hardware: CPU basic (Free - 16GB RAM!)
71
+ - License: MIT
72
+
73
+ 2. **Clone YOUR Space repo** (not GitHub!)
74
+ ```bash
75
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/jansahayak
76
+ cd jansahayak
77
+ ```
78
+
79
+ 3. **Copy your project files**
80
+ ```bash
81
+ # Copy all files from your JanSahayak folder to the cloned space folder
82
+ cp -r /path/to/JanSahayak/* .
83
+ ```
84
+
85
+ 4. **Add Environment Variables** (Space Settings → Variables and secrets)
86
+ ```
87
+ GROQ_API_KEY=your_groq_key
88
+ TAVILY_API_KEY=your_tavily_key
89
+ HF_TOKEN=your_hf_token (optional)
90
+ SKIP_VECTORSTORES=false
91
+ ```
92
+
93
+ 5. **Push to Space**
94
+ ```bash
95
+ git add .
96
+ git commit -m "Initial commit"
97
+ git push
98
+ ```
99
+
100
+ Your app will be live at: `https://huggingface.co/spaces/YOUR_USERNAME/jansahayak`
101
+
102
+ ### Important Notes:
103
+ - HF Spaces uses its own Git repo (not GitHub directly)
104
+ - App runs on port 7860 by default (Flask uses 5000, update if needed)
105
+ - First deployment may take 5-10 minutes to install dependencies
106
+ - Check Space logs if deployment fails
107
+
108
+ ---
109
+
110
+ ## 🛠️ Local Development
111
+
112
+ ```bash
113
+ # Clone and setup
114
+ git clone https://github.com/YOUR_USERNAME/JanSahayak.git
115
+ cd JanSahayak
116
+
117
+ # Create virtual environment
118
+ python -m venv .venv
119
+ source .venv/bin/activate # Linux/Mac
120
+ .venv\Scripts\activate # Windows
121
+
122
+ # Install dependencies
123
+ pip install -r requirements.txt
124
+
125
+ # Configure API keys
126
+ cp .env.example .env
127
+ # Edit .env with your keys
128
+
129
+ # Build vectorstores (optional - if you have PDFs)
130
+ python init_embeddings.py
131
+
132
+ # Run app
133
+ python app.py
134
+ # or use launcher scripts: start_web.bat (Windows) / ./start_web.sh (Linux/Mac)
135
+ ```
136
+
137
+ Visit `http://localhost:5000`
138
+
139
+ ---
140
+
141
+ ## 🔑 Get API Keys
142
+
143
+ | Service | URL | Free Tier | Used For |
144
+ |---------|-----|-----------|----------|
145
+ | **Groq** | [console.groq.com](https://console.groq.com) | ✅ Yes | LLM Inference |
146
+ | **Tavily** | [tavily.com](https://tavily.com) | 1000 searches/mo | Web Search |
147
+ | **HuggingFace** | [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) | ✅ Yes | Model Downloads |
148
+
149
+ ---
150
+
151
+ ## 💾 Adding Custom Documents
152
+
153
+ ### Government Schemes PDFs
154
+ 1. Place PDFs in `data/schemes_pdfs/`
155
+ 2. Run `python init_embeddings.py`
156
+ 3. Restart app
157
+
158
+ ### Exam Information PDFs
159
+ 1. Place PDFs in `data/exams_pdfs/`
160
+ 2. Run `python init_embeddings.py`
161
+ 3. Restart app
162
+
163
+ Automatically indexed and searchable via RAG!
164
+
165
+ ---
166
+
167
+ ## 🧪 Technology Stack
168
+
169
+ - **Backend**: Flask
170
+ - **AI**: LangChain + LangGraph
171
+ - **LLM**: Groq (Llama 3.3 70B)
172
+ - **Embeddings**: sentence-transformers/all-MiniLM-L6-v2
173
+ - **Vector DB**: FAISS (local)
174
+ - **Search**: Tavily API
175
+ - **Frontend**: HTML5 + CSS3 + JavaScript
176
+
177
+ ---
178
+
179
+ ## 📁 Project Structure
180
+
181
+ ```
182
+ JanSahayak/
183
+ ├── app.py # Flask web app
184
+ ├── main.py # CLI interface
185
+ ├── agents/ # AI agents
186
+ │ ├── profiling_agent.py
187
+ │ ├── scheme_agent.py
188
+ │ ├── exam_agent.py
189
+ │ └── rag_agent.py
190
+ ├── rag/ # RAG components
191
+ │ ├── embeddings.py
192
+ │ ├── scheme_vectorstore.py
193
+ │ └── exam_vectorstore.py
194
+ ├── data/ # Documents
195
+ │ ├── schemes_pdfs/
196
+ │ └── exams_pdfs/
197
+ ├── templates/ # HTML templates
198
+ └── static/ # CSS/JS
199
+ ```
200
+
201
+ ---
202
+
203
+ ## 🐛 Troubleshooting
204
+
205
+ **Memory issues on local machine?**
206
+ ```env
207
+ # Set in .env
208
+ SKIP_VECTORSTORES=true
209
+ ```
210
+ Uses web search only (no embeddings needed)
211
+
212
+ **Vectorstore errors?**
213
+ ```bash
214
+ rm -rf rag/scheme_index rag/exam_index
215
+ python init_embeddings.py
216
+ ```
217
+
218
+ ---
219
+
220
+ ## 🤝 Contributing
221
+
222
+ Contributions welcome! Fork → Create branch → Submit PR
223
+
224
+ ---
225
+
226
+ ## 📜 License
227
+
228
+ MIT License
229
+
230
+ ---
231
+
232
+ ## 🙏 Acknowledgments
233
+
234
+ Built with [LangChain](https://www.langchain.com/), [Groq](https://groq.com/), [Tavily](https://tavily.com/), and ❤️
235
+
236
+ ---
237
+
238
+ Made for the people of India 🇮🇳
agent_io/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Agent I/O Module Init
3
+ """
agent_io/benefit_io.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Benefit Agent I/O Handler
3
+ Manages input/output for missed benefits calculator agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class BenefitIO:
12
+ """Handles input/output operations for benefit calculator agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/benefit_input.json",
15
+ output_file: str = "agent_io/benefit_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read benefit calculator input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, scheme_recommendations: str, years: int = 5):
41
+ """
42
+ Write input for benefit calculator
43
+
44
+ Args:
45
+ profile_data: User profile dictionary
46
+ scheme_recommendations: Eligible schemes text
47
+ years: Number of years to calculate (default: 5)
48
+ """
49
+ input_data = {
50
+ "timestamp": datetime.now().isoformat(),
51
+ "profile": profile_data,
52
+ "scheme_recommendations": scheme_recommendations,
53
+ "calculation_years": years,
54
+ "agent": "benefit_calculator"
55
+ }
56
+
57
+ with open(self.input_file, 'w', encoding='utf-8') as f:
58
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
59
+
60
+ def write_output(self, calculation: dict, metadata: dict = None):
61
+ """
62
+ Write benefit calculation to output file
63
+
64
+ Args:
65
+ calculation: Missed benefits calculation
66
+ metadata: Optional metadata about calculation
67
+ """
68
+ output_data = {
69
+ "timestamp": datetime.now().isoformat(),
70
+ "calculation": calculation,
71
+ "metadata": metadata or {},
72
+ "agent": "benefit_calculator"
73
+ }
74
+
75
+ with open(self.output_file, 'w', encoding='utf-8') as f:
76
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
77
+
78
+ def read_output(self) -> dict:
79
+ """
80
+ Read previous benefit calculations
81
+
82
+ Returns:
83
+ Previous calculations dictionary
84
+ """
85
+ try:
86
+ if os.path.exists(self.output_file):
87
+ with open(self.output_file, 'r', encoding='utf-8') as f:
88
+ return json.load(f)
89
+ else:
90
+ return {"error": "Output file not found"}
91
+ except Exception as e:
92
+ return {"error": str(e)}
93
+
94
+
95
+ if __name__ == "__main__":
96
+ # Test BenefitIO
97
+ io = BenefitIO()
98
+
99
+ # Sample input
100
+ profile = {
101
+ "age": 25,
102
+ "income": "300000"
103
+ }
104
+
105
+ schemes = "PM Kisan: ₹6000/year"
106
+
107
+ io.write_input(profile, schemes, years=5)
108
+ print("Input written successfully")
109
+
110
+ # Sample output
111
+ calculation = {
112
+ "total_missed": "₹30,000",
113
+ "breakdown": {"2022": "₹6000", "2023": "₹6000"}
114
+ }
115
+
116
+ io.write_output(calculation)
117
+ print("Output written successfully")
agent_io/exam_io.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Exam Agent I/O Handler
3
+ Manages input/output for exam recommendation agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class ExamIO:
12
+ """Handles input/output operations for exam agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/exam_input.json",
15
+ output_file: str = "agent_io/exam_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read exam agent input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, preferences: dict = None):
41
+ """
42
+ Write input for exam agent
43
+
44
+ Args:
45
+ profile_data: Student profile dictionary
46
+ preferences: Optional student preferences
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "profile": profile_data,
51
+ "preferences": preferences or {},
52
+ "agent": "exam_recommendation"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, recommendations: dict, metadata: dict = None):
59
+ """
60
+ Write exam recommendations to output file
61
+
62
+ Args:
63
+ recommendations: Exam recommendations from agent
64
+ metadata: Optional metadata about the recommendation process
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "recommendations": recommendations,
69
+ "metadata": metadata or {},
70
+ "agent": "exam_recommendation"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read previous exam recommendations
79
+
80
+ Returns:
81
+ Previous recommendations dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test ExamIO
95
+ io = ExamIO()
96
+
97
+ # Sample input
98
+ profile = {
99
+ "age": 25,
100
+ "education": "Bachelor's in Engineering",
101
+ "interests": "Technical jobs"
102
+ }
103
+
104
+ io.write_input(profile, {"exam_type": "government"})
105
+ print("Input written successfully")
106
+
107
+ # Sample output
108
+ recommendations = {
109
+ "exams": [
110
+ {"name": "SSC CGL", "eligibility": "Graduate"}
111
+ ]
112
+ }
113
+
114
+ io.write_output(recommendations, {"sources": 5})
115
+ print("Output written successfully")
agent_io/profiling_io.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Profiling Agent I/O Handler
3
+ Manages input/output for user profiling agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class ProfilingIO:
12
+ """Handles input/output operations for profiling agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/profiling_input.json",
15
+ output_file: str = "agent_io/profiling_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read profiling agent input from file
27
+
28
+ Returns:
29
+ Raw user input dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, user_input: str, documents: list = None):
41
+ """
42
+ Write raw user input for profiling
43
+
44
+ Args:
45
+ user_input: Raw text input from user
46
+ documents: Optional list of uploaded documents
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "user_input": user_input,
51
+ "documents": documents or [],
52
+ "agent": "user_profiling"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, profile_data: dict, confidence: dict = None):
59
+ """
60
+ Write extracted profile to output file
61
+
62
+ Args:
63
+ profile_data: Structured profile data
64
+ confidence: Optional confidence scores for extracted fields
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "profile": profile_data,
69
+ "confidence": confidence or {},
70
+ "agent": "user_profiling"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read extracted profile
79
+
80
+ Returns:
81
+ Structured profile dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test ProfilingIO
95
+ io = ProfilingIO()
96
+
97
+ # Sample input
98
+ user_text = "I am 25 years old from Maharashtra, OBC category, income 3 lakh."
99
+ io.write_input(user_text, documents=["resume.pdf"])
100
+ print("Input written successfully")
101
+
102
+ # Sample output
103
+ profile = {
104
+ "age": 25,
105
+ "state": "Maharashtra",
106
+ "caste": "OBC",
107
+ "income": "300000"
108
+ }
109
+
110
+ io.write_output(profile, confidence={"age": 1.0, "state": 1.0})
111
+ print("Output written successfully")
agent_io/scheme_io.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scheme Agent I/O Handler
3
+ Manages input/output for scheme recommendation agent
4
+ """
5
+
6
+ import json
7
+ import os
8
+ from datetime import datetime
9
+
10
+
11
+ class SchemeIO:
12
+ """Handles input/output operations for scheme agent"""
13
+
14
+ def __init__(self, input_file: str = "agent_io/scheme_input.json",
15
+ output_file: str = "agent_io/scheme_output.json"):
16
+ self.input_file = input_file
17
+ self.output_file = output_file
18
+ self._ensure_directory()
19
+
20
+ def _ensure_directory(self):
21
+ """Create agent_io directory if it doesn't exist"""
22
+ os.makedirs(os.path.dirname(self.input_file), exist_ok=True)
23
+
24
+ def read_input(self) -> dict:
25
+ """
26
+ Read scheme agent input from file
27
+
28
+ Returns:
29
+ Input configuration dictionary
30
+ """
31
+ try:
32
+ if os.path.exists(self.input_file):
33
+ with open(self.input_file, 'r', encoding='utf-8') as f:
34
+ return json.load(f)
35
+ else:
36
+ return {"error": "Input file not found"}
37
+ except Exception as e:
38
+ return {"error": str(e)}
39
+
40
+ def write_input(self, profile_data: dict, preferences: dict = None):
41
+ """
42
+ Write input for scheme agent
43
+
44
+ Args:
45
+ profile_data: User profile dictionary
46
+ preferences: Optional user preferences
47
+ """
48
+ input_data = {
49
+ "timestamp": datetime.now().isoformat(),
50
+ "profile": profile_data,
51
+ "preferences": preferences or {},
52
+ "agent": "scheme_recommendation"
53
+ }
54
+
55
+ with open(self.input_file, 'w', encoding='utf-8') as f:
56
+ json.dump(input_data, f, indent=2, ensure_ascii=False)
57
+
58
+ def write_output(self, recommendations: dict, metadata: dict = None):
59
+ """
60
+ Write scheme recommendations to output file
61
+
62
+ Args:
63
+ recommendations: Scheme recommendations from agent
64
+ metadata: Optional metadata about the recommendation process
65
+ """
66
+ output_data = {
67
+ "timestamp": datetime.now().isoformat(),
68
+ "recommendations": recommendations,
69
+ "metadata": metadata or {},
70
+ "agent": "scheme_recommendation"
71
+ }
72
+
73
+ with open(self.output_file, 'w', encoding='utf-8') as f:
74
+ json.dump(output_data, f, indent=2, ensure_ascii=False)
75
+
76
+ def read_output(self) -> dict:
77
+ """
78
+ Read previous scheme recommendations
79
+
80
+ Returns:
81
+ Previous recommendations dictionary
82
+ """
83
+ try:
84
+ if os.path.exists(self.output_file):
85
+ with open(self.output_file, 'r', encoding='utf-8') as f:
86
+ return json.load(f)
87
+ else:
88
+ return {"error": "Output file not found"}
89
+ except Exception as e:
90
+ return {"error": str(e)}
91
+
92
+
93
+ if __name__ == "__main__":
94
+ # Test SchemeIO
95
+ io = SchemeIO()
96
+
97
+ # Sample input
98
+ profile = {
99
+ "age": 25,
100
+ "income": "300000",
101
+ "state": "Maharashtra",
102
+ "caste": "OBC"
103
+ }
104
+
105
+ io.write_input(profile, {"priority": "high_benefit"})
106
+ print("Input written successfully")
107
+
108
+ # Sample output
109
+ recommendations = {
110
+ "schemes": [
111
+ {"name": "PM Kisan", "benefit": "₹6000/year"}
112
+ ]
113
+ }
114
+
115
+ io.write_output(recommendations, {"sources": 5})
116
+ print("Output written successfully")
agents/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Agents Module Init
3
+ """
agents/benefit_agent.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Missed Benefits Calculator Agent
3
+ Estimates potential benefits user might have missed
4
+ """
5
+
6
+ import json
7
+ from langchain_groq import ChatGroq
8
+ from langchain_core.messages import HumanMessage, SystemMessage
9
+ from config import GROQ_API_KEY
10
+
11
+
12
def get_llm():
    """Build the Groq chat model used for benefit estimation.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Low temperature keeps financial estimates conservative and stable
        temperature=0.2,
    )
    return llm
22
+
23
+
24
def calculate_missed_benefits(profile_data: dict, scheme_recommendations: str) -> dict:
    """
    Calculates potential benefits the user might have missed in the past

    Builds a detailed analyst prompt from the profile and the scheme
    recommendations, sends it to the Groq LLM, and returns the model's
    free-text calculation.

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text

    Returns:
        Dictionary with missed benefits calculation; on failure, a dict
        with an "error" key and a placeholder "calculation" string
    """
    try:
        llm = get_llm()

        # Serialize the profile verbatim so the LLM sees every field
        profile_str = json.dumps(profile_data, indent=2)

        prompt = f"""
        You are a financial analyst specializing in Indian government welfare schemes.

        Based on the user's profile and recommended schemes, calculate how much money/benefits
        they might have missed in the past 5 years by not applying to eligible schemes.

        **USER PROFILE:**
        {profile_str}

        **RECOMMENDED SCHEMES:**
        {scheme_recommendations}

        **ANALYSIS REQUIREMENTS:**

        1. **Identify Eligible Schemes:**
           - List schemes user was eligible for in past 5 years
           - Consider age, income, education criteria over time

        2. **Calculate Monetary Benefits:**
           - One-time payments missed
           - Annual recurring benefits missed
           - Subsidies or discounts not availed
           - Total missed amount (conservative estimate)

        3. **Non-Monetary Benefits:**
           - Training opportunities missed
           - Healthcare benefits not utilized
           - Educational scholarships lost
           - Employment opportunities missed

        4. **Year-wise Breakdown:**
           - Provide year-wise missed benefit estimate
           - Account for scheme start dates
           - Consider eligibility changes over time

        5. **Actionable Insights:**
           - Can any benefits be claimed retroactively?
           - Which schemes should be applied immediately?
           - Priority ranking for current applications

        **OUTPUT FORMAT:**

        ### Total Missed Benefits (Past 5 Years)
        - **Monetary Loss:** ₹[Amount]
        - **Non-Monetary Loss:** [Description]

        ### Year-wise Breakdown
        **2022:**
        - Scheme Name: ₹[Amount] | [Benefit Description]

        **2023:**
        - Scheme Name: ₹[Amount] | [Benefit Description]

        [Continue for all years]

        ### Retroactive Claims Possible
        - List schemes that allow backdated applications
        - Required documentation for backdated claims

        ### Immediate Action Items
        1. [Highest priority scheme to apply now]
        2. [Second priority scheme]
        3. [Third priority scheme]

        ### Future Projections
        If user applies now, estimated benefits over next 5 years: ₹[Amount]

        ---

        **IMPORTANT NOTES:**
        - Provide conservative estimates (lower bound)
        - Mark assumptions clearly
        - Only include verified government schemes
        - Consider state-specific schemes based on user's state
        - Factor in income bracket changes over time

        Proceed with calculation:
        """

        messages = [
            SystemMessage(content="You are a financial analyst for government welfare schemes. Provide realistic, conservative estimates."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        # NOTE(review): "calculation" is unstructured model text, not a
        # parsed breakdown — downstream consumers must treat it as prose.
        return {
            "calculation": response.content,
            "profile_considered": profile_data.get('age', 'N/A'),
            "schemes_analyzed": "Available in recommendations"
        }

    except Exception as e:
        return {
            "error": str(e),
            "calculation": "Unable to calculate missed benefits"
        }
137
+
138
+
139
def estimate_future_benefits(profile_data: dict, scheme_recommendations: str, years: int = 5) -> dict:
    """
    Estimates potential benefits over the next N years if user applies now

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text
        years: Number of years to project (default: 5)

    Returns:
        Dictionary with future benefits projection; on failure, a dict
        with an "error" key and a placeholder "projection" string
    """
    try:
        llm = get_llm()

        # Serialize the profile verbatim so the LLM sees every field
        profile_str = json.dumps(profile_data, indent=2)

        prompt = f"""
        Based on the user's current profile and eligible schemes, estimate the total benefits
        they can receive over the next {years} years if they apply immediately.

        **USER PROFILE:**
        {profile_str}

        **ELIGIBLE SCHEMES:**
        {scheme_recommendations}

        Provide:
        1. Year-wise projected benefits
        2. Total estimated benefits over {years} years
        3. Required actions to maximize benefits
        4. Key deadlines to watch

        Return structured calculation with conservative estimates.
        """

        messages = [
            SystemMessage(content="You are a financial projection analyst for government schemes."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        # "projection" is unstructured model text describing the estimate
        return {
            "projection": response.content,
            "years_projected": years,
            "profile_age": profile_data.get('age', 'N/A')
        }

    except Exception as e:
        return {
            "error": str(e),
            "projection": "Unable to estimate future benefits"
        }
193
+
194
+
195
if __name__ == "__main__":
    # Smoke test: runs a real LLM call, so GROQ_API_KEY must be configured
    test_profile = {
        "age": 25,
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "education": "Bachelor's in Engineering",
        "employment_status": "Unemployed"
    }

    test_schemes = """
    1. PM Kisan Samman Nidhi: ₹6000 per year
    2. Post Matric Scholarship (OBC): ₹5000-10000 per year
    3. Skill Development Scheme: Free training worth ₹20000
    """

    result = calculate_missed_benefits(test_profile, test_schemes)
    print(json.dumps(result, indent=2))
agents/document_agent.py ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Document Processing Agent
3
+ Handles PDF and image text extraction
4
+ """
5
+
6
+ import os
7
+ import pytesseract
8
+ from PIL import Image
9
+ from pypdf import PdfReader
10
+
11
+
12
def process_pdf(file_path: str) -> dict:
    """Extract the text of every page in a PDF.

    Args:
        file_path: Path to PDF file

    Returns:
        Dict with file_path/pages/text/success on success; on a missing
        file, {"error": ..., "text": ""}; on any other failure, a dict
        with error/file_path/text/success=False.
    """
    if not os.path.exists(file_path):
        return {"error": f"File not found: {file_path}", "text": ""}

    try:
        reader = PdfReader(file_path)

        # Collect each page's text with a page-separator header
        chunks = []
        for page_num, page in enumerate(reader.pages):
            chunks.append(f"\n--- Page {page_num + 1} ---\n{page.extract_text()}")

        return {
            "file_path": file_path,
            "pages": len(reader.pages),
            "text": "".join(chunks),
            "success": True,
        }

    except Exception as e:
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
47
+
48
+
49
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """Run Tesseract OCR over an image and return the recognized text.

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        Dict with file_path/image_size/text/success on success; on a
        missing file, {"error": ..., "text": ""}; on any other failure,
        a dict with error/file_path/text/success=False.
    """
    if not os.path.exists(file_path):
        return {"error": f"File not found: {file_path}", "text": ""}

    try:
        image = Image.open(file_path)
        recognized = pytesseract.image_to_string(image, lang=language)

        return {
            "file_path": file_path,
            "image_size": image.size,
            "text": recognized,
            "success": True,
        }

    except Exception as e:
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False,
        }
81
+
82
+
83
def process_resume(file_path: str) -> dict:
    """Extract text from a resume (PDF or image) and tag simple features.

    Args:
        file_path: Path to resume file

    Returns:
        The extraction result dict; on success it additionally carries
        document_type, contains_email, and contains_phone flags.
    """
    extension = os.path.splitext(file_path)[1].lower()
    image_extensions = {'.jpg', '.jpeg', '.png', '.tiff', '.bmp'}

    # Dispatch to the right extractor based on the file extension
    if extension == '.pdf':
        result = process_pdf(file_path)
    elif extension in image_extensions:
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {extension}",
            "text": "",
            "success": False,
        }

    if result.get("success"):
        # Basic resume parsing (can be enhanced)
        extracted = result["text"]
        result["document_type"] = "resume"
        result["contains_email"] = "@" in extracted
        # NOTE(review): any digit triggers this flag, not just phone numbers
        result["contains_phone"] = any(ch.isdigit() for ch in extracted)

    return result
114
+
115
+
116
def batch_process_documents(folder_path: str, file_type: str = "pdf") -> list:
    """Process every matching document in a folder.

    Args:
        folder_path: Path to folder containing documents
        file_type: Type of files to process ("pdf" or "image")

    Returns:
        List of per-document result dicts; a single-element error list
        when the folder does not exist.
    """
    if not os.path.exists(folder_path):
        return [{"error": f"Folder not found: {folder_path}"}]

    # Unknown file_type values fall back to PDF handling
    valid_extensions = {
        "pdf": [".pdf"],
        "image": [".jpg", ".jpeg", ".png", ".tiff", ".bmp"],
    }.get(file_type, [".pdf"])

    handler = process_pdf if file_type == "pdf" else process_image

    results = []
    for filename in os.listdir(folder_path):
        if os.path.splitext(filename)[1].lower() in valid_extensions:
            results.append(handler(os.path.join(folder_path, filename)))

    return results
152
+
153
+
154
if __name__ == "__main__":
    # No runnable demo here: extraction needs real files, so this just
    # lists the public API of the module.
    # Note: You'll need to provide actual file paths to test

    # Example usage
    print("Document Processing Agent")
    print("=" * 50)
    print("Available functions:")
    print("1. process_pdf(file_path)")
    print("2. process_image(file_path)")
    print("3. process_resume(file_path)")
    print("4. batch_process_documents(folder_path, file_type)")
agents/exam_agent.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Exam Recommendation Agent
3
+ Provides competitive exam recommendations based on student profile
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from rag.exam_vectorstore import load_exam_vectorstore
11
+ from prompts.exam_prompt import EXAM_PROMPT
12
+ from tools.tavily_tool import government_focused_search
13
+ from config import GROQ_API_KEY
14
+
15
+
16
def get_llm():
    """Build the Groq chat model used for exam recommendations.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Slightly higher temperature for more varied recommendations
        temperature=0.3,
    )
    return llm
26
+
27
+
28
def run_exam_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends competitive exams based on student profile

    Combines two evidence sources — a FAISS vectorstore of exam PDFs
    (when supplied) and a live Tavily web search — and feeds both into
    the LLM via EXAM_PROMPT.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        Exam recommendations dictionary; on success "recommendations" is
        a text string, on exception it is an empty list.
    """
    try:
        # Use provided vectorstore or try to load it
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                Student Profile:
                Education: {profile_data.get('education', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Interests: {profile_data.get('interests', 'N/A')}
                Skills: {profile_data.get('skills', 'N/A')}
                Occupation: {profile_data.get('occupation', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} exam documents from vectorstore")
            except Exception as e:
                # Vectorstore failure is non-fatal — fall back to web search
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local exam database available. Using live web search."

        # Create profile string
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                education = profile_data.get('education', 'graduate')
                interests = profile_data.get('interests', 'government jobs')
                web_query = f"competitive exams India {education} {interests} eligibility 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                # Search failure is reported inside the prompt context itself
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve exam information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = EXAM_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert competitive exam advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        # NOTE(review): error path returns a list for "recommendations"
        # while the success path returns a string — callers must handle
        # both; consider unifying the types.
        return {
            "error": str(e),
            "recommendations": []
        }
125
+
126
+
127
if __name__ == "__main__":
    # Smoke test: requires GROQ_API_KEY; web search is disabled so only
    # the optional local vectorstore path would be exercised.
    test_profile = {
        "education": "Bachelor's in Engineering",
        "age": 25,
        "interests": "Technical jobs, government sector",
        "skills": "Programming, problem solving",
        "occupation": "Student"
    }

    result = run_exam_agent(test_profile, use_web_search=False)
    print(json.dumps(result, indent=2))
agents/profiling_agent.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ User Profiling Agent
3
+ Extracts structured user information for eligibility matching
4
+ """
5
+
6
+ import json
7
+ from langchain_groq import ChatGroq
8
+ from langchain_core.messages import HumanMessage, SystemMessage
9
+ from prompts.profiling_prompt import PROFILING_PROMPT
10
+ from config import GROQ_API_KEY
11
+
12
+
13
def get_llm():
    """Build the Groq chat model used for structured profile extraction.

    Raises:
        ValueError: when GROQ_API_KEY is not configured.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        # Low temperature keeps the JSON extraction deterministic
        temperature=0.1,
    )
    return llm
23
+
24
+
25
def extract_json_from_text(text: str) -> dict:
    """Best-effort extraction of a JSON object from raw LLM output.

    Tries, in order: a direct parse of the whole text, a fenced
    ```json code block, the widest {...} span, and finally any
    brace-balanced fragment. Returns the parsed object, or None when
    nothing parses.
    """
    import re

    def _try_parse(candidate: str):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    # 1) The whole text may already be valid JSON.
    parsed = _try_parse(text.strip())
    if parsed is not None:
        return parsed

    # 2) JSON wrapped in a markdown code fence.
    fenced = re.findall(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        parsed = _try_parse(fenced[0])
        if parsed is not None:
            return parsed

    # 3) Widest span from the first '{' to the last '}'.
    first = text.find('{')
    last = text.rfind('}')
    if first != -1 and last != -1 and last > first:
        parsed = _try_parse(text[first:last + 1])
        if parsed is not None:
            return parsed

    # 4) Any brace-balanced fragment (one nesting level deep).
    for fragment in re.findall(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', text, re.DOTALL):
        parsed = _try_parse(fragment)
        if parsed is not None:
            return parsed

    return None
66
+
67
+
68
def run_profiling_agent(user_input: str) -> dict:
    """Turn free-form user text into a structured profile dict.

    Sends the text through the profiling prompt, parses the JSON the
    LLM returns, and normalizes the keys to snake_case. Falls back to
    a raw-text profile when no JSON can be recovered; returns an
    error dict on any exception.
    """
    try:
        llm = get_llm()

        messages = [
            SystemMessage(content="You are an expert user profiling agent. Return ONLY a valid JSON object, nothing else."),
            HumanMessage(content=PROFILING_PROMPT.format(user_input=user_input)),
        ]
        response = llm.invoke(messages)

        print(f"\n🤖 LLM Response (first 200 chars): {response.content[:200]}...")

        parsed = extract_json_from_text(response.content)

        if parsed:
            # Normalize keys: lowercase, spaces/hyphens -> underscores
            normalized = {
                key.lower().replace(' ', '_').replace('-', '_'): value
                for key, value in parsed.items()
            }
            print(f"✅ Profile extracted: {list(normalized.keys())}")
            return normalized

        # Fallback: no parseable JSON — keep the raw model output
        print("⚠️ Could not parse JSON, creating basic profile")
        return {
            "user_input": user_input,
            "raw_profile": response.content,
            "note": "Profile extraction incomplete. Using raw input.",
        }

    except Exception as e:
        print(f"❌ Profiling error: {str(e)}")
        return {
            "error": str(e),
            "user_input": user_input,
        }
119
+
120
+
121
def validate_profile(profile_data: dict) -> bool:
    """
    Validates that profile has minimum required information

    A required field counts as missing when it is absent, None, an
    empty string, or the placeholder "Not Provided" (case-insensitive).

    Args:
        profile_data: Profile dictionary

    Returns:
        True if valid, False otherwise
    """
    required_fields = ['age', 'state', 'education']

    for field in required_fields:
        value = profile_data.get(field)
        # Absent / None / "" are all treated as missing
        if value is None or value == "":
            return False
        # Tolerate casing and whitespace variants of the placeholder
        if isinstance(value, str) and value.strip().lower() == "not provided":
            return False

    return True
138
+
139
+
140
if __name__ == "__main__":
    # Smoke test: runs a real LLM call, so GROQ_API_KEY must be configured
    test_input = """
    I am a 25-year-old male from Maharashtra. I completed my Bachelor's in Engineering.
    My family income is around 3 lakh per year. I belong to the OBC category.
    I am currently unemployed and looking for government job opportunities.
    """

    result = run_profiling_agent(test_input)
    print(json.dumps(result, indent=2))
agents/rag_agent.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ RAG Retrieval Agent
3
+ Dedicated agent for vector database queries
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from rag.scheme_vectorstore import load_scheme_vectorstore
9
+ from rag.exam_vectorstore import load_exam_vectorstore
10
+
11
+
12
def run_rag_agent(query: str, database: str = "schemes", k: int = 5, vectorstore=None) -> dict:
    """
    Performs RAG retrieval from the specified vectorstore.

    Args:
        query: Search query
        database: "schemes" or "exams"
        k: Number of documents to retrieve
        vectorstore: Optional pre-loaded vectorstore. When provided it is
            used directly and nothing is loaded from disk (mirrors the
            optional-vectorstore pattern used by the scheme agent).

    Returns:
        On success: dict with "query", "database", "documents_found" and
        "documents" (each a dict with id/content/metadata/source).
        On failure: dict with "error" and an empty "documents" list.
    """
    try:
        if vectorstore is None:
            if database == "schemes":
                vectorstore = load_scheme_vectorstore()
            elif database == "exams":
                vectorstore = load_exam_vectorstore()
            else:
                return {
                    "error": f"Invalid database: {database}. Use 'schemes' or 'exams'",
                    "documents": []
                }

        # Similarity search against the chosen store
        docs = vectorstore.similarity_search(query, k=k)

        # Normalize LangChain Document objects into plain dicts
        formatted_docs = []
        for i, doc in enumerate(docs):
            formatted_docs.append({
                "id": i + 1,
                "content": doc.page_content,
                "metadata": doc.metadata,
                "source": doc.metadata.get('source', 'Unknown')
            })

        return {
            "query": query,
            "database": database,
            "documents_found": len(formatted_docs),
            "documents": formatted_docs
        }

    except FileNotFoundError:
        # Raised by the loaders when the on-disk index is missing
        return {
            "error": f"Vectorstore not found for {database}. Please build it first.",
            "documents": []
        }
    except Exception as e:
        return {
            "error": str(e),
            "documents": []
        }
65
+
66
+
67
def hybrid_rag_search(query: str, k: int = 3) -> dict:
    """
    Searches both the scheme and the exam databases.

    Args:
        query: Search query
        k: Number of documents per database

    Returns:
        Combined results from both databases
    """
    return {
        "query": query,
        "scheme_results": run_rag_agent(query, database="schemes", k=k),
        "exam_results": run_rag_agent(query, database="exams", k=k),
    }
86
+
87
+
88
if __name__ == "__main__":
    # Manual smoke test against the scheme vectorstore
    output = run_rag_agent("agricultural schemes for farmers", database="schemes", k=3)
    print(json.dumps(output, indent=2))
agents/scheme_agent.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scheme Recommendation Agent
3
+ Provides RAG-based government scheme recommendations
4
+ Uses FAISS for local vector storage
5
+ """
6
+
7
+ import json
8
+ from langchain_groq import ChatGroq
9
+ from langchain_core.messages import HumanMessage, SystemMessage
10
+ from rag.scheme_vectorstore import load_scheme_vectorstore
11
+ from prompts.scheme_prompt import SCHEME_PROMPT
12
+ from tools.tavily_tool import government_focused_search
13
+ from config import GROQ_API_KEY
14
+
15
+
16
def get_llm():
    """Initialize Groq LLM.

    Raises:
        ValueError: when GROQ_API_KEY is missing from the environment.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    llm = ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        temperature=0.3,
    )
    return llm
26
+
27
+
28
def run_scheme_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends government schemes based on user profile.

    Combines (optional) RAG retrieval from a pre-loaded FAISS vectorstore
    with (optional) live Tavily web search, then asks the Groq LLM to
    produce recommendations from the combined context.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        On success: dict with "recommendations" (LLM text), "sources_used"
        (number of retrieved documents) and "web_search_used".
        On failure: dict with "error" and an empty "recommendations" list.
    """
    try:
        # Context assembled from the local vectorstore, if available
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                User Profile:
                Income: {profile_data.get('income', 'N/A')}
                Caste: {profile_data.get('caste', 'N/A')}
                State: {profile_data.get('state', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Gender: {profile_data.get('gender', 'N/A')}
                Employment: {profile_data.get('employment_status', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} scheme documents from vectorstore")
            except Exception as e:
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local scheme database available. Using live web search."

        # Create profile string for the prompt
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                state = profile_data.get('state', 'India')
                caste = profile_data.get('caste', '')
                web_query = f"government schemes India {state} {caste} eligibility benefits 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                # Best-effort: web search failure should not abort the agent
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve scheme information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = SCHEME_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert government scheme advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        return {
            "error": str(e),
            "recommendations": []
        }
127
+
128
+
129
if __name__ == "__main__":
    # Manual smoke test with a representative profile (no web search)
    sample_profile = {
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "age": 25,
        "gender": "Male",
        "employment_status": "Unemployed",
        "education": "Bachelor's in Engineering",
    }

    print(json.dumps(run_scheme_agent(sample_profile, use_web_search=False), indent=2))
agents/search_agent.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Web Search Agent
3
+ Uses Tavily to search government websites for real-time information
4
+ """
5
+
6
+ from tools.tavily_tool import tavily_search, government_focused_search
7
+
8
+
9
def run_search_agent(query: str, government_only: bool = True) -> dict:
    """
    Performs web search for government information.

    Args:
        query: Search query
        government_only: If True, restricts to .gov.in domains

    Returns:
        Search results dictionary; on failure includes an "error" key and
        an empty "results" list.
    """
    try:
        # Pick the search backend based on the domain restriction flag
        search_fn = government_focused_search if government_only else tavily_search

        return {
            "query": query,
            "results": search_fn(query),
            "government_only": government_only
        }

    except Exception as e:
        return {
            "query": query,
            "error": str(e),
            "results": []
        }
38
+
39
+
40
def search_scheme_details(scheme_name: str) -> dict:
    """
    Search for specific scheme details.

    Args:
        scheme_name: Name of the government scheme

    Returns:
        Scheme details from official sources
    """
    detail_query = f"{scheme_name} official website application process eligibility"
    return run_search_agent(detail_query, government_only=True)
52
+
53
+
54
def search_exam_details(exam_name: str) -> dict:
    """
    Search for specific exam details.

    Args:
        exam_name: Name of the competitive exam

    Returns:
        Exam details from official sources
    """
    detail_query = f"{exam_name} official notification eligibility exam pattern 2026"
    return run_search_agent(detail_query, government_only=True)
66
+
67
+
68
if __name__ == "__main__":
    # Manual smoke test against official (.gov.in) sources
    output = run_search_agent("pradhan mantri kisan samman nidhi yojana", government_only=True)
    print(output)
app.py ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ JanSahayak Flask Web Application
3
+ Beautiful UI for Multi-Agent Government Intelligence System
4
+ """
5
+
6
+ from flask import Flask, render_template, request, jsonify, session, send_file
7
+ import json
8
+ import os
9
+ from datetime import datetime
10
+ from graph.workflow import run_workflow
11
+ import uuid
12
+ import io
13
+ import re
14
+ from reportlab.lib.pagesizes import letter, A4
15
+ from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
16
+ from reportlab.lib.units import inch
17
+ from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle, PageBreak
18
+ from reportlab.lib import colors
19
+ from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_JUSTIFY
20
+
21
+ app = Flask(__name__)
22
+ app.secret_key = os.urandom(24) # For session management
23
+
24
+ # Store active sessions
25
+ sessions = {}
26
+
27
+ # Global vectorstores (loaded on first use for faster startup)
28
+ SCHEME_VECTORSTORE = None
29
+ EXAM_VECTORSTORE = None
30
+ VECTORSTORES_INITIALIZED = False
31
+
32
+ # Check if running on a memory-constrained platform
33
+ SKIP_VECTORSTORES = os.environ.get('SKIP_VECTORSTORES', 'false').lower() == 'true'
34
+
35
+
36
def initialize_vectorstores():
    """Load vectorstores lazily on first use to avoid blocking port binding.

    Mutates the module-level globals SCHEME_VECTORSTORE / EXAM_VECTORSTORE
    and sets VECTORSTORES_INITIALIZED so subsequent calls are no-ops.
    When SKIP_VECTORSTORES is set (memory-constrained deployments), both
    stores stay None and the agents fall back to web search only.
    Load failures are tolerated: the corresponding store stays None.
    """
    global SCHEME_VECTORSTORE, EXAM_VECTORSTORE, VECTORSTORES_INITIALIZED

    # Idempotence guard: only the first caller does the work.
    if VECTORSTORES_INITIALIZED:
        return  # Already initialized

    # Skip vectorstore loading on memory-constrained platforms (use web search only)
    if SKIP_VECTORSTORES:
        print("\n" + "="*70)
        print("⚡ LIGHTWEIGHT MODE: Skipping vectorstore loading")
        print("="*70)
        print("✅ Using Tavily web search only (no embeddings model)")
        print("✅ Low memory usage (<200MB)")
        print("✅ Real-time, up-to-date information")
        print("="*70 + "\n")
        SCHEME_VECTORSTORE = None
        EXAM_VECTORSTORE = None
        VECTORSTORES_INITIALIZED = True
        return

    print("\n" + "="*70)
    print("📚 Initializing Vector Stores (lazy loading)")
    print("="*70)

    # Load scheme vectorstore (import deferred so startup stays fast)
    try:
        from rag.scheme_vectorstore import load_scheme_vectorstore
        SCHEME_VECTORSTORE = load_scheme_vectorstore()
        print("✅ Scheme vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Scheme vectorstore not available: {str(e)}")
        print("   Will use web search only for schemes")
        SCHEME_VECTORSTORE = None

    # Load exam vectorstore (same deferred-import pattern)
    try:
        from rag.exam_vectorstore import load_exam_vectorstore
        EXAM_VECTORSTORE = load_exam_vectorstore()
        print("✅ Exam vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Exam vectorstore not available: {str(e)}")
        print("   Will use web search only for exams")
        EXAM_VECTORSTORE = None

    # Mark done even if one or both loads failed — we never retry.
    VECTORSTORES_INITIALIZED = True
    print("="*70 + "\n")
+
84
+
85
def format_markdown(text):
    """Convert markdown-style text to HTML (headers, emphasis, lists, paragraphs)."""
    if not text or not isinstance(text, str):
        return text

    import re

    # Regex conversions applied in order: headers first, bold before
    # italic (so ** is consumed before single *), then bullet items.
    conversions = [
        (r'###\s+(.+?)(?=\n|$)', r'<h4>\1</h4>', 0),
        (r'##\s+(.+?)(?=\n|$)', r'<h3>\1</h3>', 0),
        (r'\*\*(.+?)\*\*', r'<strong>\1</strong>', 0),
        (r'\*(.+?)\*', r'<em>\1</em>', 0),
        (r'^[\-\*]\s+(.+)$', r'<li>\1</li>', re.MULTILINE),
        (r'(<li>.*?</li>)', r'<ul>\1</ul>', re.DOTALL),
    ]
    for pattern, replacement, flags in conversions:
        text = re.sub(pattern, replacement, text, flags=flags)

    # Merge consecutive single-item lists produced by the <ul> wrap above
    text = text.replace('</ul>\n<ul>', '\n')

    # Blank lines separate paragraphs; remaining newlines become soft breaks
    text = text.replace('\n\n', '</p><p>')
    text = text.replace('\n', '<br>')

    # Wrap bare text in a paragraph when it doesn't already start with a tag
    if not text.startswith('<'):
        text = f'<p>{text}</p>'

    return text
116
+
117
+
118
+ # Register Jinja filter
119
+ app.jinja_env.filters['format_markdown'] = format_markdown
120
+
121
+
122
@app.route('/')
def index():
    """Render the landing page with the user-details input form."""
    return render_template('index.html')
126
+
127
+
128
@app.route('/about')
def about():
    """Render the static About page."""
    return render_template('about.html')
132
+
133
+
134
@app.route('/health')
def health():
    """Health check endpoint for monitoring.

    Reports service status plus which API keys are present (booleans only,
    never the key values themselves).
    """
    from config import GROQ_API_KEY, TAVILY_API_KEY, HF_TOKEN

    key_status = {
        'groq': bool(GROQ_API_KEY),
        'tavily': bool(TAVILY_API_KEY),
        'hf_token': bool(HF_TOKEN)
    }

    return jsonify({
        'status': 'ok',
        'service': 'JanSahayak',
        'api_keys_configured': key_status
    })
148
+
149
+
150
@app.route('/analyze', methods=['POST'])
def analyze():
    """Process user input and run the multi-agent workflow.

    Expects a JSON body with "user_input" (free text) and optionally
    "structured_data" (form fields). Returns the workflow result plus a
    session id, and persists the result to outputs/.
    """
    try:
        # First check if API keys are configured
        from config import GROQ_API_KEY, TAVILY_API_KEY

        if not GROQ_API_KEY or GROQ_API_KEY == "":
            return jsonify({
                'success': False,
                'error': 'GROQ_API_KEY is not configured. Please set environment variables on Render.'
            }), 500

        if not TAVILY_API_KEY or TAVILY_API_KEY == "":
            return jsonify({
                'success': False,
                'error': 'TAVILY_API_KEY is not configured. Please set environment variables on Render.'
            }), 500

        # Initialize vectorstores lazily on first request
        initialize_vectorstores()

        # Get user input. request.get_json(silent=True) returns None instead
        # of raising when the body is missing or not JSON, so a bad request
        # gets a clean 400 below rather than a 500.
        payload = request.get_json(silent=True) or {}
        user_input = payload.get('user_input', '')
        structured_data = payload.get('structured_data', None)

        # Guard against non-string input as well as empty input
        if not isinstance(user_input, str) or not user_input.strip():
            return jsonify({
                'success': False,
                'error': 'Please provide your details'
            }), 400

        # Generate session ID
        session_id = str(uuid.uuid4())

        # Store in session (including structured data if available)
        sessions[session_id] = {
            'status': 'processing',
            'input': user_input,
            'structured_data': structured_data,
            'started_at': datetime.now().isoformat()
        }

        # Extract user interests from structured data
        user_interests = structured_data.get('interests', ['schemes', 'exams']) if structured_data else ['schemes', 'exams']

        # Prepare structured profile if available
        structured_profile = None
        if structured_data:
            structured_profile = {
                'name': structured_data.get('name', 'Not Provided'),
                'age': structured_data.get('age', 'Not Provided'),
                'gender': structured_data.get('gender', 'Not Provided'),
                'state': structured_data.get('state', 'Not Provided'),
                'education': structured_data.get('education', 'Not Provided'),
                'employment_status': structured_data.get('employment', 'Not Provided'),
                'income': structured_data.get('income', 'Not Provided'),
                'caste': structured_data.get('category', 'Not Provided'),
                'specialization': structured_data.get('specialization', 'Not Provided'),
                'career_interest': structured_data.get('career_interest', 'Not Provided'),
                'interests': structured_data.get('interests', [])
            }

        # Run workflow with interests, structured profile, and pre-loaded vectorstores
        result = run_workflow(
            user_input,
            user_interests,
            structured_profile,
            scheme_vectorstore=SCHEME_VECTORSTORE,
            exam_vectorstore=EXAM_VECTORSTORE
        )

        # Ensure user_profile key exists in result
        if 'user_profile' not in result and 'profile' in result:
            result['user_profile'] = result['profile']

        # Update session
        sessions[session_id]['status'] = 'completed'
        sessions[session_id]['result'] = result
        sessions[session_id]['completed_at'] = datetime.now().isoformat()

        # Save to file
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = f"outputs/results_{timestamp}.json"
        os.makedirs('outputs', exist_ok=True)

        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2, ensure_ascii=False)

        return jsonify({
            'success': True,
            'session_id': session_id,
            'result': result,
            'filename': filename
        })

    except ImportError as e:
        print(f"Import Error in /analyze: {str(e)}")
        return jsonify({
            'success': False,
            'error': f'Configuration error: {str(e)}. Please ensure all dependencies are installed.'
        }), 500
    except TimeoutError as e:
        print(f"Timeout Error in /analyze: {str(e)}")
        return jsonify({
            'success': False,
            'error': 'Request timed out. The analysis is taking longer than expected. Please try again.'
        }), 504
    except Exception as e:
        print(f"Error in /analyze: {str(e)}")
        import traceback
        traceback.print_exc()
        return jsonify({
            'success': False,
            'error': f'An error occurred during analysis: {str(e)}'
        }), 500
266
+
267
+
268
@app.route('/result/<session_id>')
def result(session_id):
    """Display results page for a completed analysis session."""
    session_data = sessions.get(session_id)

    if session_data is None:
        return render_template('error.html',
                               error='Session not found'), 404

    if session_data['status'] != 'completed':
        return render_template('error.html',
                               error='Analysis still in progress'), 400

    return render_template('results.html',
                           session_id=session_id,
                           session_data=session_data,
                           result=session_data['result'])
285
+
286
+
287
@app.route('/api/status/<session_id>')
def status(session_id):
    """Check analysis status for a session id."""
    session_data = sessions.get(session_id)
    if session_data is None:
        return jsonify({'error': 'Session not found'}), 404

    return jsonify(session_data)
294
+
295
+
296
@app.route('/history')
def history():
    """View analysis history (up to the 10 most recent saved result files).

    Unreadable or corrupt JSON files are skipped instead of failing the
    whole page.
    """
    output_files = []

    if os.path.exists('outputs'):
        files = [f for f in os.listdir('outputs') if f.endswith('.json')]
        files.sort(reverse=True)

        for filename in files[:10]:  # Show last 10
            filepath = os.path.join('outputs', filename)
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            except (OSError, json.JSONDecodeError) as e:
                # One bad file should not break the history listing
                print(f"⚠️ Skipping unreadable history file {filename}: {e}")
                continue
            output_files.append({
                'filename': filename,
                'timestamp': filename.replace('results_', '').replace('.json', ''),
                'profile': data.get('user_profile', {}),
                'errors': data.get('errors', [])
            })

    return render_template('history.html', files=output_files)
317
+
318
+
319
@app.route('/api/file/<filename>')
def get_file(filename):
    """Return a saved result file as JSON.

    The filename comes from the URL, so it is restricted to a bare name
    inside outputs/ to prevent path traversal.
    """
    try:
        # Reject anything containing directory components (e.g. "../config.py")
        if os.path.basename(filename) != filename or filename in ('.', '..'):
            return jsonify({'error': 'Invalid filename'}), 400

        filepath = os.path.join('outputs', filename)
        with open(filepath, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return jsonify(data)
    except Exception as e:
        return jsonify({'error': str(e)}), 404
329
+
330
+
331
@app.route('/download/pdf/<session_id>')
def download_pdf(session_id):
    """Generate and download PDF report.

    Builds an in-memory reportlab PDF from the stored session result:
    title/greeting, user-profile table, then the scheme, exam and
    missed-benefits sections (markdown stripped to plain text), plus any
    workflow error notices and a footer disclaimer. Returns the PDF as an
    attachment named after the user; any failure returns a 500 JSON error.
    """
    try:
        # Unknown session ids get a JSON 404, matching the status endpoint.
        if session_id not in sessions:
            return jsonify({'error': 'Session not found'}), 404

        session_data = sessions[session_id]
        result = session_data.get('result', {})

        # Create PDF in memory
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(buffer, pagesize=letter,
                                rightMargin=72, leftMargin=72,
                                topMargin=72, bottomMargin=18)

        # Container for PDF elements
        elements = []

        # Define styles
        styles = getSampleStyleSheet()
        title_style = ParagraphStyle(
            'CustomTitle',
            parent=styles['Heading1'],
            fontSize=24,
            textColor=colors.HexColor('#5B21B6'),
            spaceAfter=30,
            alignment=TA_CENTER
        )
        heading_style = ParagraphStyle(
            'CustomHeading',
            parent=styles['Heading2'],
            fontSize=16,
            textColor=colors.HexColor('#7C3AED'),
            spaceAfter=12,
            spaceBefore=12
        )
        # NOTE(review): this mutates the shared 'BodyText' style from the
        # sample stylesheet rather than copying it — confirm this is intended.
        normal_style = styles['BodyText']
        normal_style.alignment = TA_JUSTIFY

        # Get user name for personalization; fall back to 'Citizen' when
        # the profiling agent left the placeholder.
        profile = result.get('user_profile', {})
        user_name = profile.get('name', 'Citizen')
        if user_name and user_name != 'Not Provided':
            user_name = user_name.strip()
        else:
            user_name = 'Citizen'

        # Title with logo-like header
        elements.append(Paragraph("🇮🇳 JanSahayak", title_style))
        elements.append(Paragraph("Government Benefits Analysis Report", styles['Heading3']))
        elements.append(Spacer(1, 0.2*inch))

        # Personalized greeting
        greeting = ParagraphStyle('Greeting', parent=styles['Normal'], fontSize=14,
                                  textColor=colors.HexColor('#374151'), spaceBefore=6, spaceAfter=12)
        elements.append(Paragraph(f"<b>Prepared for: {user_name}</b>", greeting))

        # Timestamp
        timestamp = datetime.now().strftime("%B %d, %Y at %I:%M %p")
        elements.append(Paragraph(f"<i>Generated: {timestamp}</i>", styles['Normal']))

        # Separator line
        elements.append(Spacer(1, 0.2*inch))
        elements.append(Table([['_'*100]], colWidths=[6.5*inch]))
        elements.append(Spacer(1, 0.4*inch))

        # User Profile Section
        elements.append(Paragraph("Your Profile", heading_style))
        profile = result.get('user_profile', {})

        if profile:
            # Build [label, value] rows, skipping internal keys and
            # unfilled placeholder values.
            profile_data = []
            for key, value in profile.items():
                if key not in ['raw_profile', 'user_input', 'error', 'note'] and value != 'Not Provided':
                    label = key.replace('_', ' ').title()
                    # Format interests list properly
                    if key == 'interests' and isinstance(value, list):
                        value = ', '.join([v.title() for v in value])
                    profile_data.append([Paragraph(f"<b>{label}:</b>", normal_style),
                                         Paragraph(str(value), normal_style)])

            if profile_data:
                profile_table = Table(profile_data, colWidths=[2.2*inch, 4.3*inch])
                profile_table.setStyle(TableStyle([
                    ('BACKGROUND', (0, 0), (0, -1), colors.HexColor('#EEF2FF')),  # Label column
                    ('BACKGROUND', (1, 0), (1, -1), colors.white),  # Value column
                    ('TEXTCOLOR', (0, 0), (-1, -1), colors.HexColor('#1F2937')),
                    ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
                    ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                    ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),  # Bold labels
                    ('FONTNAME', (1, 0), (1, -1), 'Helvetica'),
                    ('FONTSIZE', (0, 0), (-1, -1), 10),
                    ('BOTTOMPADDING', (0, 0), (-1, -1), 10),
                    ('TOPPADDING', (0, 0), (-1, -1), 10),
                    ('LEFTPADDING', (0, 0), (-1, -1), 12),
                    ('RIGHTPADDING', (0, 0), (-1, -1), 12),
                    ('GRID', (0, 0), (-1, -1), 1, colors.HexColor('#D1D5DB')),
                    ('ROWBACKGROUNDS', (0, 0), (-1, -1), [colors.white, colors.HexColor('#F9FAFB')]),
                ]))
                elements.append(profile_table)

        elements.append(Spacer(1, 0.4*inch))

        # Helper function to clean and format text:
        # strips HTML tags and markdown markers; returns None when the
        # section was not requested (caller then omits the section).
        def clean_text(text):
            if not text or not isinstance(text, str):
                return "No information available"
            # Skip if "Not requested by user"
            if "Not requested by user" in text:
                return None
            # Remove HTML tags
            text = re.sub(r'<[^>]+>', '', text)
            # Convert markdown headers to regular text with proper spacing
            text = re.sub(r'###\s+(.+)', r'\n\1\n', text)
            text = re.sub(r'##\s+(.+)', r'\n\1\n', text)
            # Clean up bold markers
            text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
            # Clean up bullet points
            text = re.sub(r'^\*\s+', '\u2022 ', text, flags=re.MULTILINE)
            text = re.sub(r'^-\s+', '\u2022 ', text, flags=re.MULTILINE)
            return text.strip()

        # Section style for better visual separation
        # NOTE(review): section_box_style is defined but never applied below —
        # confirm whether it should style the section paragraphs.
        section_box_style = ParagraphStyle(
            'SectionBox',
            parent=normal_style,
            leftIndent=20,
            rightIndent=20,
            spaceBefore=6,
            spaceAfter=6,
            borderColor=colors.HexColor('#E5E7EB'),
            borderWidth=1,
            borderPadding=10,
            backColor=colors.HexColor('#F9FAFB')
        )

        # Government Schemes Section
        schemes_text = clean_text(result.get('scheme_recommendations', 'No recommendations available'))
        if schemes_text:
            # NOTE(review): "\ud83c\udfdb\ufe0f" is a lone-surrogate escape pair in
            # Python source, not a joined emoji; if the real file does not contain
            # an actual emoji here, UTF-8 encoding of this string will fail — confirm.
            elements.append(Paragraph("\ud83c\udfdb\ufe0f Government Schemes for You", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            # Split into paragraphs and add with better formatting
            paragraphs = [p.strip() for p in schemes_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

            elements.append(Spacer(1, 0.2*inch))

        # Competitive Exams Section
        exams_text = clean_text(result.get('exam_recommendations', 'No recommendations available'))
        if exams_text:
            elements.append(Paragraph("\ud83c\udf93 Competitive Exams for You", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            paragraphs = [p.strip() for p in exams_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

            elements.append(Spacer(1, 0.2*inch))

        # Missed Benefits Section
        benefits_text = clean_text(result.get('missed_benefits_analysis', 'No analysis available'))
        if benefits_text:
            elements.append(Paragraph("\ud83d\udcca Missed Benefits Analysis", heading_style))
            elements.append(Spacer(1, 0.1*inch))

            paragraphs = [p.strip() for p in benefits_text.split('\n\n') if p.strip()]
            for para in paragraphs:
                if para:
                    elements.append(Paragraph(para, normal_style))
                    elements.append(Spacer(1, 0.15*inch))

        # Errors (if any)
        errors = result.get('errors', [])
        if errors:
            elements.append(Spacer(1, 0.3*inch))
            elements.append(Paragraph("Notices", heading_style))
            for error in errors:
                elements.append(Paragraph(f"• {error}", normal_style))

        # Footer with disclaimer
        elements.append(Spacer(1, 0.5*inch))

        # Add separator before footer
        elements.append(Table([['_'*100]], colWidths=[6.5*inch]))
        elements.append(Spacer(1, 0.2*inch))

        footer_style = ParagraphStyle('Footer', parent=styles['Normal'],
                                      fontSize=9, textColor=colors.HexColor('#6B7280'),
                                      alignment=TA_CENTER)
        elements.append(Paragraph(
            "<i>This report is generated by JanSahayak AI system. "
            "For official information and application procedures, "
            "please visit the respective government ministry websites or contact local government offices.</i>",
            footer_style
        ))
        elements.append(Spacer(1, 0.1*inch))
        elements.append(Paragraph(
            "<i>Generated by JanSahayak - Your Government Benefits Assistant</i>",
            footer_style
        ))

        # Build PDF
        doc.build(elements)

        # Prepare response: rewind the in-memory buffer before sending
        buffer.seek(0)

        # Create filename with user's name (strip unsafe characters)
        safe_name = re.sub(r'[^a-zA-Z0-9\s]', '', user_name).replace(' ', '_')
        timestamp_str = datetime.now().strftime("%Y%m%d")
        filename = f'JanSahayak_{safe_name}_{timestamp_str}.pdf'

        return send_file(
            buffer,
            as_attachment=True,
            download_name=filename,
            mimetype='application/pdf'
        )

    except Exception as e:
        print(f"PDF Generation Error: {str(e)}")
        return jsonify({'error': str(e)}), 500
560
+
561
+
562
if __name__ == '__main__':
    # Entry point for running the Flask development server directly.
    # Vectorstores are deliberately NOT loaded here — initialize_vectorstores()
    # runs lazily on the first /analyze request so the port binds immediately.

    # Get port from environment variable (for deployment platforms)
    port = int(os.environ.get('PORT', 5000))

    # Check if running in production (anything except FLASK_ENV=development)
    is_production = os.environ.get('FLASK_ENV') != 'development'

    print("\n" + "="*70)
    print("🙏 JANSAHAYAK - Starting Web Server")
    print("="*70)

    # Check API keys on startup (warn only; the server still starts)
    from config import GROQ_API_KEY, TAVILY_API_KEY, HF_TOKEN

    if not GROQ_API_KEY or GROQ_API_KEY == "":
        print("⚠️ WARNING: GROQ_API_KEY is not set!")
        print("   The application will not work without this API key.")
    else:
        print("✅ GROQ_API_KEY is configured")

    if not TAVILY_API_KEY or TAVILY_API_KEY == "":
        print("⚠️ WARNING: TAVILY_API_KEY is not set!")
        print("   The application will not work without this API key.")
    else:
        print("✅ TAVILY_API_KEY is configured")

    if not HF_TOKEN or HF_TOKEN == "":
        print("⚠️ WARNING: HF_TOKEN is not set (optional but recommended)")
    else:
        print("✅ HF_TOKEN is configured")

    print(f"\n📱 Starting Flask server on port {port}...")
    print(f"🌍 Environment: {'Production' if is_production else 'Development'}")
    print("🔄 Vectorstores will be loaded on first request")
    print("🛑 Press CTRL+C to stop the server\n")

    # Start Flask FIRST to bind to port, then load vectorstores in background
    app.run(debug=not is_production, host='0.0.0.0', port=port, threaded=True)
config.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the environment
# (no-op when the file is absent, e.g. on hosted deployments).
load_dotenv()

# API credentials read from the environment; each is None when unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")      # Groq LLM access
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")  # Tavily web search
HF_TOKEN = os.getenv("HF_TOKEN")              # Hugging Face hub (optional)
data/exams_pdfs/README.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Placeholder for competitive exam PDFs
2
+ # Add your competitive exam PDF files to this directory
3
+
4
+ # Examples of exams to add:
5
+ # - UPSC (Civil Services, NDA, CDS)
6
+ # - SSC (CGL, CHSL, MTS, JE)
7
+ # - Banking (IBPS, SBI PO/Clerk, RBI)
8
+ # - Railways (RRB NTPC, ALP, Group D)
9
+ # - State PSC exams
10
+ # - Defense exams (NDA, CDS, AFCAT)
11
+ # - Teaching exams (CTET, TET)
12
+
13
+ # Download official notifications and syllabi from exam conducting bodies
data/exams_pdfs/exam.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae2c52c533fe29a081d8fe477d079e1d2e610aa398be5a8324f63b583c5beacf
3
+ size 149005
data/schemes_pdfs/Government Welfare Schemes & Policies - Disha Experts.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad6e1fb3d26677250c6597ca9ed83f24000f8c062529f7188b693839f0c6ade9
3
+ size 2410388
data/schemes_pdfs/Government of India Welfare Schemes & Policies For Competitive Exams.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11f0608bcece884567ea3e98720c8e557d32d4fe203f3f1dde5356fcf39f7ee7
3
+ size 2387327
data/schemes_pdfs/README.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Placeholder for government scheme PDFs
2
+ # Add your government scheme PDF files to this directory
3
+
4
+ # Examples of schemes to add:
5
+ # - PM Kisan Samman Nidhi
6
+ # - Ayushman Bharat
7
+ # - PM Awas Yojana
8
+ # - Skill Development Schemes
9
+ # - Scholarships (SC/ST/OBC/Minority)
10
+ # - State-specific schemes
11
+
12
+ # Download official PDFs from government websites (.gov.in domains)
data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4f7683bac6e79e923ac9441191f073cdbb67c41fcf84d5b401b02ce51520648
3
+ size 511889
graph/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ """
2
+ Graph Module Init
3
+ """
graph/workflow.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LangGraph Workflow
3
+ Orchestrates multi-agent system using LangGraph
4
+ """
5
+
6
+ from typing import TypedDict, Annotated
7
+ from langgraph.graph import StateGraph, END
8
+ import operator
9
+
10
+
11
class AgentState(TypedDict):
    """
    State object that gets passed between agents.
    Contains all intermediate and final results of the workflow.
    """
    # --- Input ---
    user_input: str         # raw free-text description from the user
    user_interests: list    # subset of ['schemes', 'exams']

    # --- Pre-loaded vectorstores ---
    scheme_vectorstore: object  # FAISS vectorstore or None
    exam_vectorstore: object    # FAISS vectorstore or None

    # --- Profiling Agent output ---
    profile: dict           # structured user profile (may be pre-filled from a form)

    # --- Scheme Agent output ---
    scheme_recommendations: str

    # --- Exam Agent output ---
    exam_recommendations: str

    # --- Benefit Agent output ---
    missed_benefits: str

    # --- Final compiled output ---
    final_output: dict

    # Error tracking: operator.add lets parallel nodes append their error
    # lists instead of overwriting each other when the graph merges state.
    errors: Annotated[list, operator.add]
41
+
42
+
43
def profiling_node(state: AgentState) -> dict:
    """
    Node: User Profiling Agent.
    Extracts a structured profile from the raw user input, reusing any
    pre-extracted form data when it is rich enough to skip the LLM call.
    """
    from agents.profiling_agent import run_profiling_agent

    bookkeeping = ('raw_profile', 'user_input', 'error', 'note')
    placeholders = ['Not Provided', 'N/A', '', None]

    try:
        form_profile = state.get("profile", {})

        # Count fields that carry actual values (not placeholders/bookkeeping).
        filled = [
            key for key in form_profile.keys()
            if key not in bookkeeping and form_profile[key] not in placeholders
        ]

        # Rich enough form data -> no need to run the LLM profiler at all.
        if len(filled) >= 3:
            print("\n✅ Using pre-extracted profile data (skipping LLM profiling)")
            return {"profile": form_profile}

        print("\n🔍 Running Profiling Agent...")
        extracted = run_profiling_agent(state.get("user_input", ""))

        # On key collisions, the form-provided values win over LLM output.
        if form_profile:
            extracted = {**extracted, **form_profile}

        # Only "error" (plus at most one other key) means extraction failed.
        if "error" in extracted and len(extracted) <= 2:
            print("❌ Profile extraction failed, using fallback data")
            return {
                "profile": form_profile if form_profile else {},
                "errors": ["Profiling failed: " + extracted.get("error", "Unknown error")]
            }

        print("✅ Profile extracted successfully")
        return {"profile": extracted}

    except Exception as e:
        print(f"❌ Profiling Agent Error: {str(e)}")
        form_profile = state.get("profile", {})
        return {
            "profile": form_profile if form_profile else {},
            "errors": [f"Profiling: {str(e)}"]
        }
86
+
87
+
88
def scheme_node(state: AgentState) -> dict:
    """
    Node: Scheme Recommendation Agent.
    Produces government-scheme recommendations for the extracted profile.
    """
    from agents.scheme_agent import run_scheme_agent

    try:
        # Skip entirely when the user did not ask for schemes.
        wants = state.get("user_interests", ["schemes", "exams"])
        if "schemes" not in wants:
            print("\n⏭️ Skipping Scheme Agent (not requested)")
            return {"scheme_recommendations": "Not requested by user"}

        print("\n🏛️ Running Scheme Recommendation Agent...")
        profile = state.get("profile", {})
        store = state.get("scheme_vectorstore", None)

        # Count profile fields holding real values (ignore placeholders
        # and bookkeeping keys).
        meaningful = [
            field for field in profile.keys()
            if field not in ['raw_profile', 'user_input', 'error', 'note']
            and profile[field] not in ['Not Provided', 'N/A', '', None]
        ]

        if not profile or len(meaningful) < 2:
            print(f"⚠️ Limited profile data ({len(meaningful)} fields), will rely more on web search")
        else:
            print(f"✅ Profile has {len(meaningful)} useful fields")

        result = run_scheme_agent(profile, use_web_search=True, vectorstore=store)
        print("✅ Scheme recommendations generated")
        return {"scheme_recommendations": result.get("recommendations", "")}

    except Exception as e:
        print(f"❌ Scheme Agent Error: {str(e)}")
        return {
            "scheme_recommendations": f"Error generating recommendations: {str(e)}",
            "errors": [f"Scheme: {str(e)}"]
        }
126
+
127
+
128
def exam_node(state: AgentState) -> dict:
    """
    Node: Exam Recommendation Agent.
    Recommends competitive exams based on the extracted profile.
    """
    from agents.exam_agent import run_exam_agent

    try:
        # Check if user wants exam recommendations
        interests = state.get("user_interests", ["schemes", "exams"])
        if "exams" not in interests:
            print("\n⏭️ Skipping Exam Agent (not requested)")
            return {"exam_recommendations": "Not requested by user"}

        print("\n🎓 Running Exam Recommendation Agent...")
        profile = state.get("profile", {})
        exam_vectorstore = state.get("exam_vectorstore", None)

        # Check if profile has useful data.
        # FIX: mirror scheme_node/profiling_node and also exclude placeholder
        # values ('Not Provided', 'N/A', '', None) — previously a profile
        # consisting only of placeholders was counted as "useful".
        useful_fields = [k for k in profile.keys()
                         if k not in ['raw_profile', 'user_input', 'error', 'note']
                         and profile[k] not in ['Not Provided', 'N/A', '', None]]

        if not profile or len(useful_fields) < 2:
            print("⚠️ Insufficient profile data, using web search only")
            # Still try with whatever we have

        result = run_exam_agent(profile, use_web_search=True, vectorstore=exam_vectorstore)
        print("✅ Exam recommendations generated")
        return {"exam_recommendations": result.get("recommendations", "")}

    except Exception as e:
        print(f"❌ Exam Agent Error: {str(e)}")
        return {
            "exam_recommendations": f"Error generating recommendations: {str(e)}",
            "errors": [f"Exam: {str(e)}"]
        }
163
+
164
+
165
def benefit_node(state: AgentState) -> dict:
    """
    Node: Missed Benefits Calculator Agent.
    Estimates benefits the user may be missing, based on the profile and
    the scheme recommendations produced upstream.
    """
    from agents.benefit_agent import calculate_missed_benefits

    try:
        print("\n💰 Running Benefit Calculator Agent...")
        profile = state.get("profile", {})
        schemes_text = state.get("scheme_recommendations", "")

        # Both inputs are required; bail out early when either is empty.
        if not (profile and schemes_text):
            print("⚠️ Insufficient data for benefit calculation")
            return {"missed_benefits": "Insufficient data"}

        outcome = calculate_missed_benefits(profile, schemes_text)
        print("✅ Benefit calculation completed")
        return {"missed_benefits": outcome.get("calculation", "")}

    except Exception as e:
        print(f"❌ Benefit Agent Error: {str(e)}")
        return {
            "missed_benefits": "",
            "errors": [f"Benefit: {str(e)}"]
        }
191
+
192
+
193
def output_node(state: AgentState) -> dict:
    """
    Node: Final Output Compiler.
    Gathers every agent's result from the state into one response dict.
    """
    print("\n📊 Compiling Final Output...")

    # (output key, state key, default when missing)
    field_map = (
        ("user_profile", "profile", {}),
        ("scheme_recommendations", "scheme_recommendations", ""),
        ("exam_recommendations", "exam_recommendations", ""),
        ("missed_benefits_analysis", "missed_benefits", ""),
        ("errors", "errors", []),
    )
    final_output = {out: state.get(src, default) for out, src, default in field_map}

    print("✅ Final output ready")

    return {"final_output": final_output}
211
+
212
+
213
def build_workflow():
    """
    Builds the LangGraph workflow.

    Topology:
        profiling -> (scheme, exam) -> benefit -> output -> END

    Returns:
        Compiled workflow graph
    """
    graph = StateGraph(AgentState)

    # Register every agent node under its graph name.
    for node_name, node_fn in (
        ("profiling", profiling_node),
        ("scheme", scheme_node),
        ("exam", exam_node),
        ("benefit", benefit_node),
        ("output", output_node),
    ):
        graph.add_node(node_name, node_fn)

    # Profiling runs first, fans out to scheme and exam; both converge on
    # benefit (which runs after both complete), then output finishes.
    graph.set_entry_point("profiling")
    graph.add_edge("profiling", "scheme")
    graph.add_edge("profiling", "exam")
    graph.add_edge("scheme", "benefit")
    graph.add_edge("exam", "benefit")
    graph.add_edge("benefit", "output")
    graph.add_edge("output", END)

    return graph.compile()
250
+
251
+
252
def run_workflow(user_input: str, user_interests: list = None, structured_profile: dict = None,
                 scheme_vectorstore=None, exam_vectorstore=None) -> dict:
    """
    Runs the complete multi-agent workflow.

    Args:
        user_input: Raw user input text
        user_interests: List of interests ['schemes', 'exams']
        structured_profile: Pre-extracted profile data from form (optional)
        scheme_vectorstore: Pre-loaded scheme vectorstore (optional)
        exam_vectorstore: Pre-loaded exam vectorstore (optional)

    Returns:
        Final compiled output dictionary
    """
    banner = "=" * 60
    print(banner)
    print("🚀 Starting JanSahayak Multi-Agent System")
    print(banner)

    if user_interests:
        print(f"🎯 User Interests: {', '.join(user_interests)}")
    if structured_profile:
        print("📋 Using structured profile data from form")
    if scheme_vectorstore:
        print("📚 Using pre-loaded scheme vectorstore")
    if exam_vectorstore:
        print("📚 Using pre-loaded exam vectorstore")

    # Seed the graph state and run the compiled workflow end-to-end.
    initial_state = {
        "user_input": user_input,
        "user_interests": user_interests or ["schemes", "exams"],
        "profile": structured_profile if structured_profile else {},
        "scheme_vectorstore": scheme_vectorstore,
        "exam_vectorstore": exam_vectorstore,
        "errors": []
    }
    result = build_workflow().invoke(initial_state)

    print("\n" + banner)
    print("✅ Workflow Completed")
    print(banner)

    return result.get("final_output", {})
303
+
304
+
305
if __name__ == "__main__":
    import json

    # Quick manual smoke test for the full workflow.
    test_input = """
    I am a 25-year-old male from Maharashtra. I completed my Bachelor's in Engineering.
    My family income is around 3 lakh per year. I belong to the OBC category.
    I am currently unemployed and looking for government job opportunities.
    I am interested in technical positions and government jobs.
    """

    result = run_workflow(test_input)

    print("\n📄 Final Result:")
    print("=" * 60)
    print(json.dumps(result, indent=2, ensure_ascii=False))
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/58d4a9a45664eb9e12de9549c548c09b6134c17f.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/59d594003bf59880a884c574bf88ef7555bb0202.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/72b987fd805cfa2b58c4c8c952b274a11bfd5a00.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/952a9b81c0bfd99800fabf352f69c7ccd46c5e43.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/c79f2b6a0cea6f4b564fed1938984bace9d30ff0.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/cb202bfe2e3c98645018a6d12f182a434c9d3e02.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/d1514c3162bbe87b343f565fadc62e6c06f04f03.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/e7b0375001f109a6b8873d756ad4f7bbb15fbaa5.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fb140275c155a9c7c5a3b3e0e77a9e839594a938.lock ADDED
File without changes
hf_cache/.locks/models--sentence-transformers--all-MiniLM-L6-v2/fd1b291129c607e5d49799f87cb219b27f98acdf.lock ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/adapter_config.json ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/added_tokens.json ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/.no_exist/c9745ed1d9f207416be6d2e6f8de32d1f16199bf/chat_template.jinja ADDED
File without changes
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
3
+ size 90868376
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: en
3
+ license: apache-2.0
4
+ library_name: sentence-transformers
5
+ tags:
6
+ - sentence-transformers
7
+ - feature-extraction
8
+ - sentence-similarity
9
+ - transformers
10
+ datasets:
11
+ - s2orc
12
+ - flax-sentence-embeddings/stackexchange_xml
13
+ - ms_marco
14
+ - gooaq
15
+ - yahoo_answers_topics
16
+ - code_search_net
17
+ - search_qa
18
+ - eli5
19
+ - snli
20
+ - multi_nli
21
+ - wikihow
22
+ - natural_questions
23
+ - trivia_qa
24
+ - embedding-data/sentence-compression
25
+ - embedding-data/flickr30k-captions
26
+ - embedding-data/altlex
27
+ - embedding-data/simple-wiki
28
+ - embedding-data/QQP
29
+ - embedding-data/SPECTER
30
+ - embedding-data/PAQ_pairs
31
+ - embedding-data/WikiAnswers
32
+ pipeline_tag: sentence-similarity
33
+ ---
34
+
35
+
36
+ # all-MiniLM-L6-v2
37
+ This is a [sentence-transformers](https://www.SBERT.net) model: It maps sentences & paragraphs to a 384 dimensional dense vector space and can be used for tasks like clustering or semantic search.
38
+
39
+ ## Usage (Sentence-Transformers)
40
+ Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
41
+
42
+ ```
43
+ pip install -U sentence-transformers
44
+ ```
45
+
46
+ Then you can use the model like this:
47
+ ```python
48
+ from sentence_transformers import SentenceTransformer
49
+ sentences = ["This is an example sentence", "Each sentence is converted"]
50
+
51
+ model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
52
+ embeddings = model.encode(sentences)
53
+ print(embeddings)
54
+ ```
55
+
56
+ ## Usage (HuggingFace Transformers)
57
+ Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: First, you pass your input through the transformer model, then you have to apply the right pooling-operation on-top of the contextualized word embeddings.
58
+
59
+ ```python
60
+ from transformers import AutoTokenizer, AutoModel
61
+ import torch
62
+ import torch.nn.functional as F
63
+
64
+ #Mean Pooling - Take attention mask into account for correct averaging
65
+ def mean_pooling(model_output, attention_mask):
66
+ token_embeddings = model_output[0] #First element of model_output contains all token embeddings
67
+ input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
68
+ return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
69
+
70
+
71
+ # Sentences we want sentence embeddings for
72
+ sentences = ['This is an example sentence', 'Each sentence is converted']
73
+
74
+ # Load model from HuggingFace Hub
75
+ tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
76
+ model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
77
+
78
+ # Tokenize sentences
79
+ encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
80
+
81
+ # Compute token embeddings
82
+ with torch.no_grad():
83
+ model_output = model(**encoded_input)
84
+
85
+ # Perform pooling
86
+ sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
87
+
88
+ # Normalize embeddings
89
+ sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
90
+
91
+ print("Sentence embeddings:")
92
+ print(sentence_embeddings)
93
+ ```
94
+
95
+ ------
96
+
97
+ ## Background
98
+
99
+ The project aims to train sentence embedding models on very large sentence level datasets using a self-supervised
100
+ contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned in on a
101
+ 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences, was actually paired with it in our dataset.
102
+
103
+ We developed this model during the
104
+ [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
105
+ organized by Hugging Face. We developed this model as part of the project:
106
+ [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPUs v3-8, as well as intervention from Googles Flax, JAX, and Cloud team member about efficient deep learning frameworks.
107
+
108
+ ## Intended uses
109
+
110
+ Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
111
+ the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
112
+
113
+ By default, input text longer than 256 word pieces is truncated.
114
+
115
+
116
+ ## Training procedure
117
+
118
+ ### Pre-training
119
+
120
+ We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
121
+
122
+ ### Fine-tuning
123
+
124
+ We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity from each possible sentence pairs from the batch.
125
+ We then apply the cross entropy loss by comparing with true pairs.
126
+
127
+ #### Hyper parameters
128
+
129
+ We trained our model on a TPU v3-8. We train the model during 100k steps using a batch size of 1024 (128 per TPU core).
130
+ We use a learning rate warm up of 500. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
131
+ a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
132
+
133
+ #### Training data
134
+
135
+ We use the concatenation from multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion sentences.
136
+ We sampled each dataset given a weighted probability which configuration is detailed in the `data_config.json` file.
137
+
138
+
139
+ | Dataset | Paper | Number of training tuples |
140
+ |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
141
+ | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
142
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
143
+ | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
144
+ | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
145
+ | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
146
+ | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
147
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
148
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
149
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
150
+ | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
151
+ | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
152
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
153
+ | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
154
+ | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395|
155
+ | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
156
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
157
+ | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
158
+ | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
159
+ | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
160
+ | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
161
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
162
+ | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
163
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
164
+ | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
165
+ | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
166
+ | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
167
+ | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
168
+ | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
169
+ | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
170
+ | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
171
+ | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
172
+ | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
173
+ | **Total** | | **1,170,060,424** |
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/59d594003bf59880a884c574bf88ef7555bb0202 ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "max_seq_length": 256,
3
+ "do_lower_case": false
4
+ }
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/72b987fd805cfa2b58c4c8c952b274a11bfd5a00 ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "nreimers/MiniLM-L6-H384-uncased",
3
+ "architectures": [
4
+ "BertModel"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "gradient_checkpointing": false,
8
+ "hidden_act": "gelu",
9
+ "hidden_dropout_prob": 0.1,
10
+ "hidden_size": 384,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 1536,
13
+ "layer_norm_eps": 1e-12,
14
+ "max_position_embeddings": 512,
15
+ "model_type": "bert",
16
+ "num_attention_heads": 12,
17
+ "num_hidden_layers": 6,
18
+ "pad_token_id": 0,
19
+ "position_embedding_type": "absolute",
20
+ "transformers_version": "4.8.2",
21
+ "type_vocab_size": 2,
22
+ "use_cache": true,
23
+ "vocab_size": 30522
24
+ }
hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/952a9b81c0bfd99800fabf352f69c7ccd46c5e43 ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.models.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.models.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.models.Normalize"
19
+ }
20
+ ]