diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..24da3715a2c1e03556e8ec8aeafd333fb9b75171 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,31 @@ +# Environment +.env +.venv/ +venv/ +env/ + +# Python cache +__pycache__/ +*.py[cod] +*$py.class +*.so + +# Git +.git/ +.gitignore + +# IDE +.vscode/ +.idea/ + +# Documentation +*.md +ARCHITECTURE.txt +PROJECT_STRUCTURE.txt + +# Outputs (will be generated) +outputs/*.json + +# RAG indexes (build during deployment) +rag/scheme_index/ +rag/exam_index/ diff --git a/.env.example b/.env.example new file mode 100644 index 0000000000000000000000000000000000000000..efe999fecbdb553316f59e7fe4cb931dea4e7b4d --- /dev/null +++ b/.env.example @@ -0,0 +1,8 @@ +GROQ_API_KEY="your_groq_api_key_here" +TAVILY_API_KEY="your_tavily_api_key_here" +HF_TOKEN="your_huggingface_token_here" + +# Skip vectorstores on memory-constrained platforms +# Set to "true" to use only web search (saves ~300MB RAM) +# Set to "false" to use FAISS vectorstores (for Hugging Face Spaces) +SKIP_VECTORSTORES="false" diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..8ef15780fb8400e93db654c182f87bf249ef9ace 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +data/exams_pdfs/exam.pdf filter=lfs diff=lfs merge=lfs -text +data/schemes_pdfs/all-indian-government-schemes-list-2026-716.pdf filter=lfs diff=lfs merge=lfs -text +data/schemes_pdfs/Government[[:space:]]of[[:space:]]India[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]For[[:space:]]Competitive[[:space:]]Exams.pdf filter=lfs diff=lfs merge=lfs -text 
+data/schemes_pdfs/Government[[:space:]]Welfare[[:space:]]Schemes[[:space:]]&[[:space:]]Policies[[:space:]]-[[:space:]]Disha[[:space:]]Experts.pdf filter=lfs diff=lfs merge=lfs -text +hf_cache/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db filter=lfs diff=lfs merge=lfs -text +rag/scheme_index/index.faiss filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d8c3321c364cc97ec382b6a2d732abb18eafc7c4 --- /dev/null +++ b/.gitignore @@ -0,0 +1,29 @@ +# Environment +.env +.venv/ +venv/ +env/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so + +# HuggingFace Cache (downloaded models) +hf_cache/ + +# RAG Indexes (now included for production) +# rag/scheme_index/ +# rag/exam_index/ + +# Outputs +outputs/*.json + +# IDE +.vscode/ +.idea/ + +# Data files (optional - uncomment if PDFs are large) +# data/schemes_pdfs/*.pdf +# data/exams_pdfs/*.pdf diff --git a/ARCHITECTURE.txt b/ARCHITECTURE.txt new file mode 100644 index 0000000000000000000000000000000000000000..c091bae2bae8d033785b251df4f9f8cb75d13144 --- /dev/null +++ b/ARCHITECTURE.txt @@ -0,0 +1,330 @@ +""" +JanSahayak Architecture Overview +================================ + +SYSTEM COMPONENTS +----------------- + +1. AGENTS (agents/) + - profiling_agent.py → User Profile Extraction + - scheme_agent.py → Government Scheme Recommendations + - exam_agent.py → Competitive Exam Recommendations + - search_agent.py → Live Web Search (Tavily) + - rag_agent.py → Vector Database Retrieval + - document_agent.py → PDF/Image Text Extraction + - benefit_agent.py → Missed Benefits Calculator + +2. PROMPTS (prompts/) + - profiling_prompt.py → User profiling instructions + - scheme_prompt.py → Scheme recommendation template + - exam_prompt.py → Exam recommendation template + - rag_prompt.py → RAG retrieval instructions + +3. 
RAG SYSTEM (rag/) + - embeddings.py → HuggingFace embeddings (CPU) + - scheme_vectorstore.py → FAISS store for schemes + - exam_vectorstore.py → FAISS store for exams + +4. TOOLS (tools/) + - tavily_tool.py → Live government website search + +5. WORKFLOW (graph/) + - workflow.py → LangGraph orchestration + +6. I/O HANDLERS (agent_io/) + - profiling_io.py → Profiling agent I/O + - scheme_io.py → Scheme agent I/O + - exam_io.py → Exam agent I/O + - benefit_io.py → Benefit agent I/O + +7. DATA (data/) + - schemes_pdfs/ → Government scheme PDFs + - exams_pdfs/ → Competitive exam PDFs + +8. OUTPUTS (outputs/) + - results_*.json → Generated analysis results + +9. CONFIGURATION + - config.py → Configuration loader + - .env → API keys (user creates) + - requirements.txt → Python dependencies + +10. ENTRY POINTS + - main.py → Main application + - setup.py → Setup wizard + + +WORKFLOW EXECUTION +------------------ + +User Input + ↓ +[Profiling Agent] + ↓ + ├─→ [Scheme Agent] ──→ [Benefit Agent] ──┐ + │ ↓ │ + │ [RAG Search] │ + │ ↓ │ + │ [Tavily Search] │ + │ │ + └─→ [Exam Agent] ────────────────────────┤ + ↓ │ + [RAG Search] │ + ↓ │ + [Tavily Search] │ + ↓ + [Final Output] + ↓ + [JSON Results File] + + +TECHNOLOGY STACK +---------------- + +LLM & AI: +- Groq API (llama-3.3-70b-versatile) → Fast inference +- LangChain → Agent framework +- LangGraph → Workflow orchestration + +Embeddings & Search: +- HuggingFace Transformers → sentence-transformers/all-MiniLM-L6-v2 +- FAISS (CPU) → Vector similarity search + +Web Search: +- Tavily API → Government website search + +Document Processing: +- PyPDF → PDF text extraction +- Pytesseract → OCR for images +- Pillow → Image processing + +Infrastructure: +- Python 3.8+ +- CPU-only deployment (no GPU needed) +- PyTorch CPU version + + +DATA FLOW +--------- + +1. User Input Processing: + Raw Text → Profiling Agent → Structured JSON Profile + +2. 
Scheme Recommendation: + Profile → RAG Query → Vectorstore Search → Top-K Documents + Profile + Documents → Tavily Search (optional) → Web Results + Profile + Documents + Web Results → LLM → Recommendations + +3. Exam Recommendation: + Profile → RAG Query → Vectorstore Search → Top-K Documents + Profile + Documents → Tavily Search (optional) → Web Results + Profile + Documents + Web Results → LLM → Recommendations + +4. Benefit Calculation: + Profile + Scheme Recommendations → LLM → Missed Benefits Analysis + +5. Final Output: + All Results → JSON Compilation → File Save → User Display + + +API INTERACTIONS +---------------- + +1. Groq API: + - Used by: All LLM-powered agents + - Model: llama-3.3-70b-versatile + - Purpose: Natural language understanding & generation + - Rate: Per-request basis + +2. Tavily API: + - Used by: search_agent, scheme_agent, exam_agent + - Purpose: Live government website search + - Filter: .gov.in domains preferred + - Depth: Advanced search mode + +3. HuggingFace: + - Used by: embeddings module + - Model: sentence-transformers/all-MiniLM-L6-v2 + - Purpose: Document embeddings for RAG + - Local: Runs on CPU, cached after first download + + +VECTORSTORE ARCHITECTURE +------------------------ + +Scheme Vectorstore (rag/scheme_index/): +├── index.faiss → FAISS index file +├── index.pkl → Metadata pickle +└── [Embedded chunks from schemes_pdfs/] + +Exam Vectorstore (rag/exam_index/): +├── index.faiss → FAISS index file +├── index.pkl → Metadata pickle +└── [Embedded chunks from exams_pdfs/] + +Embedding Dimension: 384 +Similarity Metric: Cosine similarity +Chunk Size: Auto (from PyPDF) + + +AGENT SPECIALIZATIONS +--------------------- + +1. Profiling Agent: + - Extraction-focused + - Low temperature (0.1) + - JSON output required + - No external tools + +2. Scheme Agent: + - RAG + Web search + - Temperature: 0.3 + - Tools: Vectorstore, Tavily + - Output: Detailed scheme info + +3. 
Exam Agent: + - RAG + Web search + - Temperature: 0.3 + - Tools: Vectorstore, Tavily + - Output: Detailed exam info + +4. Benefit Agent: + - Calculation-focused + - Temperature: 0.2 + - No external tools + - Output: Financial analysis + +5. Search Agent: + - Web search only + - Tool: Tavily API + - Focus: .gov.in domains + - Output: Live search results + +6. RAG Agent: + - Vectorstore query only + - Tool: FAISS + - Similarity search + - Output: Relevant documents + +7. Document Agent: + - File processing + - Tools: PyPDF, Pytesseract + - Supports: PDF, Images + - Output: Extracted text + + +SECURITY & PRIVACY +------------------ + +- API keys stored in .env (not committed to git) +- User data processed locally except LLM calls +- No data stored on external servers (except API providers) +- PDF data remains local +- Vectorstores are local +- Output files saved locally + + +SCALABILITY NOTES +----------------- + +Current Setup (Single User): +- Synchronous workflow +- Local vectorstores +- CPU processing + +Potential Scaling: +- Add Redis for caching +- Use cloud vectorstore (Pinecone, Weaviate) +- Parallel agent execution +- GPU acceleration for embeddings +- Database for user profiles +- API service deployment + + +ERROR HANDLING +-------------- + +Each agent includes: +- Try-catch blocks +- Error state tracking +- Graceful degradation +- Partial results on failure +- Error reporting in final output + + +MONITORING & LOGGING +-------------------- + +Current: +- Console print statements +- Agent start/completion messages +- Error messages +- Final output summary + +Future Enhancement: +- Structured logging (logging module) +- Performance metrics +- API usage tracking +- User feedback collection + + +EXTENSIBILITY +------------- + +Adding New Agent: +1. Create agent file in agents/ +2. Add prompt template in prompts/ +3. Create node function in workflow.py +4. Add node to graph +5. Define edges (connections) +6. 
Optional: Create I/O handler + +Adding New Data Source: +1. Create vectorstore module in rag/ +2. Add PDFs to data/ subdirectory +3. Build vectorstore +4. Create agent or modify existing + +Adding New Tool: +1. Create tool in tools/ +2. Import in agent +3. Use in agent logic + + +PERFORMANCE BENCHMARKS (Typical) +--------------------------------- + +Vectorstore Building: +- 10 PDFs: ~2-5 minutes +- 100 PDFs: ~20-30 minutes + +Query Performance: +- Profiling: ~1-2 seconds +- RAG Search: ~0.5-1 second +- LLM Call: ~1-3 seconds +- Web Search: ~2-4 seconds +- Full Workflow: ~10-20 seconds + +Memory Usage: +- Base: ~500 MB +- With models: ~2-3 GB +- With large PDFs: +500 MB per 100 PDFs + + +FUTURE ENHANCEMENTS +------------------- + +1. Multilingual Support (Hindi, regional languages) +2. Voice input/output +3. Mobile app integration +4. Database for user history +5. Notification system for deadlines +6. Document upload interface +7. Real-time scheme updates +8. Community feedback integration +9. State-specific customization +10. Integration with government portals + + +END OF ARCHITECTURE DOCUMENT +""" diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..c7493831a94aa46d594af5b44b3bc3acbe470872 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,28 @@ +# HuggingFace Spaces Dockerfile +FROM python:3.12-slim + +WORKDIR /app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + build-essential \ + curl \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better caching +COPY requirements.txt . + +# Install Python dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application files +COPY . . 
+ +# Expose port 7860 (HuggingFace Spaces default) +EXPOSE 7860 + +# Set environment variable for port +ENV PORT=7860 + +# Run the application +CMD ["python", "app.py"] diff --git a/PROJECT_STRUCTURE.txt b/PROJECT_STRUCTURE.txt new file mode 100644 index 0000000000000000000000000000000000000000..f31075baf19f314072829a85d606ed1bab2c61b3 --- /dev/null +++ b/PROJECT_STRUCTURE.txt @@ -0,0 +1,387 @@ +JanSahayak - Multi-Agent Government Intelligence System +======================================================== + +📦 JanSahayak/ +│ +├── 📄 main.py # Main entry point +├── 📄 setup.py # Setup wizard & utilities +├── 📄 config.py # Configuration loader +├── 📄 requirements.txt # Python dependencies +│ +├── 📄 README.md # Project overview +├── 📄 USAGE_GUIDE.md # Comprehensive usage guide +├── 📄 ARCHITECTURE.txt # System architecture +│ +├── 📄 .env.example # Example environment file +├── 📄 .gitignore # Git ignore rules +│ +├── 📁 agents/ # Agent modules +│ ├── __init__.py +│ ├── profiling_agent.py # 🧾 User profiling +│ ├── scheme_agent.py # 🏛️ Scheme recommendations +│ ├── exam_agent.py # 🎓 Exam recommendations +│ ├── search_agent.py # 🔎 Web search (Tavily) +│ ├── rag_agent.py # 📚 RAG retrieval +│ ├── document_agent.py # 📂 Document processing +│ └── benefit_agent.py # 💰 Benefit calculator +│ +├── 📁 prompts/ # Prompt templates +│ ├── __init__.py +│ ├── profiling_prompt.py # Profiling instructions +│ ├── scheme_prompt.py # Scheme recommendation template +│ ├── exam_prompt.py # Exam recommendation template +│ └── rag_prompt.py # RAG retrieval template +│ +├── 📁 rag/ # RAG system +│ ├── __init__.py +│ ├── embeddings.py # HuggingFace embeddings +│ ├── scheme_vectorstore.py # Scheme FAISS store +│ ├── exam_vectorstore.py # Exam FAISS store +│ ├── scheme_index/ # Generated vectorstore +│ │ ├── index.faiss +│ │ └── index.pkl +│ └── exam_index/ # Generated vectorstore +│ ├── index.faiss +│ └── index.pkl +│ +├── 📁 tools/ # External tools +│ ├── __init__.py +│ └── tavily_tool.py # Tavily 
search integration +│ +├── 📁 graph/ # Workflow orchestration +│ ├── __init__.py +│ └── workflow.py # LangGraph workflow +│ +├── 📁 agent_io/ # Agent I/O handlers +│ ├── __init__.py +│ ├── profiling_io.py # Profiling I/O +│ ├── scheme_io.py # Scheme I/O +│ ├── exam_io.py # Exam I/O +│ └── benefit_io.py # Benefit I/O +│ +├── 📁 data/ # PDF data +│ ├── schemes_pdfs/ # Government scheme PDFs +│ │ └── README.txt +│ └── exams_pdfs/ # Competitive exam PDFs +│ └── README.txt +│ +└── 📁 outputs/ # Generated results + ├── README.txt + └── results_*.json # Analysis results + + +KEY FILES DESCRIPTION +===================== + +📄 main.py +---------- +Main application entry point with: +- Interactive mode for user input +- File mode for batch processing +- Result saving and formatting +- Summary display + +📄 setup.py +----------- +Setup wizard that: +- Checks dependencies +- Verifies API keys +- Validates PDF data +- Builds vectorstores + +📄 config.py +------------ +Loads configuration from .env: +- GROQ_API_KEY +- TAVILY_API_KEY +- HF_TOKEN + +📁 agents/ +---------- +7 specialized agents: +1. profiling_agent.py → Extract user profile +2. scheme_agent.py → Recommend schemes +3. exam_agent.py → Recommend exams +4. search_agent.py → Live web search +5. rag_agent.py → Vector search +6. document_agent.py → Process PDFs/images +7. 
benefit_agent.py → Calculate missed benefits + +📁 prompts/ +----------- +Prompt engineering templates for: +- User profiling instructions +- Scheme recommendation format +- Exam recommendation format +- RAG retrieval guidance + +📁 rag/ +------- +RAG (Retrieval Augmented Generation) system: +- embeddings.py → HuggingFace embeddings +- scheme_vectorstore.py → Scheme database +- exam_vectorstore.py → Exam database +- *_index/ → Generated FAISS indexes + +📁 tools/ +--------- +External tool integrations: +- tavily_tool.py → Tavily API for government website search + +📁 graph/ +--------- +LangGraph workflow orchestration: +- workflow.py → Defines agent connections and execution flow + +📁 agent_io/ +------------ +Input/Output handlers for each agent: +- Separate I/O files for tracking +- JSON-based data exchange +- Timestamp tracking + +📁 data/ +-------- +Training data for RAG: +- schemes_pdfs/ → Government scheme documents +- exams_pdfs/ → Competitive exam documents + +📁 outputs/ +----------- +Generated analysis results: +- results_YYYYMMDD_HHMMSS.json +- Contains all agent outputs + + +WORKFLOW VISUALIZATION +====================== + + User Input (Text) + ↓ + ┌───────────────┐ + │ Profiling │ + │ Agent │ + └───────┬───────┘ + │ + Structured Profile + │ + ┌───────────────┼───────────────┐ + ↓ ↓ +┌───────────────┐ ┌───────────────┐ +│ Scheme │ │ Exam │ +│ Agent │ │ Agent │ +└───────┬───────┘ └───────┬───────┘ + │ │ + ├─→ RAG Search ├─→ RAG Search + ├─→ Web Search └─→ Web Search + ↓ │ +┌───────────────┐ │ +│ Benefit │ │ +│ Agent │ │ +└───────┬───────┘ │ + │ │ + └───────────────┬───────────────┘ + ↓ + ┌───────────────┐ + │ Final │ + │ Output │ + └───────────────┘ + ↓ + JSON File + + +TECHNOLOGY COMPONENTS +===================== + +🧠 Brain (LLM) +- Groq API (llama-3.3-70b-versatile) +- Fast inference (<2s per call) +- Powers all agents + +📚 Memory (RAG) +- HuggingFace embeddings (all-MiniLM-L6-v2) +- FAISS vectorstore (CPU) +- Semantic search + +🔍 Live Search +- Tavily API 
+- Government website focus +- Real-time information + +🔗 Orchestration +- LangChain (agent framework) +- LangGraph (workflow) +- State management + +📄 Document Processing +- PyPDF (PDF extraction) +- Pytesseract (OCR) +- Pillow (image handling) + + +QUICK START CHECKLIST +====================== + +□ 1. Install dependencies + pip install -r requirements.txt + +□ 2. Create .env file + Copy .env.example to .env + Add GROQ_API_KEY and TAVILY_API_KEY + +□ 3. Add PDF data + Place PDFs in data/schemes_pdfs/ + Place PDFs in data/exams_pdfs/ + +□ 4. Run setup + python setup.py + +□ 5. Build vectorstores + Automatic during setup, or: + python setup.py --build-vectorstores + +□ 6. Run the system + python main.py + + +USAGE EXAMPLES +============== + +Interactive Mode: +----------------- +$ python main.py + +Enter your details: +I am 25 years old, male, from Maharashtra. +My family income is 3 lakh per year. +I belong to OBC category. +I completed Bachelor's in Engineering. +I am unemployed and looking for government jobs. +I am interested in technical and banking sectors. 
+ +[Press Enter twice to submit] + + +File Mode: +---------- +$ python main.py user_input.txt + + +Testing Individual Agents: +--------------------------- +# Test profiling +python -m agents.profiling_agent + +# Test scheme agent +python -m agents.scheme_agent + +# Test exam agent +python -m agents.exam_agent + + +Building Vectorstores: +----------------------- +python setup.py --build-vectorstores + +Or in Python: +from rag.scheme_vectorstore import build_scheme_vectorstore +from rag.exam_vectorstore import build_exam_vectorstore + +build_scheme_vectorstore() +build_exam_vectorstore() + + +OUTPUT FORMAT +============= + +Generated file: outputs/results_20260302_143022.json + +{ + "user_profile": { + "age": 25, + "gender": "Male", + "state": "Maharashtra", + "income": "300000", + "caste": "OBC", + "education": "Bachelor's in Engineering", + "employment_status": "Unemployed", + "interests": "Technical, Banking" + }, + "scheme_recommendations": "...", + "exam_recommendations": "...", + "missed_benefits_analysis": "...", + "errors": [] +} + + +SYSTEM REQUIREMENTS +=================== + +✅ Python 3.8 or higher +✅ 4GB RAM minimum (8GB recommended) +✅ 2GB storage for dependencies +✅ Internet connection (for APIs) +✅ CPU only (no GPU needed) + + +API KEYS REQUIRED +================= + +🔑 GROQ_API_KEY + Get from: https://console.groq.com/ + Purpose: LLM inference + Cost: Free tier available + +🔑 TAVILY_API_KEY + Get from: https://tavily.com/ + Purpose: Web search + Cost: Free tier available + +🔑 HF_TOKEN (Optional) + Get from: https://huggingface.co/settings/tokens + Purpose: Model downloads + Cost: Free + + +SUPPORT & DOCUMENTATION +======================== + +📖 Full Usage Guide: USAGE_GUIDE.md +🏗️ Architecture Details: ARCHITECTURE.txt +❓ Quick Start: README.md +🐛 Troubleshooting: See USAGE_GUIDE.md + +For issues: +1. Check setup: python setup.py --check +2. Verify .env file has correct API keys +3. Ensure PDFs are in data/ directories +4. 
Rebuild vectorstores if needed + + +PROJECT STATUS +============== + +✅ Core System: Complete +✅ All 7 Agents: Implemented +✅ RAG System: Functional +✅ Web Search: Integrated +✅ Workflow: Orchestrated +✅ I/O Handlers: Created +✅ Documentation: Comprehensive + +Ready for deployment and testing! + + +NEXT STEPS +========== + +1. Add your API keys to .env +2. Add government scheme and exam PDFs +3. Run setup wizard +4. Test the system +5. Customize prompts as needed +6. Add more PDF data over time +7. Monitor and improve + + +Happy Analyzing! 🎉 diff --git a/README.md b/README.md index 9cf5a9b73a2b7c66de1d1dc99146a280a182126d..1801ca7b20a59af8314c7f97eca43c264c24ceb3 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,238 @@ ---- -title: Jansahayak -emoji: 🐨 -colorFrom: green -colorTo: indigo -sdk: docker -pinned: false -license: mit ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- +title: JanSahayak +emoji: 🙏 +colorFrom: blue +colorTo: green +sdk: docker +pinned: false +--- + +# 🙏 JanSahayak - AI-Powered Government Schemes & Exams Assistant + +> Your personal AI assistant for discovering government schemes and competitive exam opportunities in India + +[![Hugging Face Spaces](https://img.shields.io/badge/🤗-Hugging%20Face-yellow)](https://huggingface.co/spaces) +[![Flask](https://img.shields.io/badge/Flask-2.3+-green)](https://flask.palletsprojects.com/) +[![LangChain](https://img.shields.io/badge/LangChain-Latest-blue)](https://www.langchain.com/) + +--- + +## 🌟 Features + +### 🤖 Multi-Agent AI System +- **Profiling Agent**: Extracts structured user information +- **Scheme Agent**: Recommends relevant government schemes +- **Exam Agent**: Suggests competitive exams based on qualifications +- **RAG Agent**: Retrieves information from curated document database + +### 💡 Intelligent Capabilities +- ✅ Natural language understanding of user profiles +- ✅ Smart recommendations based on eligibility criteria +- ✅ RAG 
(Retrieval-Augmented Generation) with FAISS vectorstore +- ✅ Real-time web search via Tavily API +- ✅ PDF generation for saving recommendations +- ✅ Beautiful web interface with modern UI + +--- + +## 🚀 Deploy to Hugging Face Spaces (Recommended) + +### Why Hugging Face Spaces? +- ✅ **16GB RAM for FREE** (perfect for RAG apps!) +- ✅ Built for ML/AI applications +- ✅ Git-based deployment +- ✅ Public URL instantly +- ✅ Persistent storage + +### Quick Deploy Steps: + +**Method 1: Using HF CLI (Easiest)** + +```bash +# Install HF CLI +pip install huggingface_hub[cli] + +# Login +huggingface-cli login + +# Create Space and push +huggingface-cli repo create jansahayak --type space --space_sdk gradio +git remote add hf https://huggingface.co/spaces/YOUR_USERNAME/jansahayak +git push hf main +``` + +**Method 2: Manual Setup** + +1. **Create Space** on [huggingface.co/spaces](https://huggingface.co/spaces) + - Click "Create new Space" + - Name: `jansahayak` + - SDK: Select "Gradio" (works with Flask) + - Hardware: CPU basic (Free - 16GB RAM!) + - License: MIT + +2. **Clone YOUR Space repo** (not GitHub!) + ```bash + git clone https://huggingface.co/spaces/YOUR_USERNAME/jansahayak + cd jansahayak + ``` + +3. **Copy your project files** + ```bash + # Copy all files from your JanSahayak folder to the cloned space folder + cp -r /path/to/JanSahayak/* . + ``` + +4. **Add Environment Variables** (Space Settings → Variables and secrets) + ``` + GROQ_API_KEY=your_groq_key + TAVILY_API_KEY=your_tavily_key + HF_TOKEN=your_hf_token (optional) + SKIP_VECTORSTORES=false + ``` + +5. **Push to Space** + ```bash + git add . 
+ git commit -m "Initial commit" + git push + ``` + +Your app will be live at: `https://huggingface.co/spaces/YOUR_USERNAME/jansahayak` + +### Important Notes: +- HF Spaces uses its own Git repo (not GitHub directly) +- App runs on port 7860 by default (Flask uses 5000, update if needed) +- First deployment may take 5-10 minutes to install dependencies +- Check Space logs if deployment fails + +--- + +## 🛠️ Local Development + +```bash +# Clone and setup +git clone https://github.com/YOUR_USERNAME/JanSahayak.git +cd JanSahayak + +# Create virtual environment +python -m venv .venv +source .venv/bin/activate # Linux/Mac +.venv\Scripts\activate # Windows + +# Install dependencies +pip install -r requirements.txt + +# Configure API keys +cp .env.example .env +# Edit .env with your keys + +# Build vectorstores (optional - if you have PDFs) +python init_embeddings.py + +# Run app +python app.py +# or use launcher scripts: start_web.bat (Windows) / ./start_web.sh (Linux/Mac) +``` + +Visit `http://localhost:5000` + +--- + +## 🔑 Get API Keys + +| Service | URL | Free Tier | Used For | +|---------|-----|-----------|----------| +| **Groq** | [console.groq.com](https://console.groq.com) | ✅ Yes | LLM Inference | +| **Tavily** | [tavily.com](https://tavily.com) | 1000 searches/mo | Web Search | +| **HuggingFace** | [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens) | ✅ Yes | Model Downloads | + +--- + +## 💾 Adding Custom Documents + +### Government Schemes PDFs +1. Place PDFs in `data/schemes_pdfs/` +2. Run `python init_embeddings.py` +3. Restart app + +### Exam Information PDFs +1. Place PDFs in `data/exams_pdfs/` +2. Run `python init_embeddings.py` +3. Restart app + +Automatically indexed and searchable via RAG! 
+ +--- + +## 🧪 Technology Stack + +- **Backend**: Flask +- **AI**: LangChain + LangGraph +- **LLM**: Groq (Llama 3.3 70B) +- **Embeddings**: sentence-transformers/all-MiniLM-L6-v2 +- **Vector DB**: FAISS (local) +- **Search**: Tavily API +- **Frontend**: HTML5 + CSS3 + JavaScript + +--- + +## 📁 Project Structure + +``` +JanSahayak/ +├── app.py # Flask web app +├── main.py # CLI interface +├── agents/ # AI agents +│ ├── profiling_agent.py +│ ├── scheme_agent.py +│ ├── exam_agent.py +│ └── rag_agent.py +├── rag/ # RAG components +│ ├── embeddings.py +│ ├── scheme_vectorstore.py +│ └── exam_vectorstore.py +├── data/ # Documents +│ ├── schemes_pdfs/ +│ └── exams_pdfs/ +├── templates/ # HTML templates +└── static/ # CSS/JS +``` + +--- + +## 🐛 Troubleshooting + +**Memory issues on local machine?** +```env +# Set in .env +SKIP_VECTORSTORES=true +``` +Uses web search only (no embeddings needed) + +**Vectorstore errors?** +```bash +rm -rf rag/scheme_index rag/exam_index +python init_embeddings.py +``` + +--- + +## 🤝 Contributing + +Contributions welcome! 
Fork → Create branch → Submit PR + +--- + +## 📜 License + +MIT License + +--- + +## 🙏 Acknowledgments + +Built with [LangChain](https://www.langchain.com/), [Groq](https://groq.com/), [Tavily](https://tavily.com/), and ❤️ + +--- + +Made for the people of India 🇮🇳 diff --git a/agent_io/__init__.py b/agent_io/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ecb33471643095b9b8732834ef1d59fd70fc2df --- /dev/null +++ b/agent_io/__init__.py @@ -0,0 +1,3 @@ +""" +Agent I/O Module Init +""" diff --git a/agent_io/benefit_io.py b/agent_io/benefit_io.py new file mode 100644 index 0000000000000000000000000000000000000000..9044df428698666a423a0079edff57a32a23a5a7 --- /dev/null +++ b/agent_io/benefit_io.py @@ -0,0 +1,117 @@ +""" +Benefit Agent I/O Handler +Manages input/output for missed benefits calculator agent +""" + +import json +import os +from datetime import datetime + + +class BenefitIO: + """Handles input/output operations for benefit calculator agent""" + + def __init__(self, input_file: str = "agent_io/benefit_input.json", + output_file: str = "agent_io/benefit_output.json"): + self.input_file = input_file + self.output_file = output_file + self._ensure_directory() + + def _ensure_directory(self): + """Create agent_io directory if it doesn't exist""" + os.makedirs(os.path.dirname(self.input_file), exist_ok=True) + + def read_input(self) -> dict: + """ + Read benefit calculator input from file + + Returns: + Input configuration dictionary + """ + try: + if os.path.exists(self.input_file): + with open(self.input_file, 'r', encoding='utf-8') as f: + return json.load(f) + else: + return {"error": "Input file not found"} + except Exception as e: + return {"error": str(e)} + + def write_input(self, profile_data: dict, scheme_recommendations: str, years: int = 5): + """ + Write input for benefit calculator + + Args: + profile_data: User profile dictionary + scheme_recommendations: Eligible schemes text + years: Number of years to calculate 
(default: 5) + """ + input_data = { + "timestamp": datetime.now().isoformat(), + "profile": profile_data, + "scheme_recommendations": scheme_recommendations, + "calculation_years": years, + "agent": "benefit_calculator" + } + + with open(self.input_file, 'w', encoding='utf-8') as f: + json.dump(input_data, f, indent=2, ensure_ascii=False) + + def write_output(self, calculation: dict, metadata: dict = None): + """ + Write benefit calculation to output file + + Args: + calculation: Missed benefits calculation + metadata: Optional metadata about calculation + """ + output_data = { + "timestamp": datetime.now().isoformat(), + "calculation": calculation, + "metadata": metadata or {}, + "agent": "benefit_calculator" + } + + with open(self.output_file, 'w', encoding='utf-8') as f: + json.dump(output_data, f, indent=2, ensure_ascii=False) + + def read_output(self) -> dict: + """ + Read previous benefit calculations + + Returns: + Previous calculations dictionary + """ + try: + if os.path.exists(self.output_file): + with open(self.output_file, 'r', encoding='utf-8') as f: + return json.load(f) + else: + return {"error": "Output file not found"} + except Exception as e: + return {"error": str(e)} + + +if __name__ == "__main__": + # Test BenefitIO + io = BenefitIO() + + # Sample input + profile = { + "age": 25, + "income": "300000" + } + + schemes = "PM Kisan: ₹6000/year" + + io.write_input(profile, schemes, years=5) + print("Input written successfully") + + # Sample output + calculation = { + "total_missed": "₹30,000", + "breakdown": {"2022": "₹6000", "2023": "₹6000"} + } + + io.write_output(calculation) + print("Output written successfully") diff --git a/agent_io/exam_io.py b/agent_io/exam_io.py new file mode 100644 index 0000000000000000000000000000000000000000..7783b1886c201fe39e31343527450eebd090d380 --- /dev/null +++ b/agent_io/exam_io.py @@ -0,0 +1,115 @@ +""" +Exam Agent I/O Handler +Manages input/output for exam recommendation agent +""" + +import json +import os 
class ExamIO:
    """Handles input/output operations for the exam recommendation agent.

    Inputs and outputs are persisted as human-readable JSON files so that
    agent runs can be inspected, replayed, and debugged after the fact.
    Read methods never raise: failures are reported as {"error": ...} dicts.
    """

    def __init__(self, input_file: str = "agent_io/exam_input.json",
                 output_file: str = "agent_io/exam_output.json"):
        self.input_file = input_file
        self.output_file = output_file
        self._ensure_directory()

    def _ensure_directory(self):
        """Create the agent_io directory if it doesn't exist.

        Guard against bare filenames: os.path.dirname() returns "" when the
        path has no directory component, and os.makedirs("") raises
        FileNotFoundError. The original unconditional call crashed there.
        """
        directory = os.path.dirname(self.input_file)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def read_input(self) -> dict:
        """
        Read exam agent input from file

        Returns:
            Input configuration dictionary, or {"error": ...} when the file
            is missing or unreadable.
        """
        try:
            if os.path.exists(self.input_file):
                with open(self.input_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            return {"error": "Input file not found"}
        except Exception as e:
            # Corrupt JSON or I/O failure: surface the message, never raise.
            return {"error": str(e)}

    def write_input(self, profile_data: dict, preferences: dict = None):
        """
        Write input for exam agent

        Args:
            profile_data: Student profile dictionary
            preferences: Optional student preferences
        """
        input_data = {
            "timestamp": datetime.now().isoformat(),
            "profile": profile_data,
            "preferences": preferences or {},
            "agent": "exam_recommendation",
        }

        # ensure_ascii=False keeps Devanagari / rupee signs readable on disk.
        with open(self.input_file, 'w', encoding='utf-8') as f:
            json.dump(input_data, f, indent=2, ensure_ascii=False)

    def write_output(self, recommendations: dict, metadata: dict = None):
        """
        Write exam recommendations to output file

        Args:
            recommendations: Exam recommendations from agent
            metadata: Optional metadata about the recommendation process
        """
        output_data = {
            "timestamp": datetime.now().isoformat(),
            "recommendations": recommendations,
            "metadata": metadata or {},
            "agent": "exam_recommendation",
        }

        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

    def read_output(self) -> dict:
        """
        Read previous exam recommendations

        Returns:
            Previous recommendations dictionary, or {"error": ...} when the
            file is missing or unreadable.
        """
        try:
            if os.path.exists(self.output_file):
                with open(self.output_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            return {"error": "Output file not found"}
        except Exception as e:
            return {"error": str(e)}


if __name__ == "__main__":
    # Smoke test: round-trip a sample input and output through the handler.
    io = ExamIO()

    # Sample input
    profile = {
        "age": 25,
        "education": "Bachelor's in Engineering",
        "interests": "Technical jobs"
    }

    io.write_input(profile, {"exam_type": "government"})
    print("Input written successfully")

    # Sample output
    recommendations = {
        "exams": [
            {"name": "SSC CGL", "eligibility": "Graduate"}
        ]
    }

    io.write_output(recommendations, {"sources": 5})
    print("Output written successfully")
profiling + + Args: + user_input: Raw text input from user + documents: Optional list of uploaded documents + """ + input_data = { + "timestamp": datetime.now().isoformat(), + "user_input": user_input, + "documents": documents or [], + "agent": "user_profiling" + } + + with open(self.input_file, 'w', encoding='utf-8') as f: + json.dump(input_data, f, indent=2, ensure_ascii=False) + + def write_output(self, profile_data: dict, confidence: dict = None): + """ + Write extracted profile to output file + + Args: + profile_data: Structured profile data + confidence: Optional confidence scores for extracted fields + """ + output_data = { + "timestamp": datetime.now().isoformat(), + "profile": profile_data, + "confidence": confidence or {}, + "agent": "user_profiling" + } + + with open(self.output_file, 'w', encoding='utf-8') as f: + json.dump(output_data, f, indent=2, ensure_ascii=False) + + def read_output(self) -> dict: + """ + Read extracted profile + + Returns: + Structured profile dictionary + """ + try: + if os.path.exists(self.output_file): + with open(self.output_file, 'r', encoding='utf-8') as f: + return json.load(f) + else: + return {"error": "Output file not found"} + except Exception as e: + return {"error": str(e)} + + +if __name__ == "__main__": + # Test ProfilingIO + io = ProfilingIO() + + # Sample input + user_text = "I am 25 years old from Maharashtra, OBC category, income 3 lakh." 
class SchemeIO:
    """Handles input/output operations for the scheme recommendation agent.

    Mirrors ExamIO/ProfilingIO: inputs and outputs are persisted as
    human-readable JSON files for inspection and replay. Read methods never
    raise: failures are reported as {"error": ...} dicts.
    """

    def __init__(self, input_file: str = "agent_io/scheme_input.json",
                 output_file: str = "agent_io/scheme_output.json"):
        self.input_file = input_file
        self.output_file = output_file
        self._ensure_directory()

    def _ensure_directory(self):
        """Create the agent_io directory if it doesn't exist.

        Guard against bare filenames: os.path.dirname() returns "" when the
        path has no directory component, and os.makedirs("") raises
        FileNotFoundError. The original unconditional call crashed there.
        """
        directory = os.path.dirname(self.input_file)
        if directory:
            os.makedirs(directory, exist_ok=True)

    def read_input(self) -> dict:
        """
        Read scheme agent input from file

        Returns:
            Input configuration dictionary, or {"error": ...} when the file
            is missing or unreadable.
        """
        try:
            if os.path.exists(self.input_file):
                with open(self.input_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            return {"error": "Input file not found"}
        except Exception as e:
            return {"error": str(e)}

    def write_input(self, profile_data: dict, preferences: dict = None):
        """
        Write input for scheme agent

        Args:
            profile_data: User profile dictionary
            preferences: Optional user preferences
        """
        input_data = {
            "timestamp": datetime.now().isoformat(),
            "profile": profile_data,
            "preferences": preferences or {},
            "agent": "scheme_recommendation",
        }

        # ensure_ascii=False keeps rupee signs and Indic text readable on disk.
        with open(self.input_file, 'w', encoding='utf-8') as f:
            json.dump(input_data, f, indent=2, ensure_ascii=False)

    def write_output(self, recommendations: dict, metadata: dict = None):
        """
        Write scheme recommendations to output file

        Args:
            recommendations: Scheme recommendations from agent
            metadata: Optional metadata about the recommendation process
        """
        output_data = {
            "timestamp": datetime.now().isoformat(),
            "recommendations": recommendations,
            "metadata": metadata or {},
            "agent": "scheme_recommendation",
        }

        with open(self.output_file, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)

    def read_output(self) -> dict:
        """
        Read previous scheme recommendations

        Returns:
            Previous recommendations dictionary, or {"error": ...} when the
            file is missing or unreadable.
        """
        try:
            if os.path.exists(self.output_file):
                with open(self.output_file, 'r', encoding='utf-8') as f:
                    return json.load(f)
            return {"error": "Output file not found"}
        except Exception as e:
            return {"error": str(e)}


if __name__ == "__main__":
    # Smoke test: round-trip a sample input and output through the handler.
    io = SchemeIO()

    # Sample input
    profile = {
        "age": 25,
        "income": "300000",
        "state": "Maharashtra",
        "caste": "OBC"
    }

    io.write_input(profile, {"priority": "high_benefit"})
    print("Input written successfully")

    # Sample output
    recommendations = {
        "schemes": [
            {"name": "PM Kisan", "benefit": "₹6000/year"}
        ]
    }

    io.write_output(recommendations, {"sources": 5})
    print("Output written successfully")
def get_llm():
    """Initialize the Groq-hosted chat LLM used for benefit analysis.

    Raises:
        ValueError: if GROQ_API_KEY is not configured in the environment.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    # Low temperature (0.2): benefit estimates should be stable and
    # conservative rather than creative.
    return ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        temperature=0.2
    )


def calculate_missed_benefits(profile_data: dict, scheme_recommendations: str) -> dict:
    """
    Calculates potential benefits the user might have missed in the past

    Builds a structured analysis prompt from the profile and the scheme
    recommendations text, then asks the LLM for a year-wise, conservative
    estimate of missed benefits over the past 5 years.

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text

    Returns:
        Dictionary with missed benefits calculation; on failure, a dict
        containing "error" and a fallback "calculation" string.
    """
    try:
        llm = get_llm()

        profile_str = json.dumps(profile_data, indent=2)

        # The prompt encodes the full output contract (sections, headings);
        # downstream display code relies on this markdown structure.
        prompt = f"""
You are a financial analyst specializing in Indian government welfare schemes.

Based on the user's profile and recommended schemes, calculate how much money/benefits
they might have missed in the past 5 years by not applying to eligible schemes.

**USER PROFILE:**
{profile_str}

**RECOMMENDED SCHEMES:**
{scheme_recommendations}

**ANALYSIS REQUIREMENTS:**

1. **Identify Eligible Schemes:**
   - List schemes user was eligible for in past 5 years
   - Consider age, income, education criteria over time

2. **Calculate Monetary Benefits:**
   - One-time payments missed
   - Annual recurring benefits missed
   - Subsidies or discounts not availed
   - Total missed amount (conservative estimate)

3. **Non-Monetary Benefits:**
   - Training opportunities missed
   - Healthcare benefits not utilized
   - Educational scholarships lost
   - Employment opportunities missed

4. **Year-wise Breakdown:**
   - Provide year-wise missed benefit estimate
   - Account for scheme start dates
   - Consider eligibility changes over time

5. **Actionable Insights:**
   - Can any benefits be claimed retroactively?
   - Which schemes should be applied immediately?
   - Priority ranking for current applications

**OUTPUT FORMAT:**

### Total Missed Benefits (Past 5 Years)
- **Monetary Loss:** ₹[Amount]
- **Non-Monetary Loss:** [Description]

### Year-wise Breakdown
**2022:**
- Scheme Name: ₹[Amount] | [Benefit Description]

**2023:**
- Scheme Name: ₹[Amount] | [Benefit Description]

[Continue for all years]

### Retroactive Claims Possible
- List schemes that allow backdated applications
- Required documentation for backdated claims

### Immediate Action Items
1. [Highest priority scheme to apply now]
2. [Second priority scheme]
3. [Third priority scheme]

### Future Projections
If user applies now, estimated benefits over next 5 years: ₹[Amount]

---

**IMPORTANT NOTES:**
- Provide conservative estimates (lower bound)
- Mark assumptions clearly
- Only include verified government schemes
- Consider state-specific schemes based on user's state
- Factor in income bracket changes over time

Proceed with calculation:
"""

        messages = [
            SystemMessage(content="You are a financial analyst for government welfare schemes. Provide realistic, conservative estimates."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "calculation": response.content,
            # NOTE(review): key name says "profile_considered" but only the
            # age is stored — confirm whether more fields were intended.
            "profile_considered": profile_data.get('age', 'N/A'),
            "schemes_analyzed": "Available in recommendations"
        }

    except Exception as e:
        # Never raise to the caller: the workflow treats the error dict as a
        # soft failure and continues.
        return {
            "error": str(e),
            "calculation": "Unable to calculate missed benefits"
        }


def estimate_future_benefits(profile_data: dict, scheme_recommendations: str, years: int = 5) -> dict:
    """
    Estimates potential benefits over the next N years if user applies now

    Args:
        profile_data: User profile dictionary
        scheme_recommendations: Recommended schemes text
        years: Number of years to project (default: 5)

    Returns:
        Dictionary with future benefits projection; on failure, a dict
        containing "error" and a fallback "projection" string.
    """
    try:
        llm = get_llm()

        profile_str = json.dumps(profile_data, indent=2)

        prompt = f"""
Based on the user's current profile and eligible schemes, estimate the total benefits
they can receive over the next {years} years if they apply immediately.

**USER PROFILE:**
{profile_str}

**ELIGIBLE SCHEMES:**
{scheme_recommendations}

Provide:
1. Year-wise projected benefits
2. Total estimated benefits over {years} years
3. Required actions to maximize benefits
4. Key deadlines to watch

Return structured calculation with conservative estimates.
"""

        messages = [
            SystemMessage(content="You are a financial projection analyst for government schemes."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "projection": response.content,
            "years_projected": years,
            "profile_age": profile_data.get('age', 'N/A')
        }

    except Exception as e:
        return {
            "error": str(e),
            "projection": "Unable to estimate future benefits"
        }


if __name__ == "__main__":
    # Test the agent (requires GROQ_API_KEY; makes a live LLM call)
    test_profile = {
        "age": 25,
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "education": "Bachelor's in Engineering",
        "employment_status": "Unemployed"
    }

    test_schemes = """
    1. PM Kisan Samman Nidhi: ₹6000 per year
    2. Post Matric Scholarship (OBC): ₹5000-10000 per year
    3. Skill Development Scheme: Free training worth ₹20000
    """

    result = calculate_missed_benefits(test_profile, test_schemes)
    print(json.dumps(result, indent=2))
def process_image(file_path: str, language: str = 'eng+hin') -> dict:
    """
    Extracts text from image using OCR

    Args:
        file_path: Path to image file
        language: Tesseract language code (default: English + Hindi)

    Returns:
        Dictionary with extracted text and metadata; "success" is always
        present so callers can branch on result["success"] uniformly.
    """
    try:
        if not os.path.exists(file_path):
            # Fix: include "success": False here for consistency with every
            # other error path (callers check result.get("success")).
            return {"error": f"File not found: {file_path}", "text": "", "success": False}

        img = Image.open(file_path)
        text = pytesseract.image_to_string(img, lang=language)

        return {
            "file_path": file_path,
            "image_size": img.size,
            "text": text,
            "success": True
        }

    except Exception as e:
        return {
            "error": str(e),
            "file_path": file_path,
            "text": "",
            "success": False
        }


def process_resume(file_path: str) -> dict:
    """
    Processes resume (PDF or image) and extracts relevant information

    Dispatches to process_pdf or process_image based on the file extension,
    then annotates the result with basic resume heuristics.

    Args:
        file_path: Path to resume file

    Returns:
        Extracted resume information; "success" is always present.
    """
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext == '.pdf':
        result = process_pdf(file_path)
    elif file_ext in ['.jpg', '.jpeg', '.png', '.tiff', '.bmp']:
        result = process_image(file_path)
    else:
        return {
            "error": f"Unsupported file format: {file_ext}",
            "text": "",
            "success": False
        }

    # Normalize: guarantee the flag exists even if a delegate omitted it.
    result.setdefault("success", False)

    if result.get("success"):
        # Basic resume parsing (can be enhanced)
        text = result["text"]
        result["document_type"] = "resume"
        result["contains_email"] = "@" in text
        # NOTE(review): any digit triggers this heuristic, not just phone
        # numbers — confirm whether a stricter pattern is wanted.
        result["contains_phone"] = any(char.isdigit() for char in text)

    return result
def get_llm():
    """Initialize the Groq-hosted chat LLM used for exam recommendations.

    Raises:
        ValueError: if GROQ_API_KEY is not configured in the environment.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    return ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        temperature=0.3
    )


def run_exam_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends competitive exams based on student profile

    Retrieval strategy: query the pre-loaded FAISS vectorstore when one is
    supplied; optionally augment with a live Tavily web search; feed the
    combined context to the LLM via EXAM_PROMPT.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        Exam recommendations dictionary with keys "recommendations",
        "sources_used", "web_search_used"; on failure, a dict with "error".
    """
    try:
        # Use provided vectorstore or try to load it
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                Student Profile:
                Education: {profile_data.get('education', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Interests: {profile_data.get('interests', 'N/A')}
                Skills: {profile_data.get('skills', 'N/A')}
                Occupation: {profile_data.get('occupation', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} exam documents from vectorstore")
            except Exception as e:
                # Vectorstore failure is non-fatal: fall back to web context.
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local exam database available. Using live web search."

        # Create profile string
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                education = profile_data.get('education', 'graduate')
                interests = profile_data.get('interests', 'government jobs')
                web_query = f"competitive exams India {education} {interests} eligibility 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                # Web search failure is also non-fatal; the note is appended
                # to the context so the LLM knows live data is missing.
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve exam information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = EXAM_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert competitive exam advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        # Catch-all so the workflow never crashes on an agent failure.
        return {
            "error": str(e),
            "recommendations": []
        }


if __name__ == "__main__":
    # Test the agent (LLM call requires GROQ_API_KEY)
    test_profile = {
        "education": "Bachelor's in Engineering",
        "age": 25,
        "interests": "Technical jobs, government sector",
        "skills": "Programming, problem solving",
        "occupation": "Student"
    }

    result = run_exam_agent(test_profile, use_web_search=False)
    print(json.dumps(result, indent=2))
def extract_json_from_text(text: str) -> dict:
    """Extract a JSON object from text that may contain markdown or noise.

    Tries, in order: direct parse, fenced ```json``` code blocks, the
    first-'{' to last-'}' span, then a regex scan for balanced-looking
    objects. Returns None when nothing parses.
    """
    import re

    # Try direct JSON parse first
    try:
        return json.loads(text.strip())
    except json.JSONDecodeError:
        pass

    # Try to extract JSON from markdown code blocks
    json_pattern = r'```(?:json)?\s*(\{.*?\})\s*```'
    matches = re.findall(json_pattern, text, re.DOTALL)
    if matches:
        try:
            return json.loads(matches[0])
        except json.JSONDecodeError:
            pass

    # Try the widest brace-delimited span (first '{' to last '}')
    start_idx = text.find('{')
    end_idx = text.rfind('}')
    if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
        try:
            return json.loads(text[start_idx:end_idx + 1])
        except json.JSONDecodeError:
            pass

    # Fallback: scan for any JSON-like structure (handles one nesting level)
    json_pattern = r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}'
    for match in re.findall(json_pattern, text, re.DOTALL):
        try:
            return json.loads(match)
        except json.JSONDecodeError:
            continue

    return None


def run_profiling_agent(user_input: str) -> dict:
    """
    Extracts structured profile information from user input

    Args:
        user_input: Raw user input text

    Returns:
        Structured profile dictionary with normalized snake_case keys; on
        parse failure a fallback dict carrying the raw input and response.
    """
    try:
        llm = get_llm()

        prompt = PROFILING_PROMPT.format(user_input=user_input)

        messages = [
            SystemMessage(content="You are an expert user profiling agent. Return ONLY a valid JSON object, nothing else."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        print(f"\n🤖 LLM Response (first 200 chars): {response.content[:200]}...")

        # Extract JSON from response
        profile_data = extract_json_from_text(response.content)

        if profile_data:
            # Normalize keys to lowercase with underscores so downstream
            # agents can rely on a stable key format.
            normalized_profile = {
                key.lower().replace(' ', '_').replace('-', '_'): value
                for key, value in profile_data.items()
            }
            print(f"✅ Profile extracted: {list(normalized_profile.keys())}")
            return normalized_profile

        # Fallback: Create basic profile from user input
        print("⚠️ Could not parse JSON, creating basic profile")
        return {
            "user_input": user_input,
            "raw_profile": response.content,
            "note": "Profile extraction incomplete. Using raw input."
        }

    except Exception as e:
        print(f"❌ Profiling error: {str(e)}")
        return {
            "error": str(e),
            "user_input": user_input
        }


def validate_profile(profile_data: dict) -> bool:
    """
    Validates that profile has minimum required information

    A field counts as missing when it is absent, None, an empty string, or
    the "Not Provided" sentinel emitted by the extraction prompt. (The
    original check let None/"" pass as valid.)

    Args:
        profile_data: Profile dictionary

    Returns:
        True if valid, False otherwise
    """
    required_fields = ['age', 'state', 'education']

    for field in required_fields:
        value = profile_data.get(field)
        if value is None or value == "" or value == "Not Provided":
            return False

    return True


if __name__ == "__main__":
    # Test the agent (LLM call requires GROQ_API_KEY)
    test_input = """
    I am a 25-year-old male from Maharashtra. I completed my Bachelor's in Engineering.
    My family income is around 3 lakh per year. I belong to the OBC category.
    I am currently unemployed and looking for government job opportunities.
    """

    result = run_profiling_agent(test_input)
    print(json.dumps(result, indent=2))
def hybrid_rag_search(query: str, k: int = 3) -> dict:
    """
    Searches both scheme and exam databases

    Runs the same query against each vectorstore-backed agent and bundles
    the two result sets together.

    Args:
        query: Search query
        k: Number of documents per database

    Returns:
        Combined results from both databases
    """
    per_database = {
        name: run_rag_agent(query, database=name, k=k)
        for name in ("schemes", "exams")
    }

    return {
        "query": query,
        "scheme_results": per_database["schemes"],
        "exam_results": per_database["exams"],
    }
def get_llm():
    """Initialize the Groq-hosted chat LLM used for scheme recommendations.

    Raises:
        ValueError: if GROQ_API_KEY is not configured in the environment.
    """
    if not GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY not found in environment variables")

    return ChatGroq(
        api_key=GROQ_API_KEY,
        model="llama-3.3-70b-versatile",
        temperature=0.3
    )


def run_scheme_agent(profile_data: dict, use_web_search: bool = True, vectorstore=None) -> dict:
    """
    Recommends government schemes based on user profile

    Retrieval strategy mirrors run_exam_agent: query the pre-loaded FAISS
    vectorstore when supplied, optionally augment with a live Tavily search,
    then feed the combined context to the LLM via SCHEME_PROMPT.

    Args:
        profile_data: Structured user profile
        use_web_search: Whether to use Tavily for live search
        vectorstore: Pre-loaded FAISS vectorstore (optional, avoids repeated loading)

    Returns:
        Scheme recommendations dictionary with keys "recommendations",
        "sources_used", "web_search_used"; on failure, a dict with "error".
    """
    try:
        # Use provided vectorstore or try to load it
        context = ""
        sources_used = 0

        if vectorstore is not None:
            print("✅ Using pre-loaded vectorstore")
            try:
                # Create search query from profile
                search_query = f"""
                User Profile:
                Income: {profile_data.get('income', 'N/A')}
                Caste: {profile_data.get('caste', 'N/A')}
                State: {profile_data.get('state', 'N/A')}
                Age: {profile_data.get('age', 'N/A')}
                Gender: {profile_data.get('gender', 'N/A')}
                Employment: {profile_data.get('employment_status', 'N/A')}
                """

                # RAG retrieval
                docs = vectorstore.similarity_search(search_query, k=5)
                context = "\n\n".join([f"Document {i+1}:\n{d.page_content}" for i, d in enumerate(docs)])
                sources_used = len(docs)
                print(f"✓ Retrieved {sources_used} scheme documents from vectorstore")
            except Exception as e:
                # Vectorstore failure is non-fatal: fall back to web context.
                print(f"⚠️ Error querying vectorstore: {str(e)}")
                context = "Vectorstore query failed. Using live web search."
        else:
            print("ℹ️ No vectorstore provided, using web search only")
            context = "No local scheme database available. Using live web search."

        # Create profile string
        profile_str = json.dumps(profile_data, indent=2)

        # Web search (fallback or enhancement)
        web_context = ""
        if use_web_search:
            try:
                state = profile_data.get('state', 'India')
                caste = profile_data.get('caste', '')
                # NOTE(review): 'income' is read but never used in web_query
                # below — confirm whether it was meant to be included.
                income = profile_data.get('income', '')
                web_query = f"government schemes India {state} {caste} eligibility benefits 2026"
                print(f"🔍 Searching web: {web_query}")
                web_results = government_focused_search(web_query)
                web_context = f"\n\nLive Web Search Results:\n{web_results}"
                print("✓ Web search completed")
            except Exception as e:
                web_context = f"\n\nWeb search unavailable: {str(e)}"
                print(f"⚠ Web search failed: {str(e)}")

        # Combine contexts
        full_context = context + web_context

        # If no context at all, return helpful message
        if not full_context.strip():
            return {
                "recommendations": "Unable to retrieve scheme information. Please ensure Tavily API key is configured or vectorstore is built.",
                "sources_used": 0,
                "web_search_used": use_web_search
            }

        # Generate recommendations
        llm = get_llm()

        prompt = SCHEME_PROMPT.format(
            context=full_context,
            profile=profile_str
        )

        messages = [
            SystemMessage(content="You are an expert government scheme advisor. Provide accurate, verified information only."),
            HumanMessage(content=prompt)
        ]

        response = llm.invoke(messages)

        return {
            "recommendations": response.content,
            "sources_used": sources_used,
            "web_search_used": use_web_search
        }

    except Exception as e:
        # Catch-all so the workflow never crashes on an agent failure.
        return {
            "error": str(e),
            "recommendations": []
        }


if __name__ == "__main__":
    # Test the agent (LLM call requires GROQ_API_KEY)
    test_profile = {
        "income": "300000",
        "caste": "OBC",
        "state": "Maharashtra",
        "age": 25,
        "gender": "Male",
        "employment_status": "Unemployed",
        "education": "Bachelor's in Engineering"
    }

    result = run_scheme_agent(test_profile, use_web_search=False)
    print(json.dumps(result, indent=2))
def initialize_vectorstores():
    """Lazily load the FAISS vectorstores on first use.

    Deferring the load keeps server startup fast (port binds before any
    embedding model is touched). Mutates the module-level globals
    SCHEME_VECTORSTORE, EXAM_VECTORSTORE and VECTORSTORES_INITIALIZED;
    calling it again after the first pass is a no-op.
    """
    global SCHEME_VECTORSTORE, EXAM_VECTORSTORE, VECTORSTORES_INITIALIZED

    # Guard: only ever initialize once per process.
    if VECTORSTORES_INITIALIZED:
        return

    rule = "=" * 70

    # Memory-constrained deployments skip embeddings entirely and rely on
    # live web search instead (controlled by the SKIP_VECTORSTORES env flag).
    if SKIP_VECTORSTORES:
        for banner_line in (
            "\n" + rule,
            "⚡ LIGHTWEIGHT MODE: Skipping vectorstore loading",
            rule,
            "✅ Using Tavily web search only (no embeddings model)",
            "✅ Low memory usage (<200MB)",
            "✅ Real-time, up-to-date information",
            rule + "\n",
        ):
            print(banner_line)
        SCHEME_VECTORSTORE = None
        EXAM_VECTORSTORE = None
        VECTORSTORES_INITIALIZED = True
        return

    print("\n" + rule)
    print("📚 Initializing Vector Stores (lazy loading)")
    print(rule)

    # Scheme index: on any failure fall back to web search for schemes.
    try:
        from rag.scheme_vectorstore import load_scheme_vectorstore
        SCHEME_VECTORSTORE = load_scheme_vectorstore()
        print("✅ Scheme vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Scheme vectorstore not available: {str(e)}")
        print(" Will use web search only for schemes")
        SCHEME_VECTORSTORE = None

    # Exam index: same fallback strategy as the scheme index.
    try:
        from rag.exam_vectorstore import load_exam_vectorstore
        EXAM_VECTORSTORE = load_exam_vectorstore()
        print("✅ Exam vectorstore loaded successfully")
    except Exception as e:
        print(f"⚠️ Exam vectorstore not available: {str(e)}")
        print(" Will use web search only for exams")
        EXAM_VECTORSTORE = None

    VECTORSTORES_INITIALIZED = True
    print(rule + "\n")

\1

', text) + text = re.sub(r'##\s+(.+?)(?=\n|$)', r'

\1

', text) + + # Convert bold (**text**) + text = re.sub(r'\*\*(.+?)\*\*', r'\1', text) + + # Convert italic (*text*) + text = re.sub(r'\*(.+?)\*', r'\1', text) + + # Convert bullet points (- item or * item) + text = re.sub(r'^[\-\*]\s+(.+)$', r'
  • \1
  • ', text, flags=re.MULTILINE) + text = re.sub(r'(
  • .*?
  • )', r'', text, flags=re.DOTALL) + text = text.replace('\n