Spaces:
Sleeping
Sleeping
Upload 33 files
Browse files- README.md +526 -12
- app.py +553 -0
- examples/business_data.csv +13 -0
- examples/sample_documents.txt +31 -0
- examples/sample_email_complaint.txt +14 -0
- examples/sample_email_inquiry.txt +21 -0
- examples/sample_email_urgent.txt +23 -0
- examples/sample_report.txt +42 -0
- requirements.txt +37 -0
- tools/__init__.py +3 -0
- tools/__pycache__/__init__.cpython-312.pyc +0 -0
- tools/__pycache__/data_visualizer.cpython-312.pyc +0 -0
- tools/__pycache__/email_intent_classifier.cpython-312.pyc +0 -0
- tools/__pycache__/file_converter.cpython-312.pyc +0 -0
- tools/__pycache__/kpi_generator.cpython-312.pyc +0 -0
- tools/__pycache__/pdf_reader.cpython-312.pyc +0 -0
- tools/__pycache__/rag_search.cpython-312.pyc +0 -0
- tools/__pycache__/text_extractor.cpython-312.pyc +0 -0
- tools/__pycache__/web_fetcher.cpython-312.pyc +0 -0
- tools/data_visualizer.py +231 -0
- tools/email_intent_classifier.py +234 -0
- tools/file_converter.py +200 -0
- tools/kpi_generator.py +292 -0
- tools/pdf_reader.py +93 -0
- tools/rag_search.py +153 -0
- tools/text_extractor.py +114 -0
- tools/web_fetcher.py +179 -0
- utils/__init__.py +3 -0
- utils/__pycache__/__init__.cpython-312.pyc +0 -0
- utils/__pycache__/helpers.cpython-312.pyc +0 -0
- utils/__pycache__/rag_utils.cpython-312.pyc +0 -0
- utils/helpers.py +180 -0
- utils/rag_utils.py +141 -0
README.md
CHANGED
|
@@ -1,12 +1,526 @@
|
|
| 1 |
-
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
---
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- building-mcp-track-enterprise
|
| 4 |
+
- mcp
|
| 5 |
+
- anthropic
|
| 6 |
+
- enterprise-automation
|
| 7 |
+
- gradio-hackathon
|
| 8 |
+
- ai-agents
|
| 9 |
+
- mcp-server
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
# 🚀 MissionControlMCP
|
| 13 |
+
|
| 14 |
+
**Enterprise Automation MCP Server for Document Analysis, Data Processing & Business Intelligence**
|
| 15 |
+
|
| 16 |
+
A fully functional Model Context Protocol (MCP) server providing 8 powerful enterprise automation tools for document processing, web scraping, semantic search, data visualization, and business analytics.
|
| 17 |
+
|
| 18 |
+
Built for the **MCP 1st Birthday Hackathon – Winter 2025** (Track 1: Building MCP - Enterprise Category).
|
| 19 |
+
|
| 20 |
+
🏆 **Hackathon Submission** | 🔧 **Track 1: Building MCP** | 🏢 **Enterprise Category**
|
| 21 |
+
|
| 22 |
+
---
|
| 23 |
+
|
| 24 |
+
## 📋 Table of Contents
|
| 25 |
+
|
| 26 |
+
- [Overview](#overview)
|
| 27 |
+
- [Features](#features)
|
| 28 |
+
- [Tools](#tools)
|
| 29 |
+
- [Installation](#installation)
|
| 30 |
+
- [Usage](#usage)
|
| 31 |
+
- [Tool Examples](#tool-examples)
|
| 32 |
+
- [Claude Desktop Integration](#claude-desktop-integration)
|
| 33 |
+
- [Development](#development)
|
| 34 |
+
- [Testing](#testing)
|
| 35 |
+
- [Architecture](#architecture)
|
| 36 |
+
- [Hackathon Submission](#hackathon-submission)
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## 🎯 Overview
|
| 41 |
+
|
| 42 |
+
**MissionControlMCP** is an enterprise-grade MCP server that provides intelligent automation capabilities through 8 specialized tools. It enables AI assistants like Claude to perform complex document processing, data analysis, web research, and business intelligence tasks.
|
| 43 |
+
|
| 44 |
+
### Key Capabilities
|
| 45 |
+
|
| 46 |
+
- **📄 Document Processing**: Extract text from PDFs, process and summarize content
|
| 47 |
+
- **🌐 Web Intelligence**: Fetch and parse web content with clean text extraction
|
| 48 |
+
- **🔍 Semantic Search**: RAG-based vector search using FAISS and sentence transformers
|
| 49 |
+
- **📊 Data Visualization**: Generate charts from CSV/JSON data
|
| 50 |
+
- **🔄 File Conversion**: Convert between PDF, TXT, and CSV formats
|
| 51 |
+
- **📧 Email Classification**: Classify email intents using NLP
|
| 52 |
+
- **📈 KPI Generation**: Calculate business metrics and generate insights
|
| 53 |
+
|
| 54 |
+
---
|
| 55 |
+
|
| 56 |
+
## 🧪 Quick Test
|
| 57 |
+
|
| 58 |
+
```bash
|
| 59 |
+
# Test all tools with sample files
|
| 60 |
+
python test_samples.py
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
**See [TESTING.md](TESTING.md) for complete testing guide with examples!**
|
| 64 |
+
|
| 65 |
+
---
|
| 66 |
+
|
| 67 |
+
## ✨ Features
|
| 68 |
+
|
| 69 |
+
- ✅ **8 Production-Ready Tools** for enterprise automation
|
| 70 |
+
- ✅ **MCP Compliant** - Works with Claude Desktop and any MCP client
|
| 71 |
+
- ✅ **Type-Safe** - Built with Python 3.11+ and type hints
|
| 72 |
+
- ✅ **Modular Architecture** - Clean separation of concerns
|
| 73 |
+
- ✅ **Comprehensive Testing** - Test suite included
|
| 74 |
+
- ✅ **Well Documented** - Clear schemas and examples
|
| 75 |
+
- ✅ **Vector Search** - RAG implementation with FAISS
|
| 76 |
+
- ✅ **Data Visualization** - Base64 encoded chart generation
|
| 77 |
+
- ✅ **NLP Classification** - Rule-based intent detection
|
| 78 |
+
|
| 79 |
+
---
|
| 80 |
+
|
| 81 |
+
## 🛠️ Tools
|
| 82 |
+
|
| 83 |
+
### 1. **pdf_reader**
|
| 84 |
+
Extract text and metadata from PDF files.
|
| 85 |
+
|
| 86 |
+
**Input:**
|
| 87 |
+
- `file_path`: Path to PDF file
|
| 88 |
+
|
| 89 |
+
**Output:**
|
| 90 |
+
- Extracted text from all pages
|
| 91 |
+
- Page count
|
| 92 |
+
- Document metadata (author, title, dates)
|
| 93 |
+
|
| 94 |
+
---
|
| 95 |
+
|
| 96 |
+
### 2. **text_extractor**
|
| 97 |
+
Process and extract information from text.
|
| 98 |
+
|
| 99 |
+
**Input:**
|
| 100 |
+
- `text`: Raw text to process
|
| 101 |
+
- `operation`: 'clean', 'summarize', 'chunk', or 'keywords'
|
| 102 |
+
- `max_length`: Max length for summaries (default: 500)
|
| 103 |
+
|
| 104 |
+
**Output:**
|
| 105 |
+
- Processed text
|
| 106 |
+
- Word count
|
| 107 |
+
- Operation metadata
|
| 108 |
+
|
| 109 |
+
---
|
| 110 |
+
|
| 111 |
+
### 3. **web_fetcher**
|
| 112 |
+
Fetch and extract content from web URLs.
|
| 113 |
+
|
| 114 |
+
**Input:**
|
| 115 |
+
- `url`: URL to fetch
|
| 116 |
+
- `extract_text_only`: Extract text only (default: true)
|
| 117 |
+
|
| 118 |
+
**Output:**
|
| 119 |
+
- Clean text content or HTML
|
| 120 |
+
- HTTP status code
|
| 121 |
+
- Response metadata
|
| 122 |
+
|
| 123 |
+
---
|
| 124 |
+
|
| 125 |
+
### 4. **rag_search**
|
| 126 |
+
Semantic search using RAG (Retrieval Augmented Generation).
|
| 127 |
+
|
| 128 |
+
**Input:**
|
| 129 |
+
- `query`: Search query
|
| 130 |
+
- `documents`: List of documents to search
|
| 131 |
+
- `top_k`: Number of results (default: 3)
|
| 132 |
+
|
| 133 |
+
**Output:**
|
| 134 |
+
- Ranked search results with similarity scores
|
| 135 |
+
- Document snippets
|
| 136 |
+
- Relevance rankings
|
| 137 |
+
|
| 138 |
+
---
|
| 139 |
+
|
| 140 |
+
### 5. **data_visualizer**
|
| 141 |
+
Create data visualizations and charts.
|
| 142 |
+
|
| 143 |
+
**Input:**
|
| 144 |
+
- `data`: JSON or CSV string data
|
| 145 |
+
- `chart_type`: 'bar', 'line', 'pie', or 'scatter'
|
| 146 |
+
- `x_column`, `y_column`: Column names
|
| 147 |
+
- `title`: Chart title
|
| 148 |
+
|
| 149 |
+
**Output:**
|
| 150 |
+
- Base64 encoded PNG image
|
| 151 |
+
- Chart dimensions
|
| 152 |
+
- Column information
|
| 153 |
+
|
| 154 |
+
---
|
| 155 |
+
|
| 156 |
+
### 6. **file_converter**
|
| 157 |
+
Convert files between formats.
|
| 158 |
+
|
| 159 |
+
**Input:**
|
| 160 |
+
- `input_path`: Path to input file
|
| 161 |
+
- `output_format`: 'txt', 'csv', or 'pdf'
|
| 162 |
+
- `output_path`: Optional output path
|
| 163 |
+
|
| 164 |
+
**Output:**
|
| 165 |
+
- Output file path
|
| 166 |
+
- Conversion status
|
| 167 |
+
- File size
|
| 168 |
+
|
| 169 |
+
**Supported Conversions:**
|
| 170 |
+
- PDF → TXT
|
| 171 |
+
- TXT → CSV
|
| 172 |
+
- CSV → TXT
|
| 173 |
+
|
| 174 |
+
---
|
| 175 |
+
|
| 176 |
+
### 7. **email_intent_classifier**
|
| 177 |
+
Classify email intent using NLP.
|
| 178 |
+
|
| 179 |
+
**Input:**
|
| 180 |
+
- `email_text`: Email content to classify
|
| 181 |
+
|
| 182 |
+
**Output:**
|
| 183 |
+
- Primary intent (inquiry, complaint, request, feedback, meeting, order, urgent, follow_up, thank_you, application)
|
| 184 |
+
- Confidence score
|
| 185 |
+
- Secondary intents
|
| 186 |
+
|
| 187 |
+
---
|
| 188 |
+
|
| 189 |
+
### 8. **kpi_generator**
|
| 190 |
+
Generate business KPIs and insights.
|
| 191 |
+
|
| 192 |
+
**Input:**
|
| 193 |
+
- `data`: JSON string with business data
|
| 194 |
+
- `metrics`: List of metrics - 'revenue', 'growth', 'efficiency', 'customer', 'operational'
|
| 195 |
+
|
| 196 |
+
**Output:**
|
| 197 |
+
- Calculated KPIs
|
| 198 |
+
- Executive summary
|
| 199 |
+
- Key trends and insights
|
| 200 |
+
|
| 201 |
+
---
|
| 202 |
+
|
| 203 |
+
## 📦 Installation
|
| 204 |
+
|
| 205 |
+
### Prerequisites
|
| 206 |
+
|
| 207 |
+
- Python 3.11 or higher
|
| 208 |
+
- pip or uv package manager
|
| 209 |
+
|
| 210 |
+
### Setup
|
| 211 |
+
|
| 212 |
+
1. **Clone or download the repository:**
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
cd mission_control_mcp
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
+
2. **Install dependencies:**
|
| 219 |
+
|
| 220 |
+
```bash
|
| 221 |
+
pip install -r requirements.txt
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
Or using `uv`:
|
| 225 |
+
|
| 226 |
+
```bash
|
| 227 |
+
uv pip install -r requirements.txt
|
| 228 |
+
```
|
| 229 |
+
|
| 230 |
+
### Dependencies
|
| 231 |
+
|
| 232 |
+
- `mcp` - Model Context Protocol SDK
|
| 233 |
+
- `pypdf2` - PDF processing
|
| 234 |
+
- `requests` + `beautifulsoup4` - Web scraping
|
| 235 |
+
- `pandas` + `numpy` - Data processing
|
| 236 |
+
- `faiss-cpu` + `sentence-transformers` - Vector search
|
| 237 |
+
- `matplotlib` + `seaborn` - Data visualization
|
| 238 |
+
- `scikit-learn` + `nltk` - NLP and ML
|
| 239 |
+
|
| 240 |
+
---
|
| 241 |
+
|
| 242 |
+
## 🚀 Usage
|
| 243 |
+
|
| 244 |
+
### Running the Server
|
| 245 |
+
|
| 246 |
+
#### For Development/Testing:
|
| 247 |
+
|
| 248 |
+
```bash
|
| 249 |
+
uvx mcp dev mission_control_mcp/mcp_server.py
|
| 250 |
+
```
|
| 251 |
+
|
| 252 |
+
Or with Python directly:
|
| 253 |
+
|
| 254 |
+
```bash
|
| 255 |
+
python mcp_server.py
|
| 256 |
+
```
|
| 257 |
+
|
| 258 |
+
#### For Production:
|
| 259 |
+
|
| 260 |
+
The server runs via stdio and is designed to be integrated with MCP clients like Claude Desktop.
|
| 261 |
+
|
| 262 |
+
---
|
| 263 |
+
|
| 264 |
+
## 💡 Tool Examples
|
| 265 |
+
|
| 266 |
+
### Example 1: Text Extraction & Summarization
|
| 267 |
+
|
| 268 |
+
```json
|
| 269 |
+
{
|
| 270 |
+
"tool": "text_extractor",
|
| 271 |
+
"arguments": {
|
| 272 |
+
"text": "Your long document text here...",
|
| 273 |
+
"operation": "summarize",
|
| 274 |
+
"max_length": 200
|
| 275 |
+
}
|
| 276 |
+
}
|
| 277 |
+
```
|
| 278 |
+
|
| 279 |
+
### Example 2: Web Content Fetching
|
| 280 |
+
|
| 281 |
+
```json
|
| 282 |
+
{
|
| 283 |
+
"tool": "web_fetcher",
|
| 284 |
+
"arguments": {
|
| 285 |
+
"url": "https://example.com/article",
|
| 286 |
+
"extract_text_only": true
|
| 287 |
+
}
|
| 288 |
+
}
|
| 289 |
+
```
|
| 290 |
+
|
| 291 |
+
### Example 3: Semantic Search
|
| 292 |
+
|
| 293 |
+
```json
|
| 294 |
+
{
|
| 295 |
+
"tool": "rag_search",
|
| 296 |
+
"arguments": {
|
| 297 |
+
"query": "machine learning algorithms",
|
| 298 |
+
"documents": [
|
| 299 |
+
"Document 1 about neural networks...",
|
| 300 |
+
"Document 2 about decision trees...",
|
| 301 |
+
"Document 3 about clustering..."
|
| 302 |
+
],
|
| 303 |
+
"top_k": 3
|
| 304 |
+
}
|
| 305 |
+
}
|
| 306 |
+
```
|
| 307 |
+
|
| 308 |
+
### Example 4: Data Visualization
|
| 309 |
+
|
| 310 |
+
```json
|
| 311 |
+
{
|
| 312 |
+
"tool": "data_visualizer",
|
| 313 |
+
"arguments": {
|
| 314 |
+
"data": "{\"month\": [\"Jan\", \"Feb\", \"Mar\"], \"sales\": [1000, 1500, 1200]}",
|
| 315 |
+
"chart_type": "bar",
|
| 316 |
+
"x_column": "month",
|
| 317 |
+
"y_column": "sales",
|
| 318 |
+
"title": "Q1 Sales Report"
|
| 319 |
+
}
|
| 320 |
+
}
|
| 321 |
+
```
|
| 322 |
+
|
| 323 |
+
### Example 5: Email Intent Classification
|
| 324 |
+
|
| 325 |
+
```json
|
| 326 |
+
{
|
| 327 |
+
"tool": "email_intent_classifier",
|
| 328 |
+
"arguments": {
|
| 329 |
+
"email_text": "Hi, I need help with my recent order. It hasn't arrived yet and I'm wondering about the tracking status."
|
| 330 |
+
}
|
| 331 |
+
}
|
| 332 |
+
```
|
| 333 |
+
|
| 334 |
+
### Example 6: KPI Generation
|
| 335 |
+
|
| 336 |
+
```json
|
| 337 |
+
{
|
| 338 |
+
"tool": "kpi_generator",
|
| 339 |
+
"arguments": {
|
| 340 |
+
"data": "{\"revenue\": 1000000, \"costs\": 600000, \"customers\": 500, \"current_revenue\": 1000000, \"previous_revenue\": 800000}",
|
| 341 |
+
"metrics": ["revenue", "growth", "efficiency"]
|
| 342 |
+
}
|
| 343 |
+
}
|
| 344 |
+
```
|
| 345 |
+
|
| 346 |
+
---
|
| 347 |
+
|
| 348 |
+
## 🖥️ Claude Desktop Integration
|
| 349 |
+
|
| 350 |
+
### Configuration
|
| 351 |
+
|
| 352 |
+
Add to your Claude Desktop config file (`claude_desktop_config.json`):
|
| 353 |
+
|
| 354 |
+
**Windows:** `%APPDATA%\Claude\claude_desktop_config.json`
|
| 355 |
+
**macOS:** `~/Library/Application Support/Claude/claude_desktop_config.json`
|
| 356 |
+
|
| 357 |
+
```json
|
| 358 |
+
{
|
| 359 |
+
"mcpServers": {
|
| 360 |
+
"mission-control": {
|
| 361 |
+
"command": "python",
|
| 362 |
+
"args": [
|
| 363 |
+
"C:/Users/YourUser/path/to/mission_control_mcp/mcp_server.py"
|
| 364 |
+
]
|
| 365 |
+
}
|
| 366 |
+
}
|
| 367 |
+
}
|
| 368 |
+
```
|
| 369 |
+
|
| 370 |
+
Or with `uvx`:
|
| 371 |
+
|
| 372 |
+
```json
|
| 373 |
+
{
|
| 374 |
+
"mcpServers": {
|
| 375 |
+
"mission-control": {
|
| 376 |
+
"command": "uvx",
|
| 377 |
+
"args": [
|
| 378 |
+
"mcp",
|
| 379 |
+
"run",
|
| 380 |
+
"C:/Users/YourUser/path/to/mission_control_mcp/mcp_server.py"
|
| 381 |
+
]
|
| 382 |
+
}
|
| 383 |
+
}
|
| 384 |
+
}
|
| 385 |
+
```
|
| 386 |
+
|
| 387 |
+
### Usage in Claude
|
| 388 |
+
|
| 389 |
+
After configuration, restart Claude Desktop. You can then ask Claude to:
|
| 390 |
+
|
| 391 |
+
- "Extract text from this PDF file"
|
| 392 |
+
- "Fetch content from this website and summarize it"
|
| 393 |
+
- "Search these documents for information about X"
|
| 394 |
+
- "Create a bar chart from this sales data"
|
| 395 |
+
- "Classify the intent of this email"
|
| 396 |
+
- "Generate KPIs from this business data"
|
| 397 |
+
|
| 398 |
+
---
|
| 399 |
+
|
| 400 |
+
## 🧪 Testing
|
| 401 |
+
|
| 402 |
+
Run the comprehensive test suite:
|
| 403 |
+
|
| 404 |
+
```bash
|
| 405 |
+
python test_server.py
|
| 406 |
+
```
|
| 407 |
+
|
| 408 |
+
The test suite includes:
|
| 409 |
+
- Text extraction and processing tests
|
| 410 |
+
- Web fetching tests
|
| 411 |
+
- RAG search demonstrations
|
| 412 |
+
- Data visualization generation
|
| 413 |
+
- Email classification examples
|
| 414 |
+
- KPI calculation tests
|
| 415 |
+
- Example JSON inputs for all tools
|
| 416 |
+
|
| 417 |
+
---
|
| 418 |
+
|
| 419 |
+
## 🏗️ Architecture
|
| 420 |
+
|
| 421 |
+
```
|
| 422 |
+
mission_control_mcp/
|
| 423 |
+
├── mcp_server.py # Main MCP server
|
| 424 |
+
├── requirements.txt # Dependencies
|
| 425 |
+
├── test_server.py # Test suite
|
| 426 |
+
├── README.md # Documentation
|
| 427 |
+
│
|
| 428 |
+
├── tools/ # Tool implementations
|
| 429 |
+
│ ├── pdf_reader.py
|
| 430 |
+
│ ├── text_extractor.py
|
| 431 |
+
│ ├── web_fetcher.py
|
| 432 |
+
│ ├── rag_search.py
|
| 433 |
+
│ ├── data_visualizer.py
|
| 434 |
+
│ ├── file_converter.py
|
| 435 |
+
│ ├── email_intent_classifier.py
|
| 436 |
+
│ └── kpi_generator.py
|
| 437 |
+
│
|
| 438 |
+
├── models/ # Data schemas
|
| 439 |
+
│ └── schemas.py
|
| 440 |
+
│
|
| 441 |
+
└── utils/ # Utilities
|
| 442 |
+
├── helpers.py # Helper functions
|
| 443 |
+
└── rag_utils.py # RAG/vector search utilities
|
| 444 |
+
```
|
| 445 |
+
|
| 446 |
+
### Design Principles
|
| 447 |
+
|
| 448 |
+
- **Modularity**: Each tool is independently implemented
|
| 449 |
+
- **Type Safety**: Pydantic schemas for validation
|
| 450 |
+
- **Error Handling**: Comprehensive error catching and logging
|
| 451 |
+
- **Clean Code**: Well-documented with docstrings
|
| 452 |
+
- **Testability**: Easy to test individual components
|
| 453 |
+
|
| 454 |
+
---
|
| 455 |
+
|
| 456 |
+
## 🎖️ Hackathon Submission
|
| 457 |
+
|
| 458 |
+
### Track 1: MCP Server
|
| 459 |
+
|
| 460 |
+
**Server Name:** MissionControlMCP
|
| 461 |
+
|
| 462 |
+
**Description:** Enterprise automation MCP server providing 8 specialized tools for document processing, web intelligence, semantic search, data visualization, and business analytics.
|
| 463 |
+
|
| 464 |
+
### Key Features for Judges
|
| 465 |
+
|
| 466 |
+
1. **Production-Ready**: All 8 tools are fully implemented and tested
|
| 467 |
+
2. **MCP Compliant**: Follows MCP specification precisely
|
| 468 |
+
3. **Real-World Value**: Solves actual enterprise automation needs
|
| 469 |
+
4. **Clean Architecture**: Modular, maintainable, well-documented code
|
| 470 |
+
5. **Advanced Features**: RAG search with FAISS, data visualization, NLP classification
|
| 471 |
+
6. **Comprehensive Testing**: Full test suite with examples
|
| 472 |
+
7. **Easy Integration**: Works seamlessly with Claude Desktop
|
| 473 |
+
|
| 474 |
+
### Technical Highlights
|
| 475 |
+
|
| 476 |
+
- **Vector Search**: FAISS-based semantic search with sentence transformers
|
| 477 |
+
- **NLP Classification**: Rule-based email intent classifier with confidence scoring
|
| 478 |
+
- **Data Visualization**: Dynamic chart generation with matplotlib
|
| 479 |
+
- **File Processing**: Multi-format support (PDF, TXT, CSV)
|
| 480 |
+
- **Web Intelligence**: Smart web scraping with clean text extraction
|
| 481 |
+
- **Business Intelligence**: KPI calculation with trend analysis
|
| 482 |
+
|
| 483 |
+
---
|
| 484 |
+
|
| 485 |
+
## 📚 Documentation & Examples
|
| 486 |
+
|
| 487 |
+
- **[EXAMPLES.md](EXAMPLES.md)** - Real-world use cases, workflows, and ROI examples
|
| 488 |
+
- **[HACKATHON_SUBMISSION.md](HACKATHON_SUBMISSION.md)** - Judge evaluation guide
|
| 489 |
+
- **[VIDEO_SCRIPT.md](VIDEO_SCRIPT.md)** - Demo video script and talking points
|
| 490 |
+
- **[examples/](examples/)** - Sample files for testing all tools:
|
| 491 |
+
- `sample_report.txt` - Business report for text extraction
|
| 492 |
+
- `business_data.csv` - Financial data for visualization & KPIs
|
| 493 |
+
- `sample_email_*.txt` - Email samples for intent classification
|
| 494 |
+
- `sample_documents.txt` - Documents for RAG search testing
|
| 495 |
+
|
| 496 |
+
---
|
| 497 |
+
|
| 498 |
+
## 📝 License
|
| 499 |
+
|
| 500 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 501 |
+
|
| 502 |
+
Created for the MCP 1st Birthday Hackathon – Winter 2025.
|
| 503 |
+
|
| 504 |
+
---
|
| 505 |
+
|
| 506 |
+
## 🤝 Contributing
|
| 507 |
+
|
| 508 |
+
This project was built for the hackathon, but improvements and suggestions are welcome! Check out [EXAMPLES.md](EXAMPLES.md) for usage patterns and best practices.
|
| 509 |
+
|
| 510 |
+
---
|
| 511 |
+
|
| 512 |
+
## 📧 Contact
|
| 513 |
+
|
| 514 |
+
For questions about this MCP server, please reach out through the hackathon channels.
|
| 515 |
+
|
| 516 |
+
---
|
| 517 |
+
|
| 518 |
+
## 🌟 Acknowledgments
|
| 519 |
+
|
| 520 |
+
- Built with the [Model Context Protocol SDK](https://github.com/modelcontextprotocol)
|
| 521 |
+
- Powered by sentence-transformers, FAISS, and other open-source libraries
|
| 522 |
+
- Created for the MCP 1st Birthday Hackathon 2025
|
| 523 |
+
|
| 524 |
+
---
|
| 525 |
+
|
| 526 |
+
**Happy Automating! 🚀**
|
app.py
ADDED
|
@@ -0,0 +1,553 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
🚀 MissionControlMCP - Gradio Web Interface
|
| 3 |
+
Beautiful GUI demo for all 8 tools!
|
| 4 |
+
|
| 5 |
+
Run: python demo_gui.py
|
| 6 |
+
Then share the public URL on LinkedIn!
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import gradio as gr
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
import json
|
| 13 |
+
import base64
|
| 14 |
+
from io import BytesIO
|
| 15 |
+
from PIL import Image
|
| 16 |
+
|
| 17 |
+
# Setup paths
|
| 18 |
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
| 19 |
+
sys.path.append(SCRIPT_DIR)
|
| 20 |
+
EXAMPLES_DIR = os.path.join(SCRIPT_DIR, "examples")
|
| 21 |
+
|
| 22 |
+
# Import tools
|
| 23 |
+
from tools.pdf_reader import read_pdf
|
| 24 |
+
from tools.text_extractor import extract_text
|
| 25 |
+
from tools.web_fetcher import fetch_web_content
|
| 26 |
+
from tools.rag_search import search_documents
|
| 27 |
+
from tools.data_visualizer import visualize_data
|
| 28 |
+
from tools.file_converter import convert_file
|
| 29 |
+
from tools.email_intent_classifier import classify_email_intent
|
| 30 |
+
from tools.kpi_generator import generate_kpis
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
# ============================================================================
|
| 34 |
+
# TOOL FUNCTIONS
|
| 35 |
+
# ============================================================================
|
| 36 |
+
|
| 37 |
+
def tool_pdf_reader(pdf_file):
    """Extract text, metadata and keywords from an uploaded PDF.

    Returns a ``(markdown_report, None)`` pair so this handler's output
    shape matches the chart-producing tools in the Gradio UI.
    """
    try:
        if pdf_file is None:
            return "❌ Please upload a PDF file!", None

        parsed = read_pdf(pdf_file.name)
        text = parsed['text']
        meta = parsed['metadata']

        report = f"""✅ **PDF Analysis Complete!**

📄 **Metadata:**
- Pages: {parsed['pages']}
- Characters: {len(text):,}
- Author: {meta.get('author', 'N/A')}
- Title: {meta.get('title', 'N/A')}

📝 **Extracted Text (first 1000 chars):**
{text[:1000]}...
"""

        # Append auto-extracted keywords for a quick topical overview.
        keyword_info = extract_text(text, operation="keywords")
        report += f"\n\n🔑 **Keywords:** {keyword_info['result']}"

        return report, None

    except Exception as e:
        return f"❌ Error: {str(e)}", None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def tool_text_extractor(text, operation, max_length):
    """Run one text-processing operation and format the result as markdown.

    *operation* is one of 'clean', 'summarize', 'chunk' or 'keywords';
    *max_length* bounds summary length. Returns a report string for the UI.
    """
    try:
        if not text.strip():
            return "❌ Please enter some text!"

        processed = extract_text(text, operation=operation, max_length=max_length)

        return f"""✅ **Text Processing Complete!**

📊 **Operation:** {operation.upper()}
📏 **Word Count:** {processed['word_count']}

📝 **Result:**
{processed['result']}
"""

    except Exception as e:
        return f"❌ Error: {str(e)}"
|
| 88 |
+
|
| 89 |
+
|
| 90 |
+
def tool_web_fetcher(url):
    """Fetch a web page and report its title, size, links and a text preview.

    A status code of 999 is treated as bot-detection (common on sites like
    LinkedIn) and reported to the user instead of raising.
    """
    try:
        if not url.strip():
            return "❌ Please enter a URL!"

        page = fetch_web_content(url)

        if page['status_code'] == 999:
            return f"""⚠️ **Status 999 - Bot Detection**

The website is blocking automated requests.
This is common for LinkedIn, Facebook, etc.

Try a different website!"""

        body = page['content']
        report = f"""✅ **Website Fetched Successfully!**

🌐 **URL:** {url}
📊 **Status:** {page['status_code']}
📄 **Title:** {page.get('title', 'N/A')}
📏 **Content Length:** {len(body):,} characters
🔗 **Links Found:** {len(page.get('links', []))}

📝 **Content Preview (first 1000 chars):**
{body[:1000]}...
"""

        # Keyword extraction is only meaningful with some real content.
        if len(body) > 50:
            keyword_info = extract_text(body, operation="keywords")
            report += f"\n\n🔑 **Keywords:** {keyword_info['result']}"

        return report

    except Exception as e:
        return f"❌ Error: {str(e)}"
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def tool_rag_search(query):
    """Semantic-search the bundled sample documents for *query*.

    Documents are loaded from ``examples/sample_documents.txt`` and split
    on '##' headings; the top three matches are rendered with scores.
    """
    try:
        if not query.strip():
            return "❌ Please enter a search query!"

        # Load and split the bundled sample corpus.
        docs_file = os.path.join(EXAMPLES_DIR, "sample_documents.txt")
        with open(docs_file, "r", encoding="utf-8") as f:
            raw = f.read()

        corpus = [chunk.strip() for chunk in raw.split("##") if chunk.strip()]

        search = search_documents(query, corpus, top_k=3)
        hits = search['results']

        sections = [f"""✅ **Search Complete!**

🔍 **Query:** "{query}"
📚 **Documents Searched:** {len(corpus)}
📊 **Results Found:** {len(hits)}

🎯 **Top Results:**

"""]

        for rank, hit in enumerate(hits, 1):
            snippet = hit['document'][:200].replace('\n', ' ')
            sections.append(f"""
**Result {rank}** (Score: {hit['score']:.4f})
{snippet}...

""")

        return "".join(sections)

    except Exception as e:
        return f"❌ Error: {str(e)}"
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def tool_data_visualizer(csv_data, chart_type, x_col, y_col, title):
    """Render a chart from raw CSV text.

    Returns a (status_markdown, PIL.Image) pair; the image slot is None
    when validation or rendering fails.
    """
    try:
        if not csv_data.strip():
            return "❌ Please enter CSV data!", None

        chart = visualize_data(
            data=csv_data,
            chart_type=chart_type,
            x_column=x_col,
            y_column=y_col,
            title=title
        )

        # The tool hands back a base64 PNG; decode it into a PIL image for Gradio.
        png_bytes = base64.b64decode(chart['image_base64'])
        rendered = Image.open(BytesIO(png_bytes))

        status = f"""✅ **Chart Created!**

📊 **Chart Type:** {chart_type.upper()}
📏 **Dimensions:** {chart['dimensions']}
📈 **Title:** {title}
"""

        return status, rendered

    except Exception as e:
        return f"❌ Error: {str(e)}", None
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def tool_email_classifier(email_text):
    """Classify the intent of an email and format the result as Markdown."""
    try:
        if not email_text.strip():
            return "❌ Please enter email text!"

        classification = classify_email_intent(email_text)

        report = f"""✅ **Email Classified!**

🎯 **Primary Intent:** {classification['intent'].upper()}
📊 **Confidence:** {classification['confidence']:.2%}

💬 **Explanation:**
{classification['explanation']}
"""

        # Append up to three runner-up intents, if the classifier found any.
        secondary = classification['secondary_intents']
        if secondary:
            report += "\n\n📋 **Secondary Intents:**\n"
            for candidate in secondary[:3]:
                report += f"- {candidate['intent']}: {candidate['confidence']:.2%}\n"

        return report

    except Exception as e:
        return f"❌ Error: {str(e)}"
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
def tool_kpi_generator(business_json, metrics):
    """Compute business KPIs from a JSON payload and format a Markdown report."""
    try:
        if not business_json.strip():
            return "❌ Please enter business data!"

        # Fail fast on malformed JSON before invoking the generator.
        json.loads(business_json)

        kpi_result = generate_kpis(business_json, metrics=metrics)

        report = f"""✅ **KPIs Generated!**

📊 **Total KPIs Calculated:** {len(kpi_result['kpis'])}

📈 **Key Metrics:**

"""

        # Show at most the first 15 KPIs, formatted by metric family
        # (percentages, currency amounts, or plain numbers).
        top_kpis = list(kpi_result['kpis'].items())[:15]
        for position, (metric_name, metric_value) in enumerate(top_kpis, 1):
            if 'percent' in metric_name or 'rate' in metric_name or 'margin' in metric_name:
                rendered_value = f"{metric_value:.1f}%"
            elif 'revenue' in metric_name or 'profit' in metric_name or 'cost' in metric_name:
                rendered_value = f"${metric_value:,.0f}"
            else:
                rendered_value = f"{metric_value:,.2f}"

            label = metric_name.replace('_', ' ').title()
            report += f"{position}. **{label}:** {rendered_value}\n"

        report += f"\n\n📝 **Executive Summary:**\n{kpi_result['summary']}"

        if kpi_result.get('trends'):
            report += "\n\n📊 **Key Trends:**\n"
            for trend_line in kpi_result['trends'][:5]:
                report += f"- {trend_line}\n"

        return report

    except json.JSONDecodeError:
        return "❌ Invalid JSON format! Please check your data."
    except Exception as e:
        return f"❌ Error: {str(e)}"
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
# ============================================================================
|
| 275 |
+
# LOAD SAMPLE DATA
|
| 276 |
+
# ============================================================================
|
| 277 |
+
|
| 278 |
+
def load_sample_csv():
    """Read the bundled demo CSV and return its contents as a string.

    Fix: open with an explicit UTF-8 encoding — the previous call relied on
    the platform default encoding, inconsistent with load_sample_email().
    """
    csv_file = os.path.join(EXAMPLES_DIR, "business_data.csv")
    with open(csv_file, "r", encoding="utf-8") as f:
        return f.read()
|
| 282 |
+
|
| 283 |
+
def load_sample_email():
    """Return the bundled complaint-email example as a string."""
    sample_path = os.path.join(EXAMPLES_DIR, "sample_email_complaint.txt")
    with open(sample_path, "r", encoding="utf-8") as handle:
        return handle.read()
|
| 287 |
+
|
| 288 |
+
def load_sample_json():
    """Return the default business-data JSON payload for the KPI demo."""
    sample_payload = """{
    "revenue": 5500000,
    "costs": 3400000,
    "customers": 2700,
    "current_revenue": 5500000,
    "previous_revenue": 5400000,
    "current_customers": 2700,
    "previous_customers": 2650,
    "employees": 50,
    "marketing_spend": 500000,
    "sales": 5500000,
    "cogs": 2000000
}"""
    return sample_payload
|
| 302 |
+
|
| 303 |
+
|
| 304 |
+
# ============================================================================
|
| 305 |
+
# GRADIO INTERFACE
|
| 306 |
+
# ============================================================================
|
| 307 |
+
|
| 308 |
+
# Custom CSS injected into the Blocks app: base font plus larger tab labels.
custom_css = """
.gradio-container {
    font-family: 'Arial', sans-serif;
}
.tab-label {
    font-size: 16px !important;
}
"""
|
| 317 |
+
|
| 318 |
+
# Create the Gradio interface: one tab per demo tool, plus a footer.
# NOTE(review): indentation inside the Markdown/CSS string literals was lost
# in transit and is reconstructed flush-left here — confirm against the
# original app.py rendering.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="MissionControlMCP Demo") as demo:

    gr.Markdown("""
# 🚀 MissionControlMCP - Enterprise Automation Tools

**Try all 8 powerful tools in your browser!** No installation needed.

📌 Built for the HuggingFace Gradio Hackathon | 🏆 Claude MCP Integration
""")

    with gr.Tabs():

        # ====== TAB 1: PDF READER ======
        with gr.Tab("📄 PDF Reader"):
            gr.Markdown("### Extract text and metadata from PDF documents")

            with gr.Row():
                with gr.Column():
                    pdf_input = gr.File(label="Upload PDF", file_types=[".pdf"])
                    pdf_btn = gr.Button("🔍 Extract Text", variant="primary")

                with gr.Column():
                    pdf_output = gr.Textbox(label="Results", lines=15)
                    # Hidden until the handler decides to show a preview image.
                    pdf_img = gr.Image(label="Preview", visible=False)

            pdf_btn.click(tool_pdf_reader, inputs=[pdf_input], outputs=[pdf_output, pdf_img])

            gr.Examples([["Use the file upload above to try with your own PDF!"]], inputs=[])

        # ====== TAB 2: TEXT EXTRACTOR ======
        with gr.Tab("📝 Text Extractor"):
            gr.Markdown("### Extract keywords, generate summaries, clean text")

            with gr.Row():
                with gr.Column():
                    text_input = gr.Textbox(
                        label="Enter Text",
                        lines=8,
                        placeholder="Paste your text here..."
                    )
                    text_operation = gr.Radio(
                        ["keywords", "summarize", "clean", "chunk"],
                        label="Operation",
                        value="keywords"
                    )
                    # Slider range 100-1000, default 300; only used by the
                    # summarize/chunk operations.
                    text_length = gr.Slider(100, 1000, 300, label="Max Length (for summarize/chunk)")
                    text_btn = gr.Button("✨ Process Text", variant="primary")

                with gr.Column():
                    text_output = gr.Textbox(label="Results", lines=15)

            text_btn.click(
                tool_text_extractor,
                inputs=[text_input, text_operation, text_length],
                outputs=[text_output]
            )

            gr.Examples([
                ["Artificial Intelligence is transforming businesses worldwide. Companies are leveraging AI for automation, decision-making, and customer service. Machine learning models can now process vast amounts of data and provide actionable insights.", "keywords", 300],
                ["Climate change is one of the most pressing challenges of our time. Rising temperatures, extreme weather events, and environmental degradation require urgent action.", "summarize", 300]
            ], inputs=[text_input, text_operation, text_length])

        # ====== TAB 3: WEB FETCHER ======
        with gr.Tab("🌐 Web Fetcher"):
            gr.Markdown("### Scrape and analyze web content")

            with gr.Row():
                with gr.Column():
                    web_input = gr.Textbox(
                        label="Website URL",
                        placeholder="https://example.com",
                        value="https://example.com"
                    )
                    web_btn = gr.Button("🌐 Fetch Website", variant="primary")

                with gr.Column():
                    web_output = gr.Textbox(label="Results", lines=15)

            web_btn.click(tool_web_fetcher, inputs=[web_input], outputs=[web_output])

            gr.Examples([
                ["https://example.com"],
                ["https://python.org"],
                ["https://github.com"]
            ], inputs=[web_input])

        # ====== TAB 4: RAG SEARCH ======
        with gr.Tab("🔍 RAG Search"):
            gr.Markdown("### Semantic document search with AI embeddings")

            with gr.Row():
                with gr.Column():
                    rag_input = gr.Textbox(
                        label="Search Query",
                        placeholder="What are you looking for?",
                        value="What is machine learning?"
                    )
                    rag_btn = gr.Button("🔍 Search Documents", variant="primary")

                with gr.Column():
                    rag_output = gr.Textbox(label="Search Results", lines=15)

            rag_btn.click(tool_rag_search, inputs=[rag_input], outputs=[rag_output])

            gr.Examples([
                ["What is machine learning?"],
                ["How to reduce carbon emissions?"],
                ["What are modern web frameworks?"],
                ["Digital marketing strategies"]
            ], inputs=[rag_input])

        # ====== TAB 5: DATA VISUALIZER ======
        with gr.Tab("📊 Data Visualizer"):
            gr.Markdown("### Create beautiful charts from CSV data")

            with gr.Row():
                with gr.Column():
                    # Pre-filled with the bundled demo CSV so the chart works
                    # on first click.
                    viz_csv = gr.Textbox(
                        label="CSV Data",
                        lines=8,
                        value=load_sample_csv(),
                        placeholder="month,revenue,costs\nJan,100000,60000"
                    )
                    viz_chart = gr.Radio(
                        ["line", "bar", "pie", "scatter"],
                        label="Chart Type",
                        value="line"
                    )
                    viz_x = gr.Textbox(label="X Column", value="month")
                    viz_y = gr.Textbox(label="Y Column", value="revenue")
                    viz_title = gr.Textbox(label="Chart Title", value="Monthly Revenue")
                    viz_btn = gr.Button("📊 Create Chart", variant="primary")

                with gr.Column():
                    viz_output = gr.Textbox(label="Status", lines=5)
                    viz_img = gr.Image(label="Chart")

            viz_btn.click(
                tool_data_visualizer,
                inputs=[viz_csv, viz_chart, viz_x, viz_y, viz_title],
                outputs=[viz_output, viz_img]
            )

        # ====== TAB 6: EMAIL CLASSIFIER ======
        with gr.Tab("📧 Email Classifier"):
            gr.Markdown("### Detect email intent with AI")

            with gr.Row():
                with gr.Column():
                    email_input = gr.Textbox(
                        label="Email Text",
                        lines=10,
                        value=load_sample_email(),
                        placeholder="Paste email content here..."
                    )
                    email_btn = gr.Button("🎯 Classify Email", variant="primary")

                with gr.Column():
                    email_output = gr.Textbox(label="Classification Results", lines=15)

            email_btn.click(tool_email_classifier, inputs=[email_input], outputs=[email_output])

            gr.Examples([
                ["I am writing to complain about the poor service I received at your store yesterday."],
                ["Could you please send me more information about your pricing plans?"],
                ["URGENT: The server is down and customers cannot access the website!"]
            ], inputs=[email_input])

        # ====== TAB 7: KPI GENERATOR ======
        with gr.Tab("📈 KPI Generator"):
            gr.Markdown("### Calculate business metrics and KPIs")

            with gr.Row():
                with gr.Column():
                    kpi_json = gr.Textbox(
                        label="Business Data (JSON)",
                        lines=12,
                        value=load_sample_json(),
                        placeholder='{"revenue": 1000000, "costs": 600000}'
                    )
                    kpi_metrics = gr.CheckboxGroup(
                        ["revenue", "growth", "efficiency", "customer", "operational"],
                        label="Metrics to Calculate",
                        value=["revenue", "growth", "efficiency"]
                    )
                    kpi_btn = gr.Button("📈 Generate KPIs", variant="primary")

                with gr.Column():
                    kpi_output = gr.Textbox(label="KPI Report", lines=20)

            kpi_btn.click(
                tool_kpi_generator,
                inputs=[kpi_json, kpi_metrics],
                outputs=[kpi_output]
            )

    # Footer
    gr.Markdown("""
---

### 🎯 About MissionControlMCP

8 enterprise-grade automation tools integrated with Claude Desktop via Model Context Protocol (MCP).

- **PDF Reader** - Extract text from documents
- **Text Extractor** - Keywords, summaries, cleaning
- **Web Fetcher** - Scrape websites
- **RAG Search** - Semantic document search
- **Data Visualizer** - Create charts
- **File Converter** - Format conversions
- **Email Classifier** - Intent detection
- **KPI Generator** - Business analytics

🔗 **GitHub:** [AlBaraa-1/CleanEye-Hackathon](https://github.com/AlBaraa-1/CleanEye-Hackathon)

🏆 Built for HuggingFace Gradio x BuildWithMCP Hackathon
""")
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ============================================================================
|
| 539 |
+
# LAUNCH
|
| 540 |
+
# ============================================================================
|
| 541 |
+
|
| 542 |
+
if __name__ == "__main__":
    # Startup banner.
    banner = "=" * 80
    print("\n" + banner)
    print("🚀 Launching MissionControlMCP Web Interface...")
    print(banner)

    # Launch with public sharing enabled
    demo.launch(
        share=True,  # Creates public URL!
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True,
    )
|
examples/business_data.csv
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
month,revenue,costs,customers,employees
|
| 2 |
+
January,4200000,2800000,2100,42
|
| 3 |
+
February,4350000,2850000,2200,44
|
| 4 |
+
March,4500000,2900000,2300,45
|
| 5 |
+
April,4650000,2950000,2350,46
|
| 6 |
+
May,4800000,3000000,2400,47
|
| 7 |
+
June,4900000,3100000,2450,48
|
| 8 |
+
July,4950000,3050000,2480,48
|
| 9 |
+
August,5100000,3150000,2520,49
|
| 10 |
+
September,5200000,3200000,2550,49
|
| 11 |
+
October,5300000,3250000,2600,50
|
| 12 |
+
November,5400000,3300000,2650,50
|
| 13 |
+
December,5500000,3400000,2700,50
|
examples/sample_documents.txt
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sample Test Documents for RAG Search
|
| 2 |
+
|
| 3 |
+
## Document 1: Artificial Intelligence Overview
|
| 4 |
+
|
| 5 |
+
Artificial intelligence (AI) represents one of the most transformative technologies of our time. Machine learning, a subset of AI, enables computers to learn from data without explicit programming. Deep learning takes this further by using neural networks with multiple layers to process complex patterns.
|
| 6 |
+
|
| 7 |
+
AI applications span numerous industries including healthcare, finance, transportation, and entertainment. Natural language processing allows machines to understand and generate human language, while computer vision enables image and video analysis.
|
| 8 |
+
|
| 9 |
+
## Document 2: Climate Change and Sustainability
|
| 10 |
+
|
| 11 |
+
Climate change poses one of the greatest challenges facing humanity. Rising global temperatures, extreme weather events, and sea level rise threaten ecosystems and human societies. Sustainable practices are essential for reducing carbon emissions and protecting our planet.
|
| 12 |
+
|
| 13 |
+
Renewable energy sources like solar and wind power offer alternatives to fossil fuels. Energy efficiency improvements in buildings and transportation can significantly reduce environmental impact. Individual actions, from recycling to reducing consumption, contribute to collective climate solutions.
|
| 14 |
+
|
| 15 |
+
## Document 3: Modern Web Development
|
| 16 |
+
|
| 17 |
+
Web development has evolved significantly with modern frameworks and technologies. React, Vue, and Angular dominate frontend development, while Node.js enables JavaScript on the backend. Progressive Web Apps blur the line between web and native applications.
|
| 18 |
+
|
| 19 |
+
Cloud platforms like AWS, Azure, and Google Cloud provide scalable infrastructure for web applications. DevOps practices automate deployment and monitoring. Security considerations, including authentication and data protection, are paramount in web development.
|
| 20 |
+
|
| 21 |
+
## Document 4: Digital Marketing Strategies
|
| 22 |
+
|
| 23 |
+
Digital marketing encompasses SEO, content marketing, social media, and paid advertising. Understanding your target audience is crucial for effective campaigns. Analytics tools provide insights into user behavior and campaign performance.
|
| 24 |
+
|
| 25 |
+
Content marketing focuses on creating valuable content to attract and engage audiences. Social media platforms offer direct communication channels with customers. Email marketing remains one of the highest ROI channels for businesses.
|
| 26 |
+
|
| 27 |
+
## Document 5: Financial Technology Innovation
|
| 28 |
+
|
| 29 |
+
Fintech is revolutionizing financial services through technology. Mobile payments, cryptocurrency, and blockchain are transforming how we transact. Robo-advisors use algorithms to provide investment advice at lower costs than traditional advisors.
|
| 30 |
+
|
| 31 |
+
Open banking APIs enable third-party developers to build financial applications. Peer-to-peer lending platforms connect borrowers directly with lenders. Regulatory technology helps firms comply with complex financial regulations.
|
examples/sample_email_complaint.txt
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Subject: Issue with Order #45678
|
| 2 |
+
From: customer@example.com
|
| 3 |
+
Date: January 15, 2025
|
| 4 |
+
|
| 5 |
+
Hello,
|
| 6 |
+
|
| 7 |
+
I placed order #45678 on January 10th, but it still hasn't arrived. The tracking shows it's been stuck at the distribution center for 3 days now. This is extremely frustrating as I needed these items for an important event this weekend.
|
| 8 |
+
|
| 9 |
+
Can you please look into this urgently and let me know when I can expect delivery? If it won't arrive by Friday, I'll need to cancel the order and get a refund.
|
| 10 |
+
|
| 11 |
+
This isn't the first time I've had delivery issues with your company.
|
| 12 |
+
|
| 13 |
+
Thanks,
|
| 14 |
+
Jane Smith
|
examples/sample_email_inquiry.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Subject: Product Inquiry - Enterprise Plan
|
| 2 |
+
From: john.doe@bigcorp.com
|
| 3 |
+
Date: January 15, 2025
|
| 4 |
+
|
| 5 |
+
Hello,
|
| 6 |
+
|
| 7 |
+
I'm reaching out on behalf of BigCorp Inc. We're currently evaluating automation solutions for our enterprise operations and came across your platform.
|
| 8 |
+
|
| 9 |
+
Could you provide more information about:
|
| 10 |
+
1. Enterprise pricing plans
|
| 11 |
+
2. Available features for teams of 500+ users
|
| 12 |
+
3. API integration capabilities
|
| 13 |
+
4. Security and compliance certifications
|
| 14 |
+
5. Implementation timeline
|
| 15 |
+
|
| 16 |
+
We're looking to make a decision by the end of Q1. Would it be possible to schedule a demo call next week?
|
| 17 |
+
|
| 18 |
+
Best regards,
|
| 19 |
+
John Doe
|
| 20 |
+
Director of IT Operations
|
| 21 |
+
BigCorp Inc.
|
examples/sample_email_urgent.txt
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Subject: URGENT - System Outage
|
| 2 |
+
From: ops.team@company.com
|
| 3 |
+
Date: January 15, 2025
|
| 4 |
+
|
| 5 |
+
URGENT: Production system is down!
|
| 6 |
+
|
| 7 |
+
Our main application server crashed at 2:30 PM EST. All customer-facing services are currently unavailable. Error logs show database connection timeout errors.
|
| 8 |
+
|
| 9 |
+
Impact:
|
| 10 |
+
- 5,000+ active users affected
|
| 11 |
+
- Revenue loss: ~$10K per hour
|
| 12 |
+
- Customer support receiving high volume of complaints
|
| 13 |
+
|
| 14 |
+
Actions taken so far:
|
| 15 |
+
- Restarted application servers (no success)
|
| 16 |
+
- Database team investigating
|
| 17 |
+
- Switched to emergency maintenance page
|
| 18 |
+
|
| 19 |
+
NEED IMMEDIATE ATTENTION from DevOps and Database teams!
|
| 20 |
+
|
| 21 |
+
Updates being posted to #incident-response Slack channel.
|
| 22 |
+
|
| 23 |
+
- Operations Team
|
examples/sample_report.txt
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Sample Business Report
|
| 2 |
+
|
| 3 |
+
## Q4 2024 Performance Summary
|
| 4 |
+
|
| 5 |
+
**Executive Summary:**
|
| 6 |
+
Our company achieved outstanding performance in Q4 2024, with revenue reaching $5,000,000, representing a 19% increase compared to the previous quarter. This growth was driven by strong customer acquisition and improved operational efficiency.
|
| 7 |
+
|
| 8 |
+
### Key Highlights:
|
| 9 |
+
- **Revenue Growth:** 19% quarter-over-quarter increase
|
| 10 |
+
- **Customer Base:** Expanded to 2,500 active customers
|
| 11 |
+
- **Profit Margin:** Maintained at 40%, indicating strong cost management
|
| 12 |
+
- **Team Expansion:** Grew from 42 to 50 employees
|
| 13 |
+
|
| 14 |
+
### Financial Metrics:
|
| 15 |
+
The company generated $5M in revenue against $3M in costs, resulting in a healthy $2M profit. Revenue per employee stands at $100,000, demonstrating excellent productivity levels.
|
| 16 |
+
|
| 17 |
+
### Market Position:
|
| 18 |
+
We've successfully penetrated three new market segments, with enterprise clients now representing 35% of our customer base. Customer satisfaction scores remain high at 4.6/5.0.
|
| 19 |
+
|
| 20 |
+
### Looking Forward:
|
| 21 |
+
Based on current trends, we project 25% revenue growth for Q1 2025. Key initiatives include expanding our sales team, launching two new product features, and entering the European market.
|
| 22 |
+
|
| 23 |
+
### Operational Efficiency:
|
| 24 |
+
- Customer acquisition cost: $800
|
| 25 |
+
- Lifetime value: $5,000
|
| 26 |
+
- Churn rate: 3.2% (industry average: 5%)
|
| 27 |
+
- Support response time: Under 2 hours
|
| 28 |
+
|
| 29 |
+
### Technology Investments:
|
| 30 |
+
We've invested $500K in infrastructure improvements, including AI-powered customer service tools and automated reporting systems. These investments are expected to reduce operational costs by 15% in 2025.
|
| 31 |
+
|
| 32 |
+
### Challenges and Mitigation:
|
| 33 |
+
While we faced increased competition in Q4, our unique value proposition and superior customer service allowed us to maintain market share. We're addressing scaling challenges through process automation and strategic hiring.
|
| 34 |
+
|
| 35 |
+
### Conclusion:
|
| 36 |
+
Q4 2024 demonstrated strong business fundamentals and positioned us well for continued growth. Our focus on customer success, operational excellence, and strategic innovation will drive performance in 2025.
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
*This report was generated on January 15, 2025*
|
| 41 |
+
*Prepared by: Finance Department*
|
| 42 |
+
*Confidential - Internal Use Only*
|
requirements.txt
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# MissionControlMCP Requirements
|
| 2 |
+
# Python 3.11+ required
|
| 3 |
+
|
| 4 |
+
# MCP SDK
|
| 5 |
+
mcp>=1.0.0
|
| 6 |
+
|
| 7 |
+
# Document Processing
|
| 8 |
+
pypdf2>=3.0.0
|
| 9 |
+
python-docx>=1.0.0
|
| 10 |
+
|
| 11 |
+
# Web Scraping
|
| 12 |
+
requests>=2.31.0
|
| 13 |
+
beautifulsoup4>=4.12.0
|
| 14 |
+
|
| 15 |
+
# Data Processing
|
| 16 |
+
pandas>=2.0.0
|
| 17 |
+
numpy>=1.24.0
|
| 18 |
+
|
| 19 |
+
# Vector Store & Embeddings
|
| 20 |
+
faiss-cpu>=1.7.4
|
| 21 |
+
sentence-transformers>=2.2.0
|
| 22 |
+
|
| 23 |
+
# Visualization
|
| 24 |
+
matplotlib>=3.7.0
|
| 25 |
+
seaborn>=0.12.0
|
| 26 |
+
pillow>=10.0.0
|
| 27 |
+
|
| 28 |
+
# Web Interface (for demo_gui.py)
|
| 29 |
+
gradio>=4.0.0
|
| 30 |
+
|
| 31 |
+
# NLP & Text Processing
|
| 32 |
+
nltk>=3.8.0
|
| 33 |
+
scikit-learn>=1.3.0
|
| 34 |
+
|
| 35 |
+
# Utilities
|
| 36 |
+
python-dateutil>=2.8.0
|
| 37 |
+
pydantic>=2.0.0
|
tools/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MissionControlMCP Tools Package
|
| 3 |
+
"""
|
tools/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (235 Bytes). View file
|
|
|
tools/__pycache__/data_visualizer.cpython-312.pyc
ADDED
|
Binary file (9.09 kB). View file
|
|
|
tools/__pycache__/email_intent_classifier.cpython-312.pyc
ADDED
|
Binary file (8.79 kB). View file
|
|
|
tools/__pycache__/file_converter.cpython-312.pyc
ADDED
|
Binary file (8.75 kB). View file
|
|
|
tools/__pycache__/kpi_generator.cpython-312.pyc
ADDED
|
Binary file (12.2 kB). View file
|
|
|
tools/__pycache__/pdf_reader.cpython-312.pyc
ADDED
|
Binary file (3.91 kB). View file
|
|
|
tools/__pycache__/rag_search.cpython-312.pyc
ADDED
|
Binary file (5.5 kB). View file
|
|
|
tools/__pycache__/text_extractor.cpython-312.pyc
ADDED
|
Binary file (4.13 kB). View file
|
|
|
tools/__pycache__/web_fetcher.cpython-312.pyc
ADDED
|
Binary file (6.96 kB). View file
|
|
|
tools/data_visualizer.py
ADDED
|
@@ -0,0 +1,231 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Visualizer Tool - Create charts from data
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
import io
|
| 7 |
+
import base64
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
# Add parent directory to path for imports
|
| 12 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 13 |
+
|
| 14 |
+
from utils.helpers import parse_json_safe
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def visualize_data(
    data: str,
    chart_type: str = "bar",
    x_column: str = None,
    y_column: str = None,
    title: str = "Data Visualization"
) -> Dict[str, Any]:
    """
    Create a chart visualization from data.

    Args:
        data: JSON or CSV string data
        chart_type: Type of chart - 'bar', 'line', 'pie', 'scatter'
        x_column: X-axis column name (defaults to the first column)
        y_column: Y-axis column name (defaults to the second column, or the
            first when the data has only one column)
        title: Chart title

    Returns:
        Dictionary with base64 encoded PNG image and metadata.

    Raises:
        ValueError: On empty data, missing columns, or an unknown chart type.
    """
    try:
        # Imported lazily so the module can load without plotting deps.
        import matplotlib.pyplot as plt
        import pandas as pd
        import json

        # Parse the payload: try JSON first, fall back to CSV.
        try:
            data_dict = json.loads(data)
            df = pd.DataFrame(data_dict)
        except json.JSONDecodeError:
            from io import StringIO
            df = pd.read_csv(StringIO(data))

        if df.empty:
            raise ValueError("Data is empty")

        # Auto-select columns if not specified.
        if x_column is None and len(df.columns) > 0:
            x_column = df.columns[0]
        if y_column is None and len(df.columns) > 1:
            y_column = df.columns[1]
        elif y_column is None:
            y_column = df.columns[0]

        # Validate that the requested columns exist.
        if x_column not in df.columns:
            raise ValueError(f"Column '{x_column}' not found in data")
        if y_column not in df.columns:
            raise ValueError(f"Column '{y_column}' not found in data")

        # Create figure
        plt.figure(figsize=(10, 6))

        # Fix: close the figure on ALL paths (including the unknown-chart-type
        # error path below) so failed calls don't leak matplotlib figures.
        try:
            if chart_type == "bar":
                plt.bar(df[x_column], df[y_column])
                plt.xlabel(x_column)
                plt.ylabel(y_column)

            elif chart_type == "line":
                plt.plot(df[x_column], df[y_column], marker='o')
                plt.xlabel(x_column)
                plt.ylabel(y_column)
                plt.grid(True, alpha=0.3)

            elif chart_type == "pie":
                plt.pie(df[y_column], labels=df[x_column], autopct='%1.1f%%')

            elif chart_type == "scatter":
                plt.scatter(df[x_column], df[y_column], alpha=0.6)
                plt.xlabel(x_column)
                plt.ylabel(y_column)
                plt.grid(True, alpha=0.3)

            else:
                raise ValueError(f"Unknown chart type: {chart_type}")

            plt.title(title)
            plt.tight_layout()

            # Render to an in-memory PNG and encode as base64.
            buffer = io.BytesIO()
            plt.savefig(buffer, format='png', dpi=100, bbox_inches='tight')
            buffer.seek(0)
            image_base64 = base64.b64encode(buffer.read()).decode('utf-8')
        finally:
            plt.close()

        return {
            "image_base64": image_base64,
            # Nominal size (figsize 10x6 in at dpi=100); bbox_inches='tight'
            # may trim the saved image slightly.
            "dimensions": {"width": 1000, "height": 600},
            "chart_type": chart_type,
            "title": title,
            "columns_used": {"x": x_column, "y": y_column}
        }

    except Exception as e:
        logger.error(f"Error creating visualization: {e}")
        raise
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def create_multi_chart(data: str, chart_configs: list) -> Dict[str, Any]:
    """
    Create multiple charts from the same dataset.

    Args:
        data: JSON or CSV string data
        chart_configs: List of chart configuration dictionaries; each may
            contain 'chart_type', 'x_column', 'y_column' and 'title'.

    Returns:
        Dictionary with 'total_charts' and a 'charts' list. A failing chart
        config contributes an {"error": ...} entry instead of aborting the
        whole batch.

    Raises:
        json/pandas parsing errors if `data` is neither valid JSON nor
        valid CSV.
    """
    try:
        import pandas as pd
        import json

        # Validate the payload once up front so malformed data fails fast,
        # before any per-chart work. visualize_data() re-parses the same
        # string for each chart. (The previous version also imported
        # matplotlib.pyplot here without using it; that import is removed.)
        try:
            pd.DataFrame(json.loads(data))
        except json.JSONDecodeError:
            from io import StringIO
            pd.read_csv(StringIO(data))

        charts = []
        for idx, config in enumerate(chart_configs):
            try:
                result = visualize_data(
                    data,
                    chart_type=config.get("chart_type", "bar"),
                    x_column=config.get("x_column"),
                    y_column=config.get("y_column"),
                    title=config.get("title", f"Chart {idx+1}")
                )
                charts.append(result)
            except Exception as e:
                # One bad config should not sink the whole batch.
                logger.error(f"Error creating chart {idx+1}: {e}")
                charts.append({"error": str(e)})

        return {
            "total_charts": len(charts),
            "charts": charts
        }

    except Exception as e:
        logger.error(f"Error creating multi-chart: {e}")
        raise
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def generate_statistics_chart(data: str) -> Dict[str, Any]:
    """
    Generate a statistical summary chart from numeric data.

    Args:
        data: JSON or CSV string with numeric data

    Returns:
        Dictionary with the chart as a base64-encoded PNG, descriptive
        statistics per numeric column, and the list of numeric columns.

    Raises:
        ValueError: if the parsed data contains no numeric columns.
    """
    try:
        import matplotlib.pyplot as plt
        import pandas as pd
        import json

        # Try JSON first; fall back to CSV parsing on decode failure.
        try:
            frame = pd.DataFrame(json.loads(data))
        except json.JSONDecodeError:
            from io import StringIO
            frame = pd.read_csv(StringIO(data))

        numeric_cols = frame.select_dtypes(include=['number']).columns
        if len(numeric_cols) == 0:
            raise ValueError("No numeric columns found in data")

        # Side-by-side box plot and histogram over every numeric column.
        fig, axes = plt.subplots(1, 2, figsize=(14, 6))

        frame[numeric_cols].boxplot(ax=axes[0])
        axes[0].set_title("Distribution (Box Plot)")
        axes[0].set_ylabel("Values")

        frame[numeric_cols].hist(ax=axes[1], bins=20, alpha=0.7)
        axes[1].set_title("Distribution (Histogram)")

        plt.tight_layout()

        # Render into an in-memory PNG and base64-encode for transport.
        buf = io.BytesIO()
        plt.savefig(buf, format='png', dpi=100, bbox_inches='tight')
        buf.seek(0)
        encoded = base64.b64encode(buf.read()).decode('utf-8')
        plt.close()

        return {
            "image_base64": encoded,
            "statistics": frame[numeric_cols].describe().to_dict(),
            "numeric_columns": list(numeric_cols)
        }

    except Exception as e:
        logger.error(f"Error generating statistics chart: {e}")
        raise
|
tools/email_intent_classifier.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Email Intent Classifier Tool - Classify email intents using NLP
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any, List
|
| 6 |
+
import re
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Add parent directory to path for imports
|
| 11 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class EmailIntentClassifier:
    """
    Rule-based email intent classifier with confidence scoring.

    Each intent is described by a list of regular expressions; the score of
    an intent is the number of matches across its patterns, normalized to
    [0, 1] by dividing by 3 and capping at 1.0.
    """

    # Keyword/phrase regexes per intent label.
    INTENT_PATTERNS = {
        "inquiry": [
            r'\b(question|wondering|curious|clarification|information|details|help)\b',
            r'\b(what|when|where|who|why|how)\b.*\?',
            r'\b(could you|can you|would you).*\b(explain|tell|provide|share)\b'
        ],
        "complaint": [
            r'\b(complaint|issue|problem|disappointed|frustrated|unhappy|angry)\b',
            r'\b(not working|broken|failed|error|mistake)\b',
            r'\b(terrible|awful|worst|horrible|unacceptable)\b'
        ],
        "request": [
            r'\b(please|kindly|request|need|require|would like)\b',
            r'\b(send|provide|share|give|deliver|forward)\b.*\b(me|us)\b',
            r'\b(need|want|looking for)\b'
        ],
        "feedback": [
            r'\b(feedback|suggestion|recommend|improve|enhancement)\b',
            r'\b(think|believe|feel|opinion)\b.*\b(should|could|would)\b',
            r'\b(great|excellent|good|nice|appreciate|love)\b'
        ],
        "meeting": [
            r'\b(meeting|schedule|appointment|call|discuss|conference)\b',
            r'\b(available|availability|free time|calendar)\b',
            r'\b(reschedule|postpone|cancel|confirm)\b'
        ],
        "order": [
            r'\b(order|purchase|buy|payment|invoice|receipt)\b',
            r'\b(shipping|delivery|tracking|status)\b',
            r'\b(product|item|package)\b'
        ],
        "urgent": [
            r'\b(urgent|asap|immediately|critical|emergency|priority)\b',
            r'\b(time-sensitive|deadline|due)\b',
            r'!!+|\bIMPORTANT\b'
        ],
        "follow_up": [
            r'\b(follow up|following up|checking in|reminder)\b',
            r'\b(haven\'t heard|waiting for|still pending)\b',
            r'\b(previous|earlier|sent|mentioned)\b.*\b(email|message)\b'
        ],
        "thank_you": [
            r'\b(thank|thanks|grateful|appreciate|gratitude)\b',
            r'\b(wonderful|excellent|helpful)\b.*\b(work|help|support)\b'
        ],
        "application": [
            r'\b(apply|application|position|job|role|opportunity)\b',
            r'\b(resume|cv|cover letter|portfolio)\b',
            r'\b(interested in|applying for)\b'
        ]
    }

    def classify(self, email_text: str) -> Dict[str, Any]:
        """
        Classify email intent with confidence scores.

        Args:
            email_text: Email text to classify

        Returns:
            Dictionary with the primary intent, its confidence, up to three
            secondary intents, and a short explanation string.

        Raises:
            ValueError: if the email text is empty or whitespace-only.
        """
        if not email_text or not email_text.strip():
            raise ValueError("Email text cannot be empty")

        lowered = email_text.lower()

        # Score each intent by total pattern-match count, normalized to [0, 1].
        scores = {}
        for label, patterns in self.INTENT_PATTERNS.items():
            hits = sum(len(re.findall(p, lowered, re.IGNORECASE)) for p in patterns)
            if hits:
                scores[label] = min(hits / 3.0, 1.0)

        # Nothing matched: fall back to a neutral "general" classification.
        if not scores:
            return {
                "intent": "general",
                "confidence": 0.5,
                "secondary_intents": [],
                "explanation": "No specific intent patterns detected"
            }

        # Rank intents by score (ties keep dict insertion order — stable sort).
        ranked = sorted(scores.items(), key=lambda kv: kv[1], reverse=True)
        top_label, top_score = ranked[0]
        runners_up = [
            {"intent": label, "confidence": round(value, 3)}
            for label, value in ranked[1:4]
        ]

        return {
            "intent": top_label,
            "confidence": round(top_score, 3),
            "secondary_intents": runners_up,
            "explanation": f"Detected {top_label} intent based on keyword analysis"
        }
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def classify_email_intent(email_text: str) -> Dict[str, Any]:
    """
    Classify the intent of an email.

    Args:
        email_text: Email text to classify

    Returns:
        Classification dictionary from EmailIntentClassifier.classify(),
        augmented with 'email_length' and 'word_count' metadata.
    """
    try:
        outcome = EmailIntentClassifier().classify(email_text)

        # Attach simple size metadata alongside the classification.
        outcome["email_length"] = len(email_text)
        outcome["word_count"] = len(email_text.split())

        return outcome

    except Exception as e:
        logger.error(f"Error classifying email intent: {e}")
        raise
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def classify_batch_emails(emails: List[str]) -> Dict[str, Any]:
    """
    Classify multiple emails at once.

    Args:
        emails: List of email text strings

    Returns:
        Dictionary with per-email results (each tagged with 'email_index')
        and an aggregated intent distribution. Individual failures are
        recorded as intent "error" with confidence 0.0 rather than raised.
    """
    try:
        classifier = EmailIntentClassifier()
        outcomes = []

        for position, body in enumerate(emails):
            try:
                classified = classifier.classify(body)
                classified["email_index"] = position
                outcomes.append(classified)
            except Exception as e:
                # Record the failure but keep processing the rest of the batch.
                logger.error(f"Error classifying email {position}: {e}")
                outcomes.append({
                    "email_index": position,
                    "error": str(e),
                    "intent": "error",
                    "confidence": 0.0
                })

        # Tally how many emails landed in each intent bucket.
        distribution = {}
        for entry in outcomes:
            label = entry.get("intent", "unknown")
            distribution[label] = distribution.get(label, 0) + 1

        return {
            "total_emails": len(emails),
            "results": outcomes,
            "intent_distribution": distribution
        }

    except Exception as e:
        logger.error(f"Error in batch email classification: {e}")
        raise
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def extract_email_features(email_text: str) -> Dict[str, Any]:
    """
    Extract features from an email for analysis.

    Args:
        email_text: Email text

    Returns:
        Dictionary of simple lexical features: size counts, greeting/closing
        flags, punctuation counts, and URL / email-address presence flags.
    """
    try:
        lowered = email_text.lower()

        return {
            "length": len(email_text),
            "word_count": len(email_text.split()),
            # Split on sentence-ending punctuation runs; trailing empty
            # fragments are counted, matching the original behavior.
            "sentence_count": len(re.split(r'[.!?]+', email_text)),
            "has_greeting": re.search(r'\b(hi|hello|dear|hey)\b', lowered) is not None,
            "has_closing": re.search(r'\b(regards|sincerely|thanks|best)\b', lowered) is not None,
            "question_count": email_text.count('?'),
            "exclamation_count": email_text.count('!'),
            "has_url": re.search(r'https?://', email_text) is not None,
            "has_email_address": re.search(
                r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', email_text
            ) is not None
        }

    except Exception as e:
        logger.error(f"Error extracting email features: {e}")
        raise
|
tools/file_converter.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File Converter Tool - Convert between different file formats
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
|
| 10 |
+
# Add parent directory to path for imports
|
| 11 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def convert_file(input_path: str, output_format: str, output_path: str | None = None) -> Dict[str, Any]:
    """
    Convert a file from one format to another.

    Supported conversions:
    - PDF to TXT
    - TXT to CSV (assumes structured text)
    - CSV to TXT
    - Any text-based format conversions (txt/md/log)

    Args:
        input_path: Path to input file
        output_format: Desired output format ('txt', 'csv', 'pdf')
        output_path: Optional output path; auto-generated next to the input
            file (same stem, new suffix) when not provided

    Returns:
        Dictionary with conversion results: output path, success flag,
        human-readable message, input/output formats, and output file size
        in bytes (0 if the converter failed before creating the file).

    Raises:
        FileNotFoundError: if the input file does not exist.
        ValueError: if the requested format pair is not supported.
    """
    try:
        input_file = Path(input_path)

        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_path}")

        # Input format is inferred from the extension (may be "" if absent).
        input_format = input_file.suffix.lower().replace('.', '')

        if output_path is None:
            output_path = str(input_file.parent / f"{input_file.stem}.{output_format}")

        output_file = Path(output_path)

        # Dispatch to the matching converter; each returns (success, message).
        if input_format == 'pdf' and output_format == 'txt':
            success, message = _pdf_to_txt(input_path, output_path)

        elif input_format == 'txt' and output_format == 'csv':
            success, message = _txt_to_csv(input_path, output_path)

        elif input_format == 'csv' and output_format == 'txt':
            success, message = _csv_to_txt(input_path, output_path)

        elif input_format in ['txt', 'md', 'log'] and output_format in ['txt', 'md', 'log']:
            success, message = _text_to_text(input_path, output_path)

        else:
            raise ValueError(f"Conversion from {input_format} to {output_format} not supported")

        return {
            "output_path": str(output_file),
            "success": success,
            "message": message,
            "input_format": input_format,
            "output_format": output_format,
            "file_size_bytes": output_file.stat().st_size if output_file.exists() else 0
        }

    except Exception as e:
        logger.error(f"Error converting file: {e}")
        raise
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def _pdf_to_txt(input_path: str, output_path: str) -> tuple:
    """
    Convert PDF to TXT.

    Extracts text from every page, joins pages with blank lines, and writes
    the result as UTF-8. Returns a (success, message) tuple; failures are
    logged and reported rather than raised.
    """
    try:
        from PyPDF2 import PdfReader

        reader = PdfReader(input_path)

        # Collect only pages that actually yielded text.
        extracted = []
        for page in reader.pages:
            content = page.extract_text()
            if content:
                extracted.append(content)

        with open(output_path, 'w', encoding='utf-8') as out:
            out.write("\n\n".join(extracted))

        return True, f"Successfully converted PDF to TXT ({len(reader.pages)} pages)"

    except Exception as e:
        logger.error(f"PDF to TXT conversion error: {e}")
        return False, str(e)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _txt_to_csv(input_path: str, output_path: str) -> tuple:
|
| 105 |
+
"""Convert TXT to CSV (assumes tab or comma separated values)"""
|
| 106 |
+
try:
|
| 107 |
+
import pandas as pd
|
| 108 |
+
|
| 109 |
+
# Try to read as CSV with different delimiters
|
| 110 |
+
try:
|
| 111 |
+
df = pd.read_csv(input_path, sep='\t')
|
| 112 |
+
except:
|
| 113 |
+
try:
|
| 114 |
+
df = pd.read_csv(input_path, sep=',')
|
| 115 |
+
except:
|
| 116 |
+
# If not structured, create simple CSV with one column
|
| 117 |
+
with open(input_path, 'r', encoding='utf-8') as f:
|
| 118 |
+
lines = f.readlines()
|
| 119 |
+
|
| 120 |
+
df = pd.DataFrame({'text': [line.strip() for line in lines if line.strip()]})
|
| 121 |
+
|
| 122 |
+
df.to_csv(output_path, index=False)
|
| 123 |
+
|
| 124 |
+
return True, f"Successfully converted TXT to CSV ({len(df)} rows)"
|
| 125 |
+
|
| 126 |
+
except Exception as e:
|
| 127 |
+
logger.error(f"TXT to CSV conversion error: {e}")
|
| 128 |
+
return False, str(e)
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
def _csv_to_txt(input_path: str, output_path: str) -> tuple:
|
| 132 |
+
"""Convert CSV to TXT"""
|
| 133 |
+
try:
|
| 134 |
+
import pandas as pd
|
| 135 |
+
|
| 136 |
+
df = pd.read_csv(input_path)
|
| 137 |
+
|
| 138 |
+
# Convert to formatted text
|
| 139 |
+
text = df.to_string(index=False)
|
| 140 |
+
|
| 141 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
| 142 |
+
f.write(text)
|
| 143 |
+
|
| 144 |
+
return True, f"Successfully converted CSV to TXT ({len(df)} rows)"
|
| 145 |
+
|
| 146 |
+
except Exception as e:
|
| 147 |
+
logger.error(f"CSV to TXT conversion error: {e}")
|
| 148 |
+
return False, str(e)
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _text_to_text(input_path: str, output_path: str) -> tuple:
|
| 152 |
+
"""Convert between text-based formats"""
|
| 153 |
+
try:
|
| 154 |
+
with open(input_path, 'r', encoding='utf-8') as f:
|
| 155 |
+
content = f.read()
|
| 156 |
+
|
| 157 |
+
with open(output_path, 'w', encoding='utf-8') as f:
|
| 158 |
+
f.write(content)
|
| 159 |
+
|
| 160 |
+
return True, "Successfully converted text file"
|
| 161 |
+
|
| 162 |
+
except Exception as e:
|
| 163 |
+
logger.error(f"Text to text conversion error: {e}")
|
| 164 |
+
return False, str(e)
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def batch_convert(input_files: list, output_format: str) -> Dict[str, Any]:
    """
    Convert multiple files to the same output format.

    Args:
        input_files: List of input file paths
        output_format: Desired output format for all files

    Returns:
        Dictionary with batch conversion results: totals of successful and
        failed conversions plus the per-file result list. Per-file errors
        are recorded, not raised.
    """
    outcomes = []

    for path in input_files:
        try:
            converted = convert_file(path, output_format)
            converted["input_file"] = path
            outcomes.append(converted)
        except Exception as e:
            # A single failure must not stop the rest of the batch.
            logger.error(f"Error converting {path}: {e}")
            outcomes.append({
                "input_file": path,
                "success": False,
                "message": str(e)
            })

    ok_count = sum(1 for entry in outcomes if entry.get("success", False))

    return {
        "total_files": len(input_files),
        "successful": ok_count,
        "failed": len(input_files) - ok_count,
        "results": outcomes
    }
|
tools/kpi_generator.py
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
KPI Generator Tool - Generate business KPIs from data
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any, List
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path for imports
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
|
| 12 |
+
from utils.helpers import parse_json_safe, safe_divide
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def generate_kpis(data: str, metrics: List[str] = None) -> Dict[str, Any]:
    """
    Generate KPI report from business data.

    Args:
        data: JSON string containing business data
        metrics: List of metrics to calculate (revenue, growth, efficiency,
            customer, operational); defaults to revenue/growth/efficiency

    Returns:
        Dictionary with calculated KPIs, an executive summary, identified
        trends, the metrics analysed, and a data-point count.

    Raises:
        ValueError: if `data` is not valid JSON.
    """
    try:
        import json

        try:
            business_data = json.loads(data)
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON data: {e}")

        if metrics is None:
            metrics = ["revenue", "growth", "efficiency"]

        # Map metric names to their calculators; unknown names are ignored.
        calculators = {
            "revenue": _calculate_revenue_kpis,
            "growth": _calculate_growth_kpis,
            "efficiency": _calculate_efficiency_kpis,
            "customer": _calculate_customer_kpis,
            "operational": _calculate_operational_kpis,
        }

        kpis = {}
        for metric in metrics:
            calculator = calculators.get(metric)
            if calculator is not None:
                kpis.update(calculator(business_data))

        trends = _identify_trends(kpis, business_data)
        summary = _generate_summary(kpis, trends)

        return {
            "kpis": kpis,
            "summary": summary,
            "trends": trends,
            "metrics_analyzed": metrics,
            "data_points": len(business_data) if isinstance(business_data, list) else len(business_data.keys())
        }

    except Exception as e:
        logger.error(f"Error generating KPIs: {e}")
        raise
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def _calculate_revenue_kpis(data: Dict[str, Any]) -> Dict[str, Any]:
|
| 85 |
+
"""Calculate revenue-related KPIs"""
|
| 86 |
+
kpis = {}
|
| 87 |
+
|
| 88 |
+
try:
|
| 89 |
+
# Total Revenue
|
| 90 |
+
if "revenue" in data:
|
| 91 |
+
if isinstance(data["revenue"], list):
|
| 92 |
+
kpis["total_revenue"] = sum(data["revenue"])
|
| 93 |
+
kpis["average_revenue"] = sum(data["revenue"]) / len(data["revenue"])
|
| 94 |
+
kpis["min_revenue"] = min(data["revenue"])
|
| 95 |
+
kpis["max_revenue"] = max(data["revenue"])
|
| 96 |
+
else:
|
| 97 |
+
kpis["total_revenue"] = data["revenue"]
|
| 98 |
+
|
| 99 |
+
# Revenue per customer
|
| 100 |
+
if "revenue" in data and "customers" in data:
|
| 101 |
+
revenue = data["revenue"] if not isinstance(data["revenue"], list) else sum(data["revenue"])
|
| 102 |
+
customers = data["customers"] if not isinstance(data["customers"], list) else sum(data["customers"])
|
| 103 |
+
kpis["revenue_per_customer"] = safe_divide(revenue, customers)
|
| 104 |
+
|
| 105 |
+
# Profit margin
|
| 106 |
+
if "revenue" in data and "costs" in data:
|
| 107 |
+
revenue = data["revenue"] if not isinstance(data["revenue"], list) else sum(data["revenue"])
|
| 108 |
+
costs = data["costs"] if not isinstance(data["costs"], list) else sum(data["costs"])
|
| 109 |
+
profit = revenue - costs
|
| 110 |
+
kpis["profit"] = profit
|
| 111 |
+
kpis["profit_margin_percent"] = safe_divide(profit * 100, revenue)
|
| 112 |
+
|
| 113 |
+
except Exception as e:
|
| 114 |
+
logger.warning(f"Error calculating revenue KPIs: {e}")
|
| 115 |
+
|
| 116 |
+
return kpis
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _calculate_growth_kpis(data: Dict[str, Any]) -> Dict[str, Any]:
|
| 120 |
+
"""Calculate growth-related KPIs"""
|
| 121 |
+
kpis = {}
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
# Year-over-year growth
|
| 125 |
+
if "current_revenue" in data and "previous_revenue" in data:
|
| 126 |
+
growth = data["current_revenue"] - data["previous_revenue"]
|
| 127 |
+
growth_rate = safe_divide(growth * 100, data["previous_revenue"])
|
| 128 |
+
kpis["revenue_growth"] = growth
|
| 129 |
+
kpis["revenue_growth_rate_percent"] = growth_rate
|
| 130 |
+
|
| 131 |
+
# Customer growth
|
| 132 |
+
if "current_customers" in data and "previous_customers" in data:
|
| 133 |
+
customer_growth = data["current_customers"] - data["previous_customers"]
|
| 134 |
+
customer_growth_rate = safe_divide(customer_growth * 100, data["previous_customers"])
|
| 135 |
+
kpis["customer_growth"] = customer_growth
|
| 136 |
+
kpis["customer_growth_rate_percent"] = customer_growth_rate
|
| 137 |
+
|
| 138 |
+
# Monthly growth rate (if time series data provided)
|
| 139 |
+
if "monthly_revenue" in data and isinstance(data["monthly_revenue"], list):
|
| 140 |
+
revenues = data["monthly_revenue"]
|
| 141 |
+
if len(revenues) >= 2:
|
| 142 |
+
recent_growth = safe_divide((revenues[-1] - revenues[-2]) * 100, revenues[-2])
|
| 143 |
+
kpis["recent_monthly_growth_percent"] = recent_growth
|
| 144 |
+
|
| 145 |
+
except Exception as e:
|
| 146 |
+
logger.warning(f"Error calculating growth KPIs: {e}")
|
| 147 |
+
|
| 148 |
+
return kpis
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def _calculate_efficiency_kpis(data: Dict[str, Any]) -> Dict[str, Any]:
|
| 152 |
+
"""Calculate efficiency-related KPIs"""
|
| 153 |
+
kpis = {}
|
| 154 |
+
|
| 155 |
+
try:
|
| 156 |
+
# Cost per acquisition
|
| 157 |
+
if "marketing_costs" in data and "new_customers" in data:
|
| 158 |
+
kpis["cost_per_acquisition"] = safe_divide(data["marketing_costs"], data["new_customers"])
|
| 159 |
+
|
| 160 |
+
# Operational efficiency
|
| 161 |
+
if "revenue" in data and "operational_costs" in data:
|
| 162 |
+
revenue = data["revenue"] if not isinstance(data["revenue"], list) else sum(data["revenue"])
|
| 163 |
+
kpis["operational_efficiency_ratio"] = safe_divide(revenue, data["operational_costs"])
|
| 164 |
+
|
| 165 |
+
# Employee productivity
|
| 166 |
+
if "revenue" in data and "employees" in data:
|
| 167 |
+
revenue = data["revenue"] if not isinstance(data["revenue"], list) else sum(data["revenue"])
|
| 168 |
+
kpis["revenue_per_employee"] = safe_divide(revenue, data["employees"])
|
| 169 |
+
|
| 170 |
+
# ROI
|
| 171 |
+
if "revenue" in data and "investment" in data:
|
| 172 |
+
revenue = data["revenue"] if not isinstance(data["revenue"], list) else sum(data["revenue"])
|
| 173 |
+
roi = safe_divide((revenue - data["investment"]) * 100, data["investment"])
|
| 174 |
+
kpis["roi_percent"] = roi
|
| 175 |
+
|
| 176 |
+
except Exception as e:
|
| 177 |
+
logger.warning(f"Error calculating efficiency KPIs: {e}")
|
| 178 |
+
|
| 179 |
+
return kpis
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _calculate_customer_kpis(data: Dict[str, Any]) -> Dict[str, Any]:
|
| 183 |
+
"""Calculate customer-related KPIs"""
|
| 184 |
+
kpis = {}
|
| 185 |
+
|
| 186 |
+
try:
|
| 187 |
+
# Customer lifetime value
|
| 188 |
+
if "average_purchase_value" in data and "purchase_frequency" in data and "customer_lifespan" in data:
|
| 189 |
+
clv = data["average_purchase_value"] * data["purchase_frequency"] * data["customer_lifespan"]
|
| 190 |
+
kpis["customer_lifetime_value"] = clv
|
| 191 |
+
|
| 192 |
+
# Churn rate
|
| 193 |
+
if "churned_customers" in data and "total_customers" in data:
|
| 194 |
+
kpis["churn_rate_percent"] = safe_divide(data["churned_customers"] * 100, data["total_customers"])
|
| 195 |
+
|
| 196 |
+
# Retention rate
|
| 197 |
+
if "retained_customers" in data and "total_customers" in data:
|
| 198 |
+
kpis["retention_rate_percent"] = safe_divide(data["retained_customers"] * 100, data["total_customers"])
|
| 199 |
+
|
| 200 |
+
# Net Promoter Score (if provided)
|
| 201 |
+
if "nps_score" in data:
|
| 202 |
+
kpis["net_promoter_score"] = data["nps_score"]
|
| 203 |
+
|
| 204 |
+
except Exception as e:
|
| 205 |
+
logger.warning(f"Error calculating customer KPIs: {e}")
|
| 206 |
+
|
| 207 |
+
return kpis
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def _calculate_operational_kpis(data: Dict[str, Any]) -> Dict[str, Any]:
    """Calculate operational KPIs (inventory turnover, fulfillment rate, response time)."""
    kpis = {}

    # Each KPI is a simple ratio: (name, numerator key, denominator key, numerator factor).
    ratio_specs = [
        ("inventory_turnover", "cost_of_goods_sold", "average_inventory", 1),
        ("fulfillment_rate_percent", "orders_fulfilled", "total_orders", 100),
        ("average_response_time", "total_response_time", "ticket_count", 1),
    ]

    try:
        for kpi_name, num_key, den_key, factor in ratio_specs:
            if num_key in data and den_key in data:
                kpis[kpi_name] = safe_divide(data[num_key] * factor, data[den_key])

    except Exception as e:
        logger.warning(f"Error calculating operational KPIs: {e}")

    return kpis
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _identify_trends(kpis: Dict[str, Any], data: Dict[str, Any]) -> List[str]:
|
| 234 |
+
"""Identify key trends from KPIs"""
|
| 235 |
+
trends = []
|
| 236 |
+
|
| 237 |
+
try:
|
| 238 |
+
# Check growth trends
|
| 239 |
+
if "revenue_growth_rate_percent" in kpis:
|
| 240 |
+
rate = kpis["revenue_growth_rate_percent"]
|
| 241 |
+
if rate > 20:
|
| 242 |
+
trends.append(f"Strong revenue growth of {rate:.1f}%")
|
| 243 |
+
elif rate > 0:
|
| 244 |
+
trends.append(f"Positive revenue growth of {rate:.1f}%")
|
| 245 |
+
else:
|
| 246 |
+
trends.append(f"Revenue decline of {abs(rate):.1f}%")
|
| 247 |
+
|
| 248 |
+
# Check profitability
|
| 249 |
+
if "profit_margin_percent" in kpis:
|
| 250 |
+
margin = kpis["profit_margin_percent"]
|
| 251 |
+
if margin > 20:
|
| 252 |
+
trends.append(f"Healthy profit margin at {margin:.1f}%")
|
| 253 |
+
elif margin > 0:
|
| 254 |
+
trends.append(f"Modest profit margin at {margin:.1f}%")
|
| 255 |
+
else:
|
| 256 |
+
trends.append(f"Operating at a loss with {abs(margin):.1f}% negative margin")
|
| 257 |
+
|
| 258 |
+
# Check efficiency
|
| 259 |
+
if "roi_percent" in kpis:
|
| 260 |
+
roi = kpis["roi_percent"]
|
| 261 |
+
if roi > 100:
|
| 262 |
+
trends.append(f"Excellent ROI of {roi:.1f}%")
|
| 263 |
+
elif roi > 0:
|
| 264 |
+
trends.append(f"Positive ROI of {roi:.1f}%")
|
| 265 |
+
|
| 266 |
+
# Check customer metrics
|
| 267 |
+
if "churn_rate_percent" in kpis:
|
| 268 |
+
churn = kpis["churn_rate_percent"]
|
| 269 |
+
if churn > 10:
|
| 270 |
+
trends.append(f"High customer churn rate of {churn:.1f}%")
|
| 271 |
+
else:
|
| 272 |
+
trends.append(f"Healthy churn rate of {churn:.1f}%")
|
| 273 |
+
|
| 274 |
+
except Exception as e:
|
| 275 |
+
logger.warning(f"Error identifying trends: {e}")
|
| 276 |
+
|
| 277 |
+
return trends if trends else ["Insufficient data for trend analysis"]
|
| 278 |
+
|
| 279 |
+
|
| 280 |
+
def _generate_summary(kpis: Dict[str, Any], trends: List[str]) -> str:
|
| 281 |
+
"""Generate executive summary"""
|
| 282 |
+
summary_parts = []
|
| 283 |
+
|
| 284 |
+
summary_parts.append("Executive KPI Summary:")
|
| 285 |
+
summary_parts.append(f"- Analyzed {len(kpis)} key performance indicators")
|
| 286 |
+
|
| 287 |
+
if trends:
|
| 288 |
+
summary_parts.append("- Key insights:")
|
| 289 |
+
for trend in trends[:3]: # Top 3 trends
|
| 290 |
+
summary_parts.append(f" • {trend}")
|
| 291 |
+
|
| 292 |
+
return "\n".join(summary_parts)
|
tools/pdf_reader.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
PDF Reader Tool - Extract text and metadata from PDF files
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def read_pdf(file_path: str) -> Dict[str, Any]:
    """
    Read and extract text from a PDF file.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary containing extracted text, page count, and metadata

    Raises:
        FileNotFoundError: If the file does not exist.
        ImportError: If PyPDF2 is not installed.
    """
    try:
        from PyPDF2 import PdfReader

        pdf_path = Path(file_path)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {file_path}")

        reader = PdfReader(file_path)

        # Collect per-page text; a failing page is marked rather than dropped.
        pages_text = []
        for page_num, page in enumerate(reader.pages, 1):
            try:
                extracted = page.extract_text()
            except Exception as e:
                logger.warning(f"Failed to extract text from page {page_num}: {e}")
                pages_text.append(f"--- Page {page_num} ---\n[Extraction failed]")
                continue
            if extracted:
                pages_text.append(f"--- Page {page_num} ---\n{extracted}")

        # Map output metadata fields to their PDF info-dictionary keys,
        # defaulting to "Unknown" when a field is absent.
        metadata = {}
        if reader.metadata:
            field_keys = {
                "author": "/Author",
                "creator": "/Creator",
                "producer": "/Producer",
                "subject": "/Subject",
                "title": "/Title",
            }
            metadata = {name: reader.metadata.get(key, "Unknown") for name, key in field_keys.items()}
            metadata["creation_date"] = str(reader.metadata.get("/CreationDate", "Unknown"))

        return {
            "text": "\n\n".join(pages_text),
            "pages": len(reader.pages),
            "metadata": metadata,
        }

    except ImportError:
        logger.error("PyPDF2 not installed. Install with: pip install pypdf2")
        raise
    except Exception as e:
        logger.error(f"Error reading PDF: {e}")
        raise
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def get_pdf_info(file_path: str) -> Dict[str, Any]:
    """
    Get basic information about a PDF without extracting all text.

    Args:
        file_path: Path to the PDF file

    Returns:
        Dictionary with page count, encryption flag, file size, and file name
    """
    try:
        from PyPDF2 import PdfReader

        pdf_path = Path(file_path)
        reader = PdfReader(file_path)

        info = {
            "page_count": len(reader.pages),
            "is_encrypted": reader.is_encrypted,
            "file_size_bytes": pdf_path.stat().st_size,
            "file_name": pdf_path.name,
        }
        return info
    except Exception as e:
        logger.error(f"Error getting PDF info: {e}")
        raise
|
tools/rag_search.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG Search Tool - Semantic search using vector embeddings
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any, List
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path for imports
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
|
| 12 |
+
from utils.rag_utils import semantic_search, create_rag_store
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def search_documents(query: str, documents: List[str], top_k: int = 3) -> Dict[str, Any]:
    """
    Perform semantic search on a collection of documents.

    Args:
        query: Search query string
        documents: List of document strings to search
        top_k: Number of top results to return

    Returns:
        Dictionary containing the query, corpus size, and scored results

    Raises:
        ValueError: If the query is blank or the documents list is empty.
    """
    try:
        if not query or not query.strip():
            raise ValueError("Query cannot be empty")

        # A falsy list already covers both None and the empty list; the
        # original redundantly re-checked len(documents) == 0.
        if not documents:
            raise ValueError("Documents list cannot be empty")

        # Delegate ranking to the shared embedding-based search helper.
        results = semantic_search(query, documents, top_k)

        return {
            "query": query,
            "total_documents": len(documents),
            "returned_results": len(results),
            "results": results,
        }

    except Exception as e:
        logger.error(f"Error performing RAG search: {e}")
        raise
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def build_knowledge_base(documents: List[str]) -> Dict[str, Any]:
    """
    Build a knowledge base from documents for later querying.

    Args:
        documents: List of documents to index

    Returns:
        Dictionary with knowledge base info and the in-memory store
    """
    try:
        if not documents:
            raise ValueError("Documents list cannot be empty")

        knowledge_store = create_rag_store(documents)

        result = {
            "success": True,
            "document_count": len(documents),
            "message": "Knowledge base built successfully",
            # In a real scenario, this store would be persisted.
            "store": knowledge_store,
        }
        return result

    except Exception as e:
        logger.error(f"Error building knowledge base: {e}")
        raise
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
def multi_query_search(queries: List[str], documents: List[str], top_k: int = 3) -> Dict[str, Any]:
    """
    Perform multiple searches with different queries on the same document set.

    Args:
        queries: List of query strings
        documents: List of documents to search
        top_k: Number of results per query

    Returns:
        Dictionary with per-query results keyed "query_1", "query_2", ...
    """
    try:
        if not queries or not documents:
            raise ValueError("Both queries and documents must be provided")

        # Index the corpus once so every query reuses the same store.
        store = create_rag_store(documents)

        per_query = {}
        for number, query in enumerate(queries, start=1):
            entry = {"query": query}
            try:
                entry["results"] = store.search(query, top_k)
            except Exception as e:
                # A failing query is reported in place; remaining queries still run.
                logger.error(f"Error searching query {number}: {e}")
                entry["error"] = str(e)
                entry["results"] = []
            per_query[f"query_{number}"] = entry

        return {
            "total_queries": len(queries),
            "total_documents": len(documents),
            "results": per_query,
        }

    except Exception as e:
        logger.error(f"Error in multi-query search: {e}")
        raise
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def find_similar_documents(target_doc: str, documents: List[str], top_k: int = 5) -> Dict[str, Any]:
    """
    Find documents similar to a target document.

    Args:
        target_doc: The document to find similar ones for
        documents: Corpus of documents to search
        top_k: Number of similar documents to return

    Returns:
        Dictionary with a preview of the target and the most similar documents
    """
    try:
        if not target_doc or not documents:
            raise ValueError("Target document and documents list must be provided")

        # The target document itself serves as the search query.
        matches = semantic_search(target_doc, documents, top_k)

        # Truncate long targets so the response stays readable.
        preview = target_doc if len(target_doc) <= 200 else target_doc[:200] + "..."

        return {
            "target_document": preview,
            "corpus_size": len(documents),
            "similar_documents": matches,
        }

    except Exception as e:
        logger.error(f"Error finding similar documents: {e}")
        raise
|
tools/text_extractor.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text Extractor Tool - Clean, summarize, and process text
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path for imports
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
|
| 12 |
+
from utils.helpers import clean_text, chunk_text, summarize_text, extract_keywords
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def extract_text(text: str, operation: str = "clean", max_length: int = 500) -> Dict[str, Any]:
    """
    Process text based on the specified operation.

    Args:
        text: Raw text to process
        operation: One of 'clean', 'summarize', 'chunk', or 'keywords'
        max_length: Maximum length for summary and chunk operations

    Returns:
        Dictionary containing the processed text, word count, and metadata

    Raises:
        ValueError: On empty input or an unrecognized operation.
    """
    try:
        if not text or not text.strip():
            raise ValueError("Input text is empty")

        if operation == "clean":
            result = clean_text(text)
            metadata = {
                "operation": "clean",
                "original_length": len(text),
                "cleaned_length": len(result),
            }
        elif operation == "summarize":
            result = summarize_text(text, max_length)
            ratio = round(len(result) / len(text), 2) if len(text) > 0 else 0
            metadata = {
                "operation": "summarize",
                "original_length": len(text),
                "summary_length": len(result),
                "compression_ratio": ratio,
            }
        elif operation == "chunk":
            pieces = chunk_text(text, chunk_size=max_length, overlap=50)
            result = "\n\n---CHUNK---\n\n".join(pieces)
            metadata = {
                "operation": "chunk",
                "total_chunks": len(pieces),
                "chunk_size": max_length,
            }
        elif operation == "keywords":
            keywords = extract_keywords(text, top_n=10)
            result = ", ".join(keywords)
            metadata = {
                "operation": "keywords",
                "keyword_count": len(keywords),
                "keywords": keywords,
            }
        else:
            raise ValueError(f"Unknown operation: {operation}. Use 'clean', 'summarize', 'chunk', or 'keywords'")

        return {
            "result": result,
            "word_count": len(result.split()),
            "metadata": metadata,
        }

    except Exception as e:
        logger.error(f"Error extracting text: {e}")
        raise
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def process_multiple_texts(texts: list, operation: str = "clean") -> list:
    """
    Process multiple texts with the same operation.

    Args:
        texts: List of text strings to process
        operation: Operation to apply to all texts

    Returns:
        List of per-text results; a failed text yields an error entry instead.
    """
    outcomes = []
    for position, item in enumerate(texts):
        try:
            processed = extract_text(item, operation)
            processed["index"] = position
        except Exception as e:
            logger.error(f"Error processing text at index {position}: {e}")
            processed = {
                "index": position,
                "error": str(e),
                "result": "",
                "word_count": 0,
            }
        outcomes.append(processed)

    return outcomes
|
tools/web_fetcher.py
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Web Fetcher Tool - Fetch and extract content from web pages
|
| 3 |
+
"""
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Add parent directory to path for imports
|
| 10 |
+
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 11 |
+
|
| 12 |
+
from utils.helpers import validate_url, clean_text, format_timestamp
|
| 13 |
+
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def fetch_web_content(url: str, extract_text_only: bool = True, timeout: int = 30) -> Dict[str, Any]:
    """
    Fetch content from a web URL.

    Args:
        url: URL to fetch
        extract_text_only: If True, parse HTML and return visible text only;
            if False, return the raw response body.
        timeout: Request timeout in seconds

    Returns:
        Dictionary containing fetched content, status code, title, links,
        and response metadata.

    Raises:
        ValueError: If the URL is malformed.
        requests.exceptions.RequestException: On network or HTTP errors.
    """
    try:
        import requests
        from bs4 import BeautifulSoup

        # Validate URL before making a request.
        if not validate_url(url):
            raise ValueError(f"Invalid URL format: {url}")

        # Some sites reject requests without a browser-like User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        content_type = response.headers.get('Content-Type', '')
        is_html = 'text/html' in content_type

        if extract_text_only and is_html:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Page title (if any).
            title = soup.title.string if soup.title else "No title"

            # Collect outgoing links, skipping in-page fragment anchors.
            links = []
            for link in soup.find_all('a', href=True):
                href = link.get('href', '')
                if href and not href.startswith('#'):
                    links.append(href)

            # Drop non-content elements before extracting text.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()

            # Collapse the remaining text into non-empty lines.
            text = soup.get_text()
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            content = '\n'.join(chunk for chunk in chunks if chunk)

            # Normalize whitespace and strip stray symbols.
            content = clean_text(content)
        else:
            # Raw mode: return the body untouched. Fix: still surface the real
            # title when the payload is HTML — the original labeled every raw
            # response "N/A (non-HTML content)", which was wrong for HTML pages.
            content = response.text
            links = []
            if is_html:
                soup = BeautifulSoup(response.text, 'html.parser')
                title = soup.title.string if soup.title else "No title"
            else:
                title = "N/A (non-HTML content)"

        metadata = {
            "url": url,
            "status_code": response.status_code,
            "content_type": content_type,
            "content_length": len(content),
            "encoding": response.encoding,
            "timestamp": format_timestamp(),
            "headers": dict(response.headers)
        }

        return {
            "content": content,
            "status_code": response.status_code,
            "title": title,
            "links": links,
            "metadata": metadata
        }

    except requests.exceptions.RequestException as e:
        logger.error(f"Request error fetching {url}: {e}")
        raise
    except Exception as e:
        logger.error(f"Error fetching web content: {e}")
        raise
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def fetch_multiple_urls(urls: list, extract_text_only: bool = True) -> list:
    """
    Fetch content from multiple URLs.

    Args:
        urls: List of URLs to fetch
        extract_text_only: Whether to extract text only

    Returns:
        List of per-URL results; failed fetches produce an error entry.
    """
    outcomes = []
    for position, target in enumerate(urls):
        try:
            fetched = fetch_web_content(target, extract_text_only)
            fetched["index"] = position
            fetched["success"] = True
        except Exception as e:
            logger.error(f"Error fetching URL at index {position} ({target}): {e}")
            fetched = {
                "index": position,
                "url": target,
                "success": False,
                "error": str(e),
                "content": "",
                "status_code": 0,
            }
        outcomes.append(fetched)

    return outcomes
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def extract_links(url: str) -> Dict[str, Any]:
    """
    Extract all links from a web page.

    Args:
        url: URL to extract links from

    Returns:
        Dictionary with the page URL, link count, and a list of
        {"text", "href"} entries with hrefs resolved to absolute URLs.

    Raises:
        requests.exceptions.RequestException: On network or HTTP errors.
    """
    try:
        import requests
        from bs4 import BeautifulSoup
        from urllib.parse import urljoin

        # Fix: use the same browser-like User-Agent as fetch_web_content so
        # sites that reject bot requests behave consistently across tools.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for link in soup.find_all('a', href=True):
            # Resolve relative hrefs against the page URL.
            absolute_url = urljoin(url, link['href'])
            links.append({
                "text": link.get_text(strip=True),
                "href": absolute_url
            })

        return {
            "url": url,
            "total_links": len(links),
            "links": links
        }

    except Exception as e:
        logger.error(f"Error extracting links: {e}")
        raise
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
MissionControlMCP Utilities Package
|
| 3 |
+
"""
|
utils/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (239 Bytes). View file
|
|
|
utils/__pycache__/helpers.cpython-312.pyc
ADDED
|
Binary file (6.35 kB). View file
|
|
|
utils/__pycache__/rag_utils.cpython-312.pyc
ADDED
|
Binary file (6.22 kB). View file
|
|
|
utils/helpers.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Helper utility functions
|
| 3 |
+
"""
|
| 4 |
+
import re
|
| 5 |
+
import logging
|
| 6 |
+
from typing import List, Dict, Any
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
|
| 9 |
+
# Setup logging
|
| 10 |
+
logging.basicConfig(level=logging.INFO)
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def clean_text(text: str) -> str:
    """
    Clean and normalize text: collapse whitespace runs, drop unusual
    symbols, and trim the ends.

    Args:
        text: Raw text to clean

    Returns:
        Cleaned text string
    """
    # Collapse any whitespace run (tabs, newlines, ...) into one space.
    collapsed = re.sub(r'\s+', ' ', text)
    # Keep word characters, whitespace, and common punctuation only.
    filtered = re.sub(r'[^\w\s.,!?;:\-\'\"()]', '', collapsed)
    return filtered.strip()
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """
    Split text into overlapping chunks for processing.

    Args:
        text: Text to chunk
        chunk_size: Size of each chunk in characters (must be positive)
        overlap: Overlap between consecutive chunks; values outside
            [0, chunk_size - 1] are clamped so the window always advances

    Returns:
        List of text chunks

    Raises:
        ValueError: If chunk_size is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    # Fix: an overlap >= chunk_size made the window advance by <= 0
    # characters, so the original loop never terminated. Clamp it.
    overlap = max(0, min(overlap, chunk_size - 1))

    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunks.append(text[start:end])
        # Step forward by chunk_size - overlap (guaranteed > 0 above).
        start = end - overlap

    return chunks
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def summarize_text(text: str, max_length: int = 500) -> str:
    """
    Create a simple extractive summary by taking leading sentences until
    the length budget is exhausted.

    Args:
        text: Text to summarize
        max_length: Maximum length of summary

    Returns:
        Summarized text
    """
    pieces = []
    used = 0

    for raw_sentence in re.split(r'[.!?]+', text):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        # +2 accounts for the ". " appended after each sentence.
        if used + len(sentence) + 2 > max_length:
            break
        pieces.append(sentence + ". ")
        used += len(sentence) + 2

    summary = "".join(pieces)

    # Fall back to a word-boundary truncation when not even one sentence fits.
    if not summary and text:
        summary = text[:max_length].rsplit(' ', 1)[0] + "..."

    return summary.strip()
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def extract_keywords(text: str, top_n: int = 10) -> List[str]:
    """
    Extract top keywords from text using simple frequency analysis.

    Args:
        text: Text to analyze
        top_n: Number of top keywords to return

    Returns:
        List of keywords ordered by descending frequency
    """
    # Consider lowercase alphabetic tokens of 4+ letters only.
    tokens = re.findall(r'\b[a-zA-Z]{4,}\b', text.lower())

    # Filter a small set of high-frequency filler words.
    stop_words = {'that', 'this', 'with', 'from', 'have', 'been', 'were',
                  'will', 'would', 'could', 'should', 'about', 'their', 'there'}

    counts: Dict[str, int] = {}
    for token in tokens:
        if token in stop_words:
            continue
        counts[token] = counts.get(token, 0) + 1

    # Stable sort keeps first-seen order among equal frequencies,
    # matching the original items()-based ranking.
    ranked = sorted(counts, key=counts.get, reverse=True)
    return ranked[:top_n]
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def validate_url(url: str) -> bool:
    """
    Validate if a string is a proper HTTP(S) URL.

    Args:
        url: URL string to validate

    Returns:
        True if valid URL, False otherwise
    """
    pattern = (
        r'^https?://'                                                    # scheme
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'  # domain
        r'localhost|'                                                    # localhost
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'                           # IPv4
        r'(?::\d+)?'                                                     # optional port
        r'(?:/?|[/?]\S+)$'                                               # path / query
    )
    return re.match(pattern, url, re.IGNORECASE) is not None
|
| 135 |
+
|
| 136 |
+
|
| 137 |
+
def format_timestamp() -> str:
    """
    Get the current timestamp in ISO format.

    Returns:
        ISO formatted timestamp string (naive local time)
    """
    now = datetime.now()
    return now.isoformat()
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def safe_divide(numerator: float, denominator: float, default: float = 0.0) -> float:
    """
    Divide two numbers, falling back to a default on error.

    Args:
        numerator: Numerator value
        denominator: Denominator value
        default: Value returned when the denominator is zero or the
            operands cannot be divided

    Returns:
        Division result or default
    """
    try:
        if denominator == 0:
            return default
        return numerator / denominator
    except (TypeError, ZeroDivisionError):
        # Non-numeric operands (or exotic zero-like values) fall back
        # to the caller-supplied default instead of raising.
        return default
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def parse_json_safe(json_str: str) -> Dict[str, Any]:
    """
    Safely parse a JSON string, returning an empty dict on failure.

    Args:
        json_str: JSON string to parse

    Returns:
        Parsed dictionary or empty dict on error
    """
    import json

    try:
        parsed = json.loads(json_str)
    except json.JSONDecodeError as e:
        logger.error(f"JSON parse error: {e}")
        return {}
    return parsed
|
utils/rag_utils.py
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RAG (Retrieval Augmented Generation) utilities using FAISS and embeddings
|
| 3 |
+
"""
|
| 4 |
+
import numpy as np
|
| 5 |
+
from typing import List, Dict, Any
|
| 6 |
+
import logging
|
| 7 |
+
|
| 8 |
+
logger = logging.getLogger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SimpleRAGStore:
    """
    Simple RAG implementation using FAISS for vector similarity search.

    Documents are embedded with a sentence-transformers model and stored
    in a flat L2 FAISS index; search returns the nearest documents with
    a similarity score derived from the L2 distance.
    """

    def __init__(self):
        """Initialize the RAG store"""
        self.documents: List[str] = []          # raw texts, index-aligned with FAISS ids
        self.embeddings: List[np.ndarray] = []  # one embedding per document
        self.index = None                       # faiss.IndexFlatL2, built lazily on first add
        self._model = None                      # SentenceTransformer, loaded lazily

    def _get_model(self):
        """Lazy load the sentence transformer model"""
        if self._model is None:
            try:
                from sentence_transformers import SentenceTransformer
                self._model = SentenceTransformer('all-MiniLM-L6-v2')
                logger.info("Loaded sentence transformer model")
            except Exception as e:
                logger.error(f"Failed to load sentence transformer: {e}")
                raise
        return self._model

    def add_documents(self, documents: List[str]) -> None:
        """
        Add documents to the RAG store and update the FAISS index.

        Args:
            documents: List of document strings to add
        """
        import faiss

        if not documents:
            logger.warning("No documents provided to add")
            return

        self.documents.extend(documents)

        # Generate embeddings for the new documents only
        model = self._get_model()
        new_embeddings = model.encode(documents, show_progress_bar=False)
        self.embeddings.extend(new_embeddings)

        new_array = np.array(new_embeddings).astype('float32')

        if self.index is None:
            self.index = faiss.IndexFlatL2(new_array.shape[1])

        # BUGFIX: only add the NEW embeddings to the index. The previous
        # implementation re-added the entire accumulated embedding list on
        # every call, duplicating all earlier documents in the index after
        # the first add_documents() call.
        self.index.add(new_array)
        logger.info(f"Added {len(documents)} documents to RAG store")

    def search(self, query: str, top_k: int = 3) -> List[Dict[str, Any]]:
        """
        Search for similar documents using the query.

        Args:
            query: Search query string
            top_k: Number of top results to return

        Returns:
            List of result dicts (rank, document, score, distance),
            best match first
        """
        if self.index is None or len(self.documents) == 0:
            logger.warning("No documents in RAG store")
            return []

        # Encode query
        model = self._get_model()
        query_embedding = model.encode([query], show_progress_bar=False)
        query_embedding = np.array(query_embedding).astype('float32')

        # Search FAISS index
        top_k = min(top_k, len(self.documents))
        distances, indices = self.index.search(query_embedding, top_k)

        # Format results; lower bound guards against FAISS's -1 sentinel
        # for "no result", which would otherwise index documents[-1].
        results = []
        for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
            if 0 <= idx < len(self.documents):
                # Convert L2 distance to similarity score (inverse relationship)
                similarity_score = 1.0 / (1.0 + float(distance))
                results.append({
                    "rank": i + 1,
                    "document": self.documents[idx],
                    "score": round(similarity_score, 4),
                    "distance": float(distance)
                })

        return results

    def clear(self) -> None:
        """Clear all documents and reset the index"""
        self.documents = []
        self.embeddings = []
        self.index = None
        logger.info("Cleared RAG store")
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
def create_rag_store(documents: List[str]) -> SimpleRAGStore:
    """
    Build a SimpleRAGStore pre-populated with the given documents.

    Args:
        documents: List of documents to add to store

    Returns:
        Initialized SimpleRAGStore instance
    """
    rag_store = SimpleRAGStore()
    # Skip indexing entirely when there is nothing to add, so an empty
    # input produces a store with no FAISS index (and no warning log).
    if documents:
        rag_store.add_documents(documents)
    return rag_store
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def semantic_search(query: str, documents: List[str], top_k: int = 3) -> List[Dict[str, Any]]:
    """
    One-shot semantic search over a list of documents.

    Builds a temporary RAG store, indexes the documents, and returns
    the top matches for the query.

    Args:
        query: Search query
        documents: List of documents to search
        top_k: Number of results to return

    Returns:
        List of search results
    """
    temp_store = create_rag_store(documents)
    return temp_store.search(query, top_k=top_k)
|