Commit
·
2c5e855
1
Parent(s):
88d2f36
Initial deployment of PDF Analysis & Orchestrator with enhanced features
Browse files- LICENSE +21 -0
- README.md +164 -6
- agents.py +313 -0
- app.py +386 -0
- config.py +52 -0
- create_test_pdf.py +120 -0
- packages.txt +8 -0
- requirements.txt +8 -0
- test_deployment.py +228 -0
- utils/__init__.py +184 -0
- utils/export.py +162 -0
- utils/prompts.py +136 -0
- utils/session.py +15 -0
- utils/validation.py +37 -0
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2024 PDF Analysis & Orchestrator
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,12 +1,170 @@
|
|
| 1 |
---
|
| 2 |
-
title: PDF
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
|
|
|
|
|
|
| 10 |
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: PDF Analysis & Orchestrator
|
| 3 |
+
emoji: 📄
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.44.0
|
| 8 |
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
short_description: Intelligent PDF analysis with AI-powered agents, chunking, caching, and batch processing
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# 📄 PDF Analysis & Orchestrator
|
| 15 |
+
|
| 16 |
+
A powerful, intelligent PDF analysis tool that provides comprehensive document processing through AI-powered agents. This application offers advanced features including document chunking, caching, streaming responses, batch processing, and custom prompt management.
|
| 17 |
+
|
| 18 |
+
## 🚀 Features
|
| 19 |
+
|
| 20 |
+
### Core Analysis
|
| 21 |
+
- **AI-Powered Analysis**: GPT-4 powered document analysis with context-aware responses
|
| 22 |
+
- **Audience Adaptation**: Automatically adapts explanations for different audiences
|
| 23 |
+
- **Document Segmentation**: Identifies and segments documents by themes and topics
|
| 24 |
+
- **Multi-Agent Orchestration**: Specialized AI agents for different analysis aspects
|
| 25 |
+
|
| 26 |
+
### Performance Optimizations
|
| 27 |
+
- **Document Chunking**: Smart processing of large documents (>15k chars) with sentence boundary detection
|
| 28 |
+
- **Caching System**: PDF text extraction caching for improved performance
|
| 29 |
+
- **Streaming Responses**: Real-time progress updates and status indicators
|
| 30 |
+
- **Configurable Parameters**: Adjustable chunk sizes and processing options
|
| 31 |
+
|
| 32 |
+
### Enhanced Features
|
| 33 |
+
- **Batch Processing**: Handle multiple PDFs simultaneously with comprehensive reporting
|
| 34 |
+
- **Result Export**: Export analysis results in TXT, JSON, and PDF formats
|
| 35 |
+
- **Custom Prompts**: Save, manage, and reuse custom analysis prompts
|
| 36 |
+
- **Progress Indicators**: Real-time feedback during long-running analyses
|
| 37 |
+
- **Session Management**: Per-user session isolation with persistent storage
|
| 38 |
+
|
| 39 |
+
## 🎯 Use Cases
|
| 40 |
+
|
| 41 |
+
- **Document Summarization**: Create concise summaries of complex documents
|
| 42 |
+
- **Technical Explanation**: Explain technical content for general audiences
|
| 43 |
+
- **Executive Summaries**: Generate high-level overviews for decision makers
|
| 44 |
+
- **Content Analysis**: Extract key findings and insights from documents
|
| 45 |
+
- **Batch Processing**: Analyze multiple documents with consistent instructions
|
| 46 |
+
- **Research Assistance**: Process and analyze research papers and reports
|
| 47 |
+
|
| 48 |
+
## 🛠️ Setup
|
| 49 |
+
|
| 50 |
+
### Prerequisites
|
| 51 |
+
- Python 3.10+
|
| 52 |
+
- OpenAI API key
|
| 53 |
+
|
| 54 |
+
### Installation
|
| 55 |
+
|
| 56 |
+
1. **Clone the repository:**
|
| 57 |
+
```bash
|
| 58 |
+
git clone https://huggingface.co/spaces/your-username/pdf-analysis-orchestrator
|
| 59 |
+
cd pdf-analysis-orchestrator
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
2. **Install dependencies:**
|
| 63 |
+
```bash
|
| 64 |
+
pip install -r requirements.txt
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
3. **Set up environment variables:**
|
| 68 |
+
```bash
|
| 69 |
+
export OPENAI_API_KEY="sk-your-api-key-here"
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
4. **Run the application:**
|
| 73 |
+
```bash
|
| 74 |
+
python app.py
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## 📖 Usage
|
| 78 |
+
|
| 79 |
+
### Single Document Analysis
|
| 80 |
+
1. Upload a PDF document
|
| 81 |
+
2. Enter your analysis instructions
|
| 82 |
+
3. Choose analysis options (streaming, chunk size)
|
| 83 |
+
4. Click "Analyze & Orchestrate"
|
| 84 |
+
5. View results and export if needed
|
| 85 |
+
|
| 86 |
+
### Batch Processing
|
| 87 |
+
1. Upload multiple PDF files
|
| 88 |
+
2. Enter batch analysis instructions
|
| 89 |
+
3. Click "Process Batch"
|
| 90 |
+
4. Review comprehensive batch results
|
| 91 |
+
|
| 92 |
+
### Custom Prompts
|
| 93 |
+
1. Go to "Manage Prompts" tab
|
| 94 |
+
2. Create custom prompt templates
|
| 95 |
+
3. Organize by categories
|
| 96 |
+
4. Reuse prompts across analyses
|
| 97 |
+
|
| 98 |
+
## 🏗️ Architecture
|
| 99 |
+
|
| 100 |
+
### Core Components
|
| 101 |
+
- **AnalysisAgent**: Primary analysis engine using GPT-4
|
| 102 |
+
- **CollaborationAgent**: Provides reviewer-style feedback
|
| 103 |
+
- **ConversationAgent**: Handles user interaction
|
| 104 |
+
- **MasterOrchestrator**: Coordinates agent interactions
|
| 105 |
+
|
| 106 |
+
### Key Files
|
| 107 |
+
- `app.py`: Main application with Gradio interface
|
| 108 |
+
- `agents.py`: AI agent implementations with streaming support
|
| 109 |
+
- `config.py`: Centralized configuration management
|
| 110 |
+
- `utils/`: Utility functions for PDF processing, caching, and export
|
| 111 |
+
|
| 112 |
+
## 🔧 Configuration
|
| 113 |
+
|
| 114 |
+
### Environment Variables
|
| 115 |
+
- `OPENAI_API_KEY`: Required OpenAI API key
|
| 116 |
+
- `OPENAI_MODEL`: Model to use (default: gpt-4)
|
| 117 |
+
- `CHUNK_SIZE`: Document chunk size (default: 15000)
|
| 118 |
+
- `CACHE_ENABLED`: Enable caching (default: true)
|
| 119 |
+
- `ANALYSIS_MAX_UPLOAD_MB`: Max upload size in MB (default: 50)
|
| 120 |
+
|
| 121 |
+
### Model Configuration
|
| 122 |
+
- **Temperature**: 0.2 (consistent, focused responses)
|
| 123 |
+
- **Max tokens**: 1000 (concise but comprehensive)
|
| 124 |
+
- **System prompts**: Designed for high-quality output
|
| 125 |
+
|
| 126 |
+
## 📊 Performance
|
| 127 |
+
|
| 128 |
+
- **Response Time**: Typically 2-5 seconds for analysis
|
| 129 |
+
- **File Size Limit**: 50MB (configurable)
|
| 130 |
+
- **Concurrent Users**: Supports multiple simultaneous sessions
|
| 131 |
+
- **Memory Usage**: Optimized for efficient processing
|
| 132 |
+
- **Caching**: Reduces processing time for repeated documents
|
| 133 |
+
|
| 134 |
+
## 🔒 Security
|
| 135 |
+
|
| 136 |
+
- File size validation
|
| 137 |
+
- Session isolation
|
| 138 |
+
- Secure file handling
|
| 139 |
+
- No persistent storage of sensitive data
|
| 140 |
+
- Environment-based configuration
|
| 141 |
+
|
| 142 |
+
## 🤝 Contributing
|
| 143 |
+
|
| 144 |
+
1. Fork the repository
|
| 145 |
+
2. Create a feature branch
|
| 146 |
+
3. Make your changes
|
| 147 |
+
4. Add tests if applicable
|
| 148 |
+
5. Submit a pull request
|
| 149 |
+
|
| 150 |
+
## 📝 License
|
| 151 |
+
|
| 152 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
| 153 |
+
|
| 154 |
+
## 🙏 Acknowledgments
|
| 155 |
+
|
| 156 |
+
- Built on the successful Analysis & Orchestrate feature from Sharmaji ka PDF Blaster V1
|
| 157 |
+
- Powered by OpenAI's GPT-4 model
|
| 158 |
+
- UI framework: Gradio
|
| 159 |
+
- PDF processing: pdfplumber
|
| 160 |
+
|
| 161 |
+
## 📞 Support
|
| 162 |
+
|
| 163 |
+
For issues and questions:
|
| 164 |
+
1. Check the documentation
|
| 165 |
+
2. Review existing issues
|
| 166 |
+
3. Create a new issue with detailed information
|
| 167 |
+
|
| 168 |
+
---
|
| 169 |
+
|
| 170 |
+
**Note**: This project focuses exclusively on the Analysis & Orchestrate functionality, providing the same high-quality results in a streamlined, focused package with enhanced performance and user experience.
|
agents.py
ADDED
|
@@ -0,0 +1,313 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# agents.py - Core Analysis & Orchestration Agents
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import logging
|
| 5 |
+
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata
|
| 9 |
+
from config import Config
|
| 10 |
+
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
logger.setLevel(logging.INFO)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class BaseAgent:
    """Common base for all agents: holds identity, model name, and a task counter."""

    def __init__(self, name: str, model: str, tasks_completed: int = 0):
        self.name = name
        self.model = model
        self.tasks_completed = tasks_completed

    async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        """Process a prompt. Concrete agents must override this."""
        raise NotImplementedError(f"{self.__class__.__name__}.handle must be implemented.")

    async def handle_streaming(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None) -> AsyncGenerator[str, None]:
        """Streaming version of handle - override in subclasses for streaming support"""
        outcome = await self.handle(user_id, prompt, file_path, context)
        # Fallback behavior: emit every result entry as a single "key: value" chunk.
        for key, value in outcome.items():
            yield f"{key}: {value}"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# --------------------
|
| 33 |
+
# Core Analysis Agent
|
| 34 |
+
# --------------------
|
| 35 |
+
class AnalysisAgent(BaseAgent):
    """Primary analysis engine.

    Sends the user prompt (plus optional PDF text) to the chat model.
    Documents longer than Config.CHUNK_SIZE characters are analyzed
    chunk-by-chunk and then merged into one final summary.
    """

    async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
        """Analyze `prompt`, optionally grounded in the PDF at `file_path`.

        Returns a dict with the model's "analysis" text plus a "metadata"
        entry (timing, document metadata, agent bookkeeping). Model errors
        are caught and reported inside the "analysis" field, never raised.
        """
        start_time = time.time()

        if file_path:
            # Get document metadata
            metadata = get_document_metadata(file_path)

            # Load text with caching
            text = load_pdf_text_cached(file_path)

            # Check if document needs chunking
            if len(text) > Config.CHUNK_SIZE:
                # Large documents take a separate chunked path and return early.
                return await self._handle_large_document(prompt, text, metadata)
            else:
                content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
        else:
            # Prompt-only request: no document, so no document metadata.
            content = f"User prompt: {prompt}"
            metadata = {}

        system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."

        try:
            response = await call_openai_chat(
                model=self.model,
                messages=[{"role": "system", "content": system},
                          {"role": "user", "content": content}],
                temperature=Config.OPENAI_TEMPERATURE,
                max_tokens=Config.OPENAI_MAX_TOKENS
            )
        except Exception as e:
            # Degrade gracefully: surface the error text as the result.
            logger.exception("AnalysisAgent failed")
            response = f"Error during analysis: {str(e)}"

        self.tasks_completed += 1

        # Add processing metadata
        processing_time = time.time() - start_time
        result = {
            "analysis": response,
            "metadata": {
                "processing_time": round(processing_time, 2),
                "document_metadata": metadata,
                "agent": self.name,
                "tasks_completed": self.tasks_completed
            }
        }

        return result

    async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Handle large documents by processing in chunks.

        Each chunk is analyzed independently (a failed chunk records its
        error instead of aborting the document), then a second model call
        merges the per-chunk analyses into one summary.
        """
        from utils import chunk_text
        chunks = chunk_text(text, Config.CHUNK_SIZE)
        chunk_results = []

        system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."

        for i, chunk in enumerate(chunks):
            content = f"User prompt: {prompt}\n\nDocument chunk {i+1}/{len(chunks)}:\n{chunk}"

            try:
                response = await call_openai_chat(
                    model=self.model,
                    messages=[{"role": "system", "content": system},
                              {"role": "user", "content": content}],
                    temperature=Config.OPENAI_TEMPERATURE,
                    max_tokens=Config.OPENAI_MAX_TOKENS
                )
                chunk_results.append(f"--- Chunk {i+1} Analysis ---\n{response}")
            except Exception as e:
                # Record the failure inline so the combined report stays complete.
                logger.exception(f"AnalysisAgent failed on chunk {i+1}")
                chunk_results.append(f"--- Chunk {i+1} Error ---\nError: {str(e)}")

        # Combine chunk results
        combined_analysis = "\n\n".join(chunk_results)

        # Create final summary
        summary_prompt = f"Please provide a comprehensive summary that combines insights from all chunks of this large document. Original prompt: {prompt}\n\nChunk analyses:\n{combined_analysis}"

        try:
            final_summary = await call_openai_chat(
                model=self.model,
                messages=[{"role": "system", "content": "You are AnalysisAgent: create comprehensive summaries from multiple document chunks."},
                          {"role": "user", "content": summary_prompt}],
                temperature=Config.OPENAI_TEMPERATURE,
                max_tokens=Config.OPENAI_MAX_TOKENS
            )
        except Exception as e:
            # If merging fails, fall back to the raw per-chunk output.
            logger.exception("AnalysisAgent failed on final summary")
            final_summary = f"Error creating final summary: {str(e)}\n\nChunk Results:\n{combined_analysis}"

        return {
            "analysis": final_summary,
            "metadata": {
                "processing_method": "chunked",
                "chunks_processed": len(chunks),
                "document_metadata": metadata,
                "agent": self.name,
                "tasks_completed": self.tasks_completed
            }
        }

    async def handle_streaming(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None) -> AsyncGenerator[str, None]:
        """Streaming version of analysis.

        Yields progress/status strings, then runs the real analysis via
        handle() and yields its result as the final chunk.
        NOTE: the per-chunk progress messages here are simulated with
        sleeps; the actual model calls happen inside handle() afterwards.
        """
        yield "🔍 Starting analysis..."

        if file_path:
            metadata = get_document_metadata(file_path)
            yield f"📄 Document loaded: {metadata.get('page_count', 0)} pages, {metadata.get('file_size', 0) / 1024:.1f} KB"

            text = load_pdf_text_cached(file_path)

            if len(text) > Config.CHUNK_SIZE:
                yield "📚 Large document detected, processing in chunks..."
                from utils import chunk_text
                chunks = chunk_text(text, Config.CHUNK_SIZE)
                yield f"📊 Document split into {len(chunks)} chunks"

                # Process chunks with progress updates
                for i, chunk in enumerate(chunks):
                    yield f"⏳ Processing chunk {i+1}/{len(chunks)}..."
                    # Process chunk (simplified for streaming)
                    await asyncio.sleep(0.1)  # Simulate processing time

                yield "🔄 Combining chunk results..."
                await asyncio.sleep(0.2)
                yield "✅ Analysis complete!"
            else:
                yield "⚡ Processing document..."
                await asyncio.sleep(0.3)
                yield "✅ Analysis complete!"
        else:
            yield "⚡ Processing request..."
            await asyncio.sleep(0.2)
            yield "✅ Analysis complete!"

        # Get the actual result
        result = await self.handle(user_id, prompt, file_path, context)
        yield f"\n📋 Analysis Result:\n{result.get('analysis', 'No result')}"
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# --------------------
|
| 178 |
+
# Collaboration Agent
|
| 179 |
+
# --------------------
|
| 180 |
+
class CollaborationAgent(BaseAgent):
    """Produces reviewer-style comments on a piece of analysis text."""

    async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
        system = "You are CollaborationAgent: produce reviewer-style comments and suggestions for improvement. Focus on constructive feedback and actionable recommendations."
        # Coerce non-string payloads (e.g. dicts handed over by the orchestrator).
        content = prompt if isinstance(prompt, str) else str(prompt)
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": content},
        ]
        try:
            response = await call_openai_chat(
                model=self.model,
                messages=messages,
                temperature=0.2,
                max_tokens=800,
            )
        except Exception as e:
            # Report the failure as the result instead of raising.
            logger.exception("CollaborationAgent failed")
            response = f"Error during collaboration: {str(e)}"
        self.tasks_completed += 1
        return {"collaboration": response}
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
# --------------------
|
| 198 |
+
# Conversation Agent
|
| 199 |
+
# --------------------
|
| 200 |
+
class ConversationAgent(BaseAgent):
    """Provides polite, context-aware conversational guidance to the user."""

    async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
        system = "You are ConversationAgent: respond politely and helpfully. Provide context-aware responses and guide users on how to get the best results from the analysis system."
        messages = [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ]
        try:
            response = await call_openai_chat(
                model=self.model,
                messages=messages,
                temperature=0.3,
                max_tokens=400,
            )
        except Exception as e:
            # Report the failure as the result instead of raising.
            logger.exception("ConversationAgent failed")
            response = f"Error in conversation: {str(e)}"
        self.tasks_completed += 1
        return {"conversation": response}
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
# --------------------
|
| 217 |
+
# Master Orchestrator - Focused on Analysis
|
| 218 |
+
# --------------------
|
| 219 |
+
class MasterOrchestrator:
    """Coordinates the agent roster for single, streaming, and batch requests."""

    def __init__(self, agents: Dict[str, "BaseAgent"]):
        self.agents = agents
        # Strong references to fire-and-forget collaboration tasks. The event
        # loop only keeps weak references to tasks, so without this set a
        # pending task could be garbage-collected before it runs.
        self._background_tasks: set = set()

    def _spawn_background(self, coro) -> None:
        """Schedule a fire-and-forget coroutine, holding a reference until done."""
        task = asyncio.create_task(coro)
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)

    async def handle_user_prompt(self, user_id: str, prompt: str, file_path: Optional[str] = None, targets: Optional[List[str]] = None) -> Dict[str, Any]:
        """Run the conversation agent plus any targeted agents.

        Returns the merged result dicts. The conversation step is best-effort:
        its failure is logged and the remaining agents still run.
        """
        results: Dict[str, Any] = {}
        targets = targets or []

        # Always start with conversation agent for context
        if "conversation" in self.agents:
            try:
                conv_res = await self.agents["conversation"].handle(user_id, prompt, file_path)
                results.update(conv_res)
            except Exception:
                # Conversation context is optional — log instead of silently dropping.
                logger.exception("ConversationAgent failed; continuing without context")

        # Core analysis functionality
        if "analysis" in targets and "analysis" in self.agents:
            analysis_res = await self.agents["analysis"].handle(user_id, prompt, file_path)
            results.update(analysis_res)
            payload = analysis_res.get("analysis", "")

            # Trigger collaboration agent asynchronously for additional insights
            if "collab" in self.agents:
                self._spawn_background(self.agents["collab"].handle(user_id, payload, file_path))

        return results

    async def handle_user_prompt_streaming(self, user_id: str, prompt: str, file_path: Optional[str] = None, targets: Optional[List[str]] = None) -> AsyncGenerator[str, None]:
        """Streaming version of handle_user_prompt."""
        targets = targets or []

        # Stream analysis if requested
        if "analysis" in targets and "analysis" in self.agents:
            async for chunk in self.agents["analysis"].handle_streaming(user_id, prompt, file_path):
                yield chunk
        else:
            # Fallback to regular handling, emitted as a single chunk.
            result = await self.handle_user_prompt(user_id, prompt, file_path, targets)
            yield str(result)

    async def handle_batch_analysis(self, user_id: str, prompt: str, file_paths: List[str], targets: Optional[List[str]] = None) -> Dict[str, Any]:
        """Handle batch analysis of multiple PDFs.

        Each file is processed independently; a per-file failure is recorded
        in its result entry instead of aborting the batch. When at least one
        file succeeds, a combined summary is generated from those analyses.
        """
        results: Dict[str, Any] = {
            "batch_results": [],
            "summary": {},
            "total_files": len(file_paths),
            "successful": 0,
            "failed": 0
        }

        targets = targets or ["analysis"]

        for i, file_path in enumerate(file_paths):
            try:
                file_result = await self.handle_user_prompt(user_id, prompt, file_path, targets)
                file_result["file_index"] = i
                file_result["file_path"] = file_path
                results["batch_results"].append(file_result)
                results["successful"] += 1
            except Exception as e:
                error_result = {
                    "file_index": i,
                    "file_path": file_path,
                    "error": str(e),
                    "analysis": f"Error processing file: {str(e)}"
                }
                results["batch_results"].append(error_result)
                results["failed"] += 1

        # Create batch summary
        if results["successful"] > 0:
            # .get guards against successful results that carry no "analysis" key
            # (e.g. when targets excluded the analysis agent).
            successful_analyses = [r.get("analysis", "") for r in results["batch_results"] if "error" not in r]
            summary_prompt = f"Please provide a comprehensive summary of the following batch analysis results. Original prompt: {prompt}\n\nAnalyses:\n" + "\n\n---\n\n".join(successful_analyses)

            try:
                summary_response = await call_openai_chat(
                    model=Config.OPENAI_MODEL,
                    messages=[{"role": "system", "content": "You are AnalysisAgent: create comprehensive batch summaries from multiple document analyses."},
                              {"role": "user", "content": summary_prompt}],
                    temperature=Config.OPENAI_TEMPERATURE,
                    max_tokens=Config.OPENAI_MAX_TOKENS
                )
                results["summary"]["batch_analysis"] = summary_response
            except Exception as e:
                results["summary"]["batch_analysis"] = f"Error creating batch summary: {str(e)}"

        results["summary"]["processing_stats"] = {
            "total_files": len(file_paths),
            "successful": results["successful"],
            "failed": results["failed"],
            "success_rate": f"{(results['successful'] / len(file_paths)) * 100:.1f}%" if file_paths else "0%"
        }

        return results
|
app.py
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# PDF Analysis & Orchestrator
|
| 2 |
+
# Extracted core functionality from Sharmaji ka PDF Blaster V1
|
| 3 |
+
import os
|
| 4 |
+
import asyncio
|
| 5 |
+
import uuid
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Optional, List, Tuple
|
| 8 |
+
import time
|
| 9 |
+
|
| 10 |
+
import gradio as gr
|
| 11 |
+
from agents import (
|
| 12 |
+
AnalysisAgent,
|
| 13 |
+
CollaborationAgent,
|
| 14 |
+
ConversationAgent,
|
| 15 |
+
MasterOrchestrator,
|
| 16 |
+
)
|
| 17 |
+
from utils import load_pdf_text
|
| 18 |
+
from utils.session import make_user_session
|
| 19 |
+
from utils.validation import validate_file_size
|
| 20 |
+
from utils.prompts import PromptManager
|
| 21 |
+
from utils.export import ExportManager
|
| 22 |
+
from config import Config
|
| 23 |
+
|
| 24 |
+
# ------------------------
# Initialize Components
# ------------------------
# Make sure required directories (sessions, cache, exports) exist before use.
Config.ensure_directories()

# Agent Roster - Focused on Analysis & Orchestration
# All agents share the configured model; tasks_completed counters start at zero.
AGENTS = {
    "analysis": AnalysisAgent(name="AnalysisAgent", model=Config.OPENAI_MODEL, tasks_completed=0),
    "collab": CollaborationAgent(name="CollaborationAgent", model=Config.OPENAI_MODEL, tasks_completed=0),
    "conversation": ConversationAgent(name="ConversationAgent", model=Config.OPENAI_MODEL, tasks_completed=0),
}
# Single orchestrator instance that routes prompts to the roster above.
ORCHESTRATOR = MasterOrchestrator(agents=AGENTS)

# Initialize managers
# NOTE(review): presumably these persist prompts/exports on disk — confirm in utils.
PROMPT_MANAGER = PromptManager()
EXPORT_MANAGER = ExportManager()
|
| 40 |
+
|
| 41 |
+
# ------------------------
|
| 42 |
+
# File Handling
|
| 43 |
+
# ------------------------
|
| 44 |
+
def save_uploaded_file(uploaded, username: str = "anonymous", session_dir: Optional[str] = None) -> str:
    """Persist an uploaded PDF into the user's session directory.

    Accepts a filesystem path string, a file-like object with .read(), or a
    gradio-style dict carrying a "name" path. Returns the saved file's path;
    raises RuntimeError for any other input shape.
    """
    from shutil import copyfile

    if session_dir is None:
        session_dir = make_user_session(username)
    base = Path(session_dir)
    base.mkdir(parents=True, exist_ok=True)
    target = base / f"upload_{uuid.uuid4().hex}.pdf"

    # Case 1: a plain filesystem path.
    if isinstance(uploaded, str) and os.path.exists(uploaded):
        copyfile(uploaded, target)
        return str(target)
    # Case 2: a file-like object.
    if hasattr(uploaded, "read"):
        with open(target, "wb") as fh:
            fh.write(uploaded.read())
        return str(target)
    # Case 3: a gradio-style dict pointing at a temp file.
    if isinstance(uploaded, dict) and "name" in uploaded and os.path.exists(uploaded["name"]):
        copyfile(uploaded["name"], target)
        return str(target)
    raise RuntimeError("Unable to save uploaded file.")
|
| 63 |
+
|
| 64 |
+
# ------------------------
|
| 65 |
+
# Async wrapper
|
| 66 |
+
# ------------------------
|
| 67 |
+
def run_async(func, *args, **kwargs):
    """Run an async callable to completion from synchronous code.

    Creates a private event loop, awaits ``func(*args, **kwargs)`` on it,
    and returns the result.

    Args:
        func: A coroutine function (not a coroutine object).

    Returns:
        Whatever the coroutine returns.
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        return loop.run_until_complete(func(*args, **kwargs))
    finally:
        # Fix: the original leaked one event loop (and its selector FD)
        # per call; always close the loop we created.
        loop.close()
|
| 71 |
+
|
| 72 |
+
# ------------------------
|
| 73 |
+
# Analysis Handlers - Core Features
|
| 74 |
+
# ------------------------
|
| 75 |
+
def handle_analysis(file, prompt, username="anonymous", use_streaming=False):
    """Analyze a single uploaded PDF and return (result, status, doc_info)."""
    # Guard: nothing to do without an upload.
    if file is None:
        return "Please upload a PDF.", None, None

    validate_file_size(file)
    saved_path = save_uploaded_file(file, username)

    # Streaming mode delegates entirely to the streaming handler.
    if use_streaming:
        return handle_analysis_streaming(saved_path, prompt, username)

    # Blocking mode: drive the orchestrator to completion and unpack its reply.
    response = run_async(
        ORCHESTRATOR.handle_user_prompt,
        user_id=username,
        prompt=prompt,
        file_path=saved_path,
        targets=["analysis"],
    )
    return response.get("analysis", "No analysis result."), None, None
|
| 93 |
+
|
| 94 |
+
def handle_analysis_streaming(file_path, prompt, username="anonymous"):
    """Handle analysis with streaming output.

    Returns a 3-tuple shaped like handle_analysis's return value:
    (synchronous generator yielding text chunks, None, None).
    """
    def stream_generator():
        # Async generator that pulls chunks from the orchestrator's
        # streaming entry point as they become available.
        async def async_stream():
            async for chunk in ORCHESTRATOR.handle_user_prompt_streaming(
                user_id=username,
                prompt=prompt,
                file_path=file_path,
                targets=["analysis"]
            ):
                yield chunk

        # Convert async generator to sync generator: drive the async
        # generator one item at a time on a private event loop so callers
        # (e.g. Gradio) can consume it with a plain `for` loop.
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        try:
            async_gen = async_stream()
            while True:
                try:
                    # __anext__ resolves to the next chunk or raises
                    # StopAsyncIteration when the stream is exhausted.
                    chunk = loop.run_until_complete(async_gen.__anext__())
                    yield chunk
                except StopAsyncIteration:
                    break
        finally:
            # The loop is ours alone; close it even if the consumer bails out.
            loop.close()

    return stream_generator(), None, None
|
| 121 |
+
|
| 122 |
+
def handle_batch_analysis(files, prompt, username="anonymous"):
    """Analyze several PDFs in one request and return a combined text report.

    Args:
        files: Iterable of uploads (any shape save_uploaded_file accepts).
        prompt: Analysis instructions applied to every document.
        username: Session owner for upload staging.

    Returns:
        (formatted report string, None, None) — matching handle_analysis's shape.
    """
    if not files:
        return "Please upload at least one PDF.", None, None

    # Validate and stage every upload before doing any (expensive) analysis;
    # the first bad file aborts the whole batch with an explanatory message.
    file_paths = []
    for file in files:
        try:
            validate_file_size(file)
            file_paths.append(save_uploaded_file(file, username))
        except Exception as e:
            return f"Error with file {file}: {str(e)}", None, None

    result = run_async(
        ORCHESTRATOR.handle_batch_analysis,
        user_id=username,
        prompt=prompt,
        file_paths=file_paths,
        targets=["analysis"]
    )

    batch_summary = result.get("summary", {})
    batch_results = result.get("batch_results", [])
    # Hoisted: the original re-looked-up processing_stats once per line.
    stats = batch_summary.get("processing_stats", {})

    # Collect pieces and join once instead of repeated string concatenation.
    parts = [
        "📊 Batch Analysis Results\n",
        f"Total files: {stats.get('total_files', 0)}\n",
        f"Successful: {stats.get('successful', 0)}\n",
        f"Failed: {stats.get('failed', 0)}\n",
        f"Success rate: {stats.get('success_rate', '0%')}\n\n",
    ]

    if batch_summary.get("batch_analysis"):
        parts.append(f"📋 Batch Summary:\n{batch_summary['batch_analysis']}\n\n")

    parts.append("📄 Individual Results:\n")
    for i, file_result in enumerate(batch_results, start=1):
        parts.append(f"\n--- File {i}: {Path(file_result.get('file_path', 'Unknown')).name} ---\n")
        if "error" in file_result:
            parts.append(f"❌ Error: {file_result['error']}\n")
        else:
            parts.append(f"✅ {file_result.get('analysis', 'No analysis')}\n")

    return "".join(parts), None, None
|
| 167 |
+
|
| 168 |
+
def handle_export(result_text, export_format, username="anonymous"):
    """Write the analysis text to disk in the chosen format.

    Returns a (status message, file path) pair; the path is None on failure
    or when there is nothing to export.
    """
    # Nothing to write for empty or whitespace-only results.
    if not result_text or result_text.strip() == "":
        return "No content to export.", None

    try:
        if export_format == "txt":
            saved_path = EXPORT_MANAGER.export_text(result_text, username=username)
        elif export_format == "json":
            payload = {"analysis": result_text, "exported_by": username, "timestamp": time.time()}
            saved_path = EXPORT_MANAGER.export_json(payload, username=username)
        elif export_format == "pdf":
            saved_path = EXPORT_MANAGER.export_pdf(result_text, username=username)
        else:
            return f"Unsupported export format: {export_format}", None
        return f"✅ Export successful! File saved to: {saved_path}", saved_path
    except Exception as exc:
        return f"❌ Export failed: {str(exc)}", None
|
| 187 |
+
|
| 188 |
+
def get_custom_prompts():
    """Return the IDs of every saved prompt template."""
    # Iterating a dict yields its keys, so this equals list(prompts.keys()).
    return [prompt_id for prompt_id in PROMPT_MANAGER.get_all_prompts()]
|
| 192 |
+
|
| 193 |
+
def load_custom_prompt(prompt_id):
    """Fetch a stored prompt template, or an empty string when missing."""
    template = PROMPT_MANAGER.get_prompt(prompt_id)
    return template if template else ""
|
| 196 |
+
|
| 197 |
+
# ------------------------
|
| 198 |
+
# Gradio UI - Enhanced Interface
|
| 199 |
+
# ------------------------
|
| 200 |
+
# Top-level Gradio UI definition. Builds three tabs (single-document
# analysis, batch processing, prompt management), then wires event handlers.
with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 📄 PDF Analysis & Orchestrator - Intelligent Document Processing")
    gr.Markdown("Upload PDFs and provide instructions for analysis, summarization, or explanation. Now with enhanced features!")

    with gr.Tabs():
        # Single Document Analysis Tab
        with gr.Tab("📄 Single Document Analysis"):
            with gr.Row():
                with gr.Column(scale=1):
                    pdf_in = gr.File(label="Upload PDF", file_types=[".pdf"], elem_id="file_upload")
                    username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")

                    # Custom Prompts Section
                    with gr.Accordion("🎯 Custom Prompts", open=False):
                        prompt_dropdown = gr.Dropdown(
                            choices=get_custom_prompts(),
                            label="Select Custom Prompt",
                            value=None
                        )
                        load_prompt_btn = gr.Button("Load Prompt", size="sm")

                    # Analysis Options
                    with gr.Accordion("⚙️ Analysis Options", open=False):
                        use_streaming = gr.Checkbox(label="Enable Streaming Output", value=False)
                        # NOTE(review): chunk_size is not passed to any event
                        # handler below — confirm whether it should feed into
                        # handle_analysis / the orchestrator.
                        chunk_size = gr.Slider(
                            minimum=5000, maximum=30000, value=15000, step=1000,
                            label="Chunk Size (for large documents)"
                        )

                with gr.Column(scale=2):
                    gr.Markdown("### Analysis Instructions")
                    prompt_input = gr.Textbox(
                        lines=4,
                        placeholder="Describe what you want to do with the document...\nExamples:\n- Summarize this document in 3 key points\n- Explain this technical paper for a 10-year-old\n- Segment this document by themes\n- Analyze the key findings",
                        label="Instructions"
                    )

                    with gr.Row():
                        submit_btn = gr.Button("🔍 Analyze & Orchestrate", variant="primary", size="lg")
                        clear_btn = gr.Button("🗑️ Clear", size="sm")

            # Results Section
            with gr.Row():
                with gr.Column(scale=2):
                    output_box = gr.Textbox(label="Analysis Result", lines=15, max_lines=25, show_copy_button=True)
                    status_box = gr.Textbox(label="Status", value="Ready to analyze documents", interactive=False)

                with gr.Column(scale=1):
                    # Export Section
                    with gr.Accordion("💾 Export Results", open=False):
                        export_format = gr.Dropdown(
                            choices=["txt", "json", "pdf"],
                            label="Export Format",
                            value="txt"
                        )
                        export_btn = gr.Button("📥 Export", variant="secondary")
                        export_status = gr.Textbox(label="Export Status", interactive=False)

                    # Document Info
                    with gr.Accordion("📊 Document Info", open=False):
                        doc_info = gr.Textbox(label="Document Information", interactive=False, lines=6)

        # Batch Processing Tab
        with gr.Tab("📚 Batch Processing"):
            with gr.Row():
                with gr.Column(scale=1):
                    batch_files = gr.File(
                        label="Upload Multiple PDFs",
                        file_count="multiple",
                        file_types=[".pdf"]
                    )
                    batch_username = gr.Textbox(label="Username (optional)", placeholder="anonymous")

                with gr.Column(scale=2):
                    batch_prompt = gr.Textbox(
                        lines=3,
                        placeholder="Enter analysis instructions for all documents...",
                        label="Batch Analysis Instructions"
                    )
                    batch_submit = gr.Button("🚀 Process Batch", variant="primary", size="lg")

            batch_output = gr.Textbox(label="Batch Results", lines=20, max_lines=30, show_copy_button=True)
            batch_status = gr.Textbox(label="Batch Status", interactive=False)

        # Custom Prompts Management Tab
        with gr.Tab("🎯 Manage Prompts"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Add New Prompt")
                    new_prompt_id = gr.Textbox(label="Prompt ID", placeholder="my_custom_prompt")
                    new_prompt_name = gr.Textbox(label="Prompt Name", placeholder="My Custom Analysis")
                    new_prompt_desc = gr.Textbox(label="Description", placeholder="What this prompt does")
                    new_prompt_template = gr.Textbox(
                        lines=4,
                        label="Prompt Template",
                        placeholder="Enter your custom prompt template..."
                    )
                    new_prompt_category = gr.Dropdown(
                        choices=["custom", "business", "technical", "explanation", "analysis"],
                        label="Category",
                        value="custom"
                    )
                    add_prompt_btn = gr.Button("➕ Add Prompt", variant="primary")

                with gr.Column(scale=1):
                    gr.Markdown("### Existing Prompts")
                    prompt_list = gr.Dataframe(
                        headers=["ID", "Name", "Category", "Description"],
                        datatype=["str", "str", "str", "str"],
                        interactive=False,
                        label="Available Prompts"
                    )
                    refresh_prompts_btn = gr.Button("🔄 Refresh List")
                    delete_prompt_id = gr.Textbox(label="Prompt ID to Delete", placeholder="prompt_id")
                    delete_prompt_btn = gr.Button("🗑️ Delete Prompt", variant="stop")

    # Event Handlers
    # Single document analysis
    submit_btn.click(
        fn=handle_analysis,
        inputs=[pdf_in, prompt_input, username_input, use_streaming],
        outputs=[output_box, status_box, doc_info]
    )

    # Load custom prompt
    load_prompt_btn.click(
        fn=load_custom_prompt,
        inputs=[prompt_dropdown],
        outputs=[prompt_input]
    )

    # Export functionality
    # NOTE(review): the second output is an anonymous gr.State() that is
    # never read anywhere — presumably a sink for the file path; confirm.
    export_btn.click(
        fn=handle_export,
        inputs=[output_box, export_format, username_input],
        outputs=[export_status, gr.State()]
    )

    # Clear functionality
    # NOTE(review): clearing a gr.File with "" (rather than None) may not
    # reset the widget — confirm against the installed Gradio version.
    clear_btn.click(
        fn=lambda: ("", "", "", "Ready"),
        inputs=[],
        outputs=[pdf_in, prompt_input, output_box, status_box]
    )

    # Batch processing
    batch_submit.click(
        fn=handle_batch_analysis,
        inputs=[batch_files, batch_prompt, batch_username],
        outputs=[batch_output, batch_status, gr.State()]
    )

    # Prompt management
    add_prompt_btn.click(
        fn=lambda id, name, desc, template, cat: PROMPT_MANAGER.add_prompt(id, name, desc, template, cat),
        inputs=[new_prompt_id, new_prompt_name, new_prompt_desc, new_prompt_template, new_prompt_category],
        outputs=[]
    )

    # Rebuild the prompt table rows from the manager's current contents.
    refresh_prompts_btn.click(
        fn=lambda: [[pid, prompt["name"], prompt["category"], prompt["description"]]
                    for pid, prompt in PROMPT_MANAGER.get_all_prompts().items()],
        inputs=[],
        outputs=[prompt_list]
    )

    delete_prompt_btn.click(
        fn=lambda pid: PROMPT_MANAGER.delete_prompt(pid),
        inputs=[delete_prompt_id],
        outputs=[]
    )

    # Examples
    gr.Examples(
        examples=[
            ["Summarize this document in 3 key points"],
            ["Explain this technical content for a general audience"],
            ["Segment this document by main themes or topics"],
            ["Analyze the key findings and recommendations"],
            ["Create an executive summary of this document"],
        ],
        inputs=prompt_input,
        label="Example Instructions"
    )
|
| 384 |
+
|
| 385 |
+
if __name__ == "__main__":
    # Consistency fix: use the centralized Config values instead of
    # re-reading os.environ here. Config.SERVER_PORT reads the same PORT
    # variable with the same 7860 default, and Config.SERVER_NAME defaults
    # to "0.0.0.0", so behavior is unchanged.
    demo.launch(server_name=Config.SERVER_NAME, server_port=Config.SERVER_PORT)
|
config.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# config.py - Configuration management for PDF Analysis & Orchestrator
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
class Config:
    """Centralized configuration for the PDF Analysis Orchestrator.

    Every setting is read once at import time from the environment, with a
    sensible default when the variable is unset.
    """

    # OpenAI Configuration
    OPENAI_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4")
    OPENAI_TEMPERATURE = float(os.environ.get("OPENAI_TEMPERATURE", "0.2"))
    OPENAI_MAX_TOKENS = int(os.environ.get("OPENAI_MAX_TOKENS", "1000"))

    # Document Processing
    CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "15000"))
    CHUNK_OVERLAP = int(os.environ.get("CHUNK_OVERLAP", "1000"))
    MAX_FILE_SIZE_MB = int(os.environ.get("ANALYSIS_MAX_UPLOAD_MB", "50"))

    # Caching
    CACHE_ENABLED = os.environ.get("CACHE_ENABLED", "true").lower() == "true"
    CACHE_TTL_HOURS = int(os.environ.get("CACHE_TTL_HOURS", "24"))

    # Session Management
    SESSION_DIR = os.environ.get("ANALYSIS_SESSION_DIR", "/tmp/analysis_sessions")

    # UI Configuration
    SERVER_NAME = os.environ.get("SERVER_NAME", "0.0.0.0")
    SERVER_PORT = int(os.environ.get("PORT", "7860"))

    # Export Settings
    EXPORT_DIR = os.environ.get("EXPORT_DIR", "/tmp/analysis_exports")
    SUPPORTED_EXPORT_FORMATS = ["txt", "json", "pdf"]

    # Custom Prompts
    PROMPTS_DIR = os.environ.get("PROMPTS_DIR", "/tmp/analysis_prompts")

    @classmethod
    def ensure_directories(cls):
        """Create every directory the application writes to (idempotent)."""
        for directory in (cls.SESSION_DIR, cls.EXPORT_DIR, cls.PROMPTS_DIR):
            Path(directory).mkdir(parents=True, exist_ok=True)

    @classmethod
    def get_chunk_size_for_text(cls, text_length: int) -> int:
        """Return the chunk size to use for a document of ``text_length`` chars.

        Short documents are processed whole; longer ones are capped at the
        configured CHUNK_SIZE.
        """
        return min(text_length, cls.CHUNK_SIZE)
|
create_test_pdf.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Create a test PDF for testing the PDF Analysis & Orchestrator
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from reportlab.lib.pagesizes import letter
|
| 7 |
+
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
|
| 8 |
+
from reportlab.lib.styles import getSampleStyleSheet
|
| 9 |
+
from reportlab.lib.units import inch
|
| 10 |
+
|
| 11 |
+
def create_test_pdf(output_path: str = "test_document.pdf") -> str:
    """Create a sample PDF exercising the analyzer's feature set.

    Generalized: the output location is now a parameter (default keeps the
    original behavior) and the path is returned so callers can chain on it.

    Args:
        output_path: Where to write the generated PDF.

    Returns:
        The path the PDF was written to.
    """

    # Create PDF document
    doc = SimpleDocTemplate(output_path, pagesize=letter)
    styles = getSampleStyleSheet()

    # Sample content: a title plus headed sections covering summary,
    # features, implementation notes, and use cases.
    content = [
        Paragraph("PDF Analysis & Orchestrator - Test Document", styles['Title']),
        Spacer(1, 12),

        Paragraph("Executive Summary", styles['Heading1']),
        Paragraph("""
        This document serves as a test case for the PDF Analysis & Orchestrator application.
        It contains various sections that can be used to test different analysis capabilities
        including summarization, technical explanation, and content segmentation.
        """, styles['Normal']),
        Spacer(1, 12),

        Paragraph("Introduction", styles['Heading1']),
        Paragraph("""
        The PDF Analysis & Orchestrator is a powerful tool that leverages artificial intelligence
        to provide comprehensive document analysis. It uses advanced natural language processing
        techniques to understand, summarize, and explain complex documents across various domains.
        """, styles['Normal']),
        Spacer(1, 12),

        Paragraph("Key Features", styles['Heading1']),
        Paragraph("""
        The system offers several key features that make it particularly useful for document analysis:
        """, styles['Normal']),

        Paragraph("1. Intelligent Analysis", styles['Heading2']),
        Paragraph("""
        The AI-powered analysis engine can understand context and provide meaningful insights
        from complex documents. It adapts its language and complexity based on the target audience.
        """, styles['Normal']),

        Paragraph("2. Document Chunking", styles['Heading2']),
        Paragraph("""
        For large documents, the system automatically breaks them into manageable chunks while
        maintaining context through intelligent sentence boundary detection and overlap handling.
        """, styles['Normal']),

        Paragraph("3. Batch Processing", styles['Heading2']),
        Paragraph("""
        Users can process multiple documents simultaneously, with comprehensive reporting that
        includes individual results and batch summaries.
        """, styles['Normal']),

        Paragraph("4. Custom Prompts", styles['Heading2']),
        Paragraph("""
        The system supports custom prompt templates that can be saved, organized, and reused
        across different analysis sessions.
        """, styles['Normal']),

        Paragraph("Technical Implementation", styles['Heading1']),
        Paragraph("""
        The application is built using modern Python technologies including Gradio for the user
        interface, OpenAI's GPT models for analysis, and pdfplumber for PDF processing. The
        architecture follows a multi-agent pattern with specialized agents for different aspects
        of analysis.
        """, styles['Normal']),
        Spacer(1, 12),

        Paragraph("Performance Considerations", styles['Heading1']),
        Paragraph("""
        The system includes several performance optimizations including PDF text extraction caching,
        configurable chunk sizes, and streaming responses for better user experience. These features
        ensure efficient processing even for large documents and multiple concurrent users.
        """, styles['Normal']),
        Spacer(1, 12),

        Paragraph("Use Cases", styles['Heading1']),
        Paragraph("""
        The PDF Analysis & Orchestrator is suitable for a wide range of use cases including:
        """, styles['Normal']),

        Paragraph("• Research Paper Analysis", styles['Normal']),
        Paragraph("• Business Document Summarization", styles['Normal']),
        Paragraph("• Technical Documentation Explanation", styles['Normal']),
        Paragraph("• Legal Document Review", styles['Normal']),
        Paragraph("• Educational Content Processing", styles['Normal']),
        Paragraph("• Report Generation and Analysis", styles['Normal']),
        Spacer(1, 12),

        Paragraph("Conclusion", styles['Heading1']),
        Paragraph("""
        The PDF Analysis & Orchestrator represents a significant advancement in document analysis
        technology. By combining artificial intelligence with user-friendly interfaces and powerful
        processing capabilities, it provides a comprehensive solution for document understanding
        and analysis across various domains and use cases.
        """, styles['Normal']),
        Spacer(1, 12),

        Paragraph("Contact Information", styles['Heading1']),
        Paragraph("""
        For more information about the PDF Analysis & Orchestrator, please refer to the
        project documentation or contact the development team. The application is designed
        to be continuously improved based on user feedback and technological advancements.
        """, styles['Normal']),
    ]

    # Build PDF
    doc.build(content)
    print(f"✅ Test PDF created: {output_path}")
    return output_path
|
| 118 |
+
|
| 119 |
+
if __name__ == "__main__":
    # Allow running this module directly to generate the sample PDF.
    create_test_pdf()
|
packages.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# System packages required for PDF Analysis & Orchestrator
|
| 2 |
+
libgl1-mesa-glx
|
| 3 |
+
libglib2.0-0
|
| 4 |
+
libsm6
|
| 5 |
+
libxext6
|
| 6 |
+
libxrender-dev
|
| 7 |
+
libgomp1
|
| 8 |
+
libgcc-s1
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies for PDF Analysis & Orchestrator
|
| 2 |
+
gradio>=3.30
|
| 3 |
+
openai>=1.0.0
|
| 4 |
+
pypdf>=3.0.0
|
| 5 |
+
pdfplumber>=0.7.5
|
| 6 |
+
numpy
|
| 7 |
+
aiohttp
|
| 8 |
+
reportlab>=3.6.0
|
test_deployment.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Test script for PDF Analysis & Orchestrator deployment
|
| 4 |
+
Run this to verify all components are working correctly
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import sys
|
| 9 |
+
import asyncio
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
def test_imports():
    """Verify every third-party dependency can be imported.

    Returns True when all imports succeed, False at the first failure
    (matching the original early-return behavior).
    """
    import importlib  # local import: only needed by this check

    print("🔍 Testing imports...")

    # (module to import, display name). Probing one submodule is enough to
    # prove the whole package is installed. The copy-pasted try/except
    # blocks of the original are collapsed into this data-driven loop.
    required = [
        ("gradio", "Gradio"),
        ("openai", "OpenAI"),
        ("pdfplumber", "pdfplumber"),
        ("numpy", "NumPy"),
        ("reportlab.lib.pagesizes", "ReportLab"),
    ]
    for module_name, label in required:
        try:
            importlib.import_module(module_name)
            print(f"✅ {label} imported successfully")
        except ImportError as e:
            print(f"❌ {label} import failed: {e}")
            return False

    return True
|
| 52 |
+
|
| 53 |
+
def test_config():
    """Smoke-test that the Config module loads and its settings are readable."""
    print("\n🔧 Testing configuration...")

    try:
        from config import Config
        print("✅ Config module imported successfully")

        # Echo the effective settings so a reviewer can sanity-check them.
        for label, value in (
            ("OpenAI Model", Config.OPENAI_MODEL),
            ("Chunk Size", Config.CHUNK_SIZE),
            ("Cache Enabled", Config.CACHE_ENABLED),
            ("Max Upload MB", Config.MAX_FILE_SIZE_MB),
        ):
            print(f"   - {label}: {value}")

        return True
    except Exception as e:
        print(f"❌ Config test failed: {e}")
        return False
|
| 71 |
+
|
| 72 |
+
def test_utils():
    """Smoke-test the core utility helpers (chunking and file hashing).

    Returns True on success, False on any failure.
    """
    print("\n🛠️ Testing utilities...")

    try:
        from utils import chunk_text, get_file_hash, load_pdf_text_cached
        print("✅ Core utilities imported successfully")

        # Chunk a long synthetic document.
        test_text = "This is a test document. " * 1000  # Create long text
        chunks = chunk_text(test_text, 100)
        print(f"   - Chunking test: {len(chunks)} chunks created")

        # Hash a throwaway file; clean it up even if hashing raises
        # (fix: the original leaked test.txt when get_file_hash failed).
        test_file = Path("test.txt")
        test_file.write_text("test content")
        try:
            file_hash = get_file_hash(str(test_file))
            print(f"   - File hash test: {file_hash[:8]}...")
        finally:
            test_file.unlink()  # Clean up

        return True
    except Exception as e:
        print(f"❌ Utils test failed: {e}")
        return False
|
| 96 |
+
|
| 97 |
+
def test_agents():
    """Smoke-test agent construction and orchestrator wiring."""
    print("\n🤖 Testing agents...")

    try:
        from agents import AnalysisAgent, CollaborationAgent, ConversationAgent, MasterOrchestrator
        print("✅ Agent classes imported successfully")

        # Build one of each agent type, then wire them into an orchestrator.
        analysis_agent = AnalysisAgent("TestAgent", "gpt-4", 0)
        print("   - AnalysisAgent created successfully")

        roster = {
            "analysis": analysis_agent,
            "collab": CollaborationAgent("TestCollab", "gpt-4", 0),
            "conversation": ConversationAgent("TestConv", "gpt-4", 0),
        }
        MasterOrchestrator(roster)
        print("   - MasterOrchestrator created successfully")

        return True
    except Exception as e:
        print(f"❌ Agents test failed: {e}")
        return False
|
| 122 |
+
|
| 123 |
+
def test_managers():
    """Smoke-test the prompt and export manager classes."""
    print("\n📋 Testing managers...")

    try:
        from utils.prompts import PromptManager
        from utils.export import ExportManager
        print("✅ Manager classes imported successfully")

        # The prompt manager ships with defaults; count them as a sanity check.
        default_prompts = PromptManager().get_all_prompts()
        print(f"   - PromptManager: {len(default_prompts)} default prompts loaded")

        ExportManager()
        print("   - ExportManager created successfully")

        return True
    except Exception as e:
        print(f"❌ Managers test failed: {e}")
        return False
|
| 145 |
+
|
| 146 |
+
def test_environment():
    """Report which relevant environment variables are set. Always True."""
    print("\n🌍 Testing environment...")

    openai_key = os.environ.get("OPENAI_API_KEY")
    if openai_key:
        print("✅ OPENAI_API_KEY is set")
        print(f"   - Key starts with: {openai_key[:8]}...")
    else:
        print("⚠️ OPENAI_API_KEY not set (required for full functionality)")

    # Check other important environment variables
    for var in ("OPENAI_MODEL", "CHUNK_SIZE", "CACHE_ENABLED", "ANALYSIS_MAX_UPLOAD_MB"):
        value = os.environ.get(var)
        if value:
            print(f"   - {var}: {value}")
        else:
            print(f"   - {var}: using default")

    return True
|
| 173 |
+
|
| 174 |
+
def test_gradio_interface():
    """Verify the Gradio app object can be imported/constructed."""
    print("\n🎨 Testing Gradio interface...")

    try:
        # Importing app builds the whole interface as a side effect.
        from app import demo
        print("✅ Gradio interface created successfully")

        if hasattr(demo, 'blocks'):
            print("   - Interface has blocks structure")

        return True
    except Exception as e:
        print(f"❌ Gradio interface test failed: {e}")
        return False
|
| 191 |
+
|
| 192 |
+
def main():
    """Run all tests"""
    print("🚀 PDF Analysis & Orchestrator - Deployment Test")
    print("=" * 50)

    tests = [
        test_imports,
        test_config,
        test_utils,
        test_agents,
        test_managers,
        test_environment,
        test_gradio_interface,
    ]

    passed = 0
    for check in tests:
        try:
            # A check "passes" when it returns a truthy value.
            if check():
                passed += 1
        except Exception as e:
            print(f"❌ Test {check.__name__} failed with exception: {e}")

    total = len(tests)
    print("\n" + "=" * 50)
    print(f"📊 Test Results: {passed}/{total} tests passed")

    if passed == total:
        print("🎉 All tests passed! Your deployment is ready.")
        return 0
    print("⚠️ Some tests failed. Please check the errors above.")
    return 1
|
| 226 |
+
|
| 227 |
+
if __name__ == "__main__":
|
| 228 |
+
sys.exit(main())
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/__init__.py - Core utilities for PDF Analysis & Orchestrator
|
| 2 |
+
import os
|
| 3 |
+
import asyncio
|
| 4 |
+
import tempfile
|
| 5 |
+
import hashlib
|
| 6 |
+
import json
|
| 7 |
+
import time
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import pdfplumber
|
| 10 |
+
import numpy as np
|
| 11 |
+
from uuid import uuid4
|
| 12 |
+
import openai
|
| 13 |
+
import shutil
|
| 14 |
+
from typing import List, Dict, Any, Optional
|
| 15 |
+
|
| 16 |
+
# ------------------------
|
| 17 |
+
# OpenAI setup
|
| 18 |
+
# ------------------------
|
| 19 |
+
# Read the API key once at import time and fail fast when it is absent:
# every OpenAI call in this module depends on it.
OPENAI_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_KEY is None:
    raise RuntimeError("Set OPENAI_API_KEY environment variable before running.")
# NOTE(review): module-level `openai.api_key` is the pre-1.0 configuration
# style; the wrapper below calls the >=1.0 `openai.chat.completions` API,
# which still honours this global — confirm against the pinned SDK version.
openai.api_key = OPENAI_KEY
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def uuid4_hex():
    """Return a random UUID4 as a 32-character lowercase hex string.

    Fix: dropped the redundant function-local ``from uuid import uuid4`` —
    ``uuid4`` is already imported at module level in this file.
    """
    return uuid4().hex
|
| 29 |
+
|
| 30 |
+
# ------------------------
|
| 31 |
+
# Async OpenAI Chat Wrapper
|
| 32 |
+
# ------------------------
|
| 33 |
+
async def call_openai_chat(model: str, messages: list, temperature=0.2, max_tokens=800):
    """
    Async wrapper for OpenAI >=1.0.0 Chat Completions.

    The blocking SDK call runs in a worker thread so the event loop is
    never blocked; returns the stripped content of the first choice.
    """
    def _sync_request():
        response = openai.chat.completions.create(
            model=model,
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content.strip()

    return await asyncio.to_thread(_sync_request)
|
| 46 |
+
|
| 47 |
+
# ------------------------
|
| 48 |
+
# PDF Utilities
|
| 49 |
+
# ------------------------
|
| 50 |
+
def load_pdf_text(path: str) -> str:
    """Extract text from PDF using pdfplumber"""
    with pdfplumber.open(path) as pdf:
        # Pages with no extractable text contribute an empty string.
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n\n".join(pages)
|
| 57 |
+
|
| 58 |
+
def save_text_as_file(text: str, suffix=".txt") -> str:
    """Save text to a temporary file and return its path."""
    target = Path(tempfile.gettempdir(), f"analysis_{uuid4().hex}{suffix}")
    target.write_text(text, encoding="utf-8")
    return str(target)
|
| 63 |
+
|
| 64 |
+
def save_uploaded_file(uploaded) -> str:
    """
    Save uploaded file to temporary location
    """
    destination = Path(tempfile.gettempdir()) / f"upload_{uuid4().hex}.pdf"
    # Stream-copy the file-like object so large uploads are not held in memory.
    with destination.open("wb") as sink:
        shutil.copyfileobj(uploaded, sink)
    return str(destination)
|
| 72 |
+
|
| 73 |
+
# ------------------------
|
| 74 |
+
# Document Chunking
|
| 75 |
+
# ------------------------
|
| 76 |
+
def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[str]:
    """
    Split text into overlapping chunks for processing large documents.

    Chunks are at most ``chunk_size`` characters, consecutive chunks share
    roughly ``overlap`` characters, and each boundary is snapped backwards
    to the nearest sentence end ('.') found in the last 200 characters.

    Fix: the start position now always advances by at least one character.
    Previously ``start = end - overlap`` could fail to move forward (e.g.
    overlap >= chunk_size, or a sentence-boundary snap that pulls ``end``
    back to ``start + overlap`` or earlier), causing an infinite loop.
    """
    if len(text) <= chunk_size:
        return [text]

    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size

        # Try to break at a sentence boundary within the final 200 chars.
        if end < len(text):
            search_start = max(start, end - 200)
            sentence_end = text.rfind('.', search_start, end)
            if sentence_end > search_start:
                end = sentence_end + 1

        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move start position with overlap, guaranteeing forward progress.
        start = max(end - overlap, start + 1)

    return chunks
|
| 107 |
+
|
| 108 |
+
def get_file_hash(file_path: str) -> str:
    """Generate hash for file caching"""
    # MD5 is fine here: the hash keys a local cache, not a security boundary.
    digest = hashlib.md5(Path(file_path).read_bytes())
    return digest.hexdigest()
|
| 112 |
+
|
| 113 |
+
# ------------------------
|
| 114 |
+
# Caching System
|
| 115 |
+
# ------------------------
|
| 116 |
+
CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
| 117 |
+
CACHE_DIR.mkdir(exist_ok=True)
|
| 118 |
+
|
| 119 |
+
def get_cached_text(file_path: str) -> Optional[str]:
    """Retrieve cached PDF text if available"""
    file_hash = get_file_hash(file_path)
    cache_file = CACHE_DIR / f"{file_hash}.json"

    if not cache_file.exists():
        return None
    try:
        with open(cache_file, 'r', encoding='utf-8') as handle:
            entry = json.load(handle)
    except Exception:
        # Unreadable/corrupt cache entries are treated as a miss.
        return None
    # The stored hash must match the file's current content.
    return entry.get('text') if entry.get('file_hash') == file_hash else None
|
| 134 |
+
|
| 135 |
+
def cache_text(file_path: str, text: str) -> None:
    """Cache PDF text for future use"""
    file_hash = get_file_hash(file_path)
    cache_file = CACHE_DIR / f"{file_hash}.json"

    payload = {
        'file_hash': file_hash,
        'text': text,
        'cached_at': time.time(),
    }
    try:
        with open(cache_file, 'w', encoding='utf-8') as handle:
            json.dump(payload, handle, ensure_ascii=False)
    except Exception:
        pass  # Caching is best-effort; never fail the caller.
|
| 150 |
+
|
| 151 |
+
def load_pdf_text_cached(path: str) -> str:
    """Load PDF text with caching support"""
    # Cache hit (non-empty text only, matching the original truthiness check).
    cached = get_cached_text(path)
    if cached:
        return cached

    # Miss: extract and store for next time.
    text = load_pdf_text(path)
    cache_text(path, text)
    return text
|
| 165 |
+
|
| 166 |
+
# ------------------------
|
| 167 |
+
# Enhanced PDF Processing
|
| 168 |
+
# ------------------------
|
| 169 |
+
def load_pdf_text_chunked(path: str, chunk_size: int = 15000) -> List[str]:
    """Load PDF text and return as chunks for large documents"""
    return chunk_text(load_pdf_text_cached(path), chunk_size)
|
| 173 |
+
|
| 174 |
+
def get_document_metadata(path: str) -> Dict[str, Any]:
    """Extract basic metadata from PDF"""
    try:
        with pdfplumber.open(path) as pdf:
            page_count = len(pdf.pages)
        return {
            'page_count': page_count,
            'file_size': Path(path).stat().st_size,
            'extracted_at': time.time(),
        }
    except Exception:
        # Unreadable/invalid PDFs yield zeroed metadata rather than raising.
        return {'page_count': 0, 'file_size': 0, 'extracted_at': time.time()}
|
utils/export.py
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/export.py - Export functionality for PDF Analysis & Orchestrator
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, Any, Optional
|
| 6 |
+
from datetime import datetime
|
| 7 |
+
from config import Config
|
| 8 |
+
|
| 9 |
+
class ExportManager:
    """Handle export of analysis results to text, JSON and PDF formats.

    Files are written beneath ``export_dir`` (``Config.EXPORT_DIR`` by
    default), which is created on construction if missing.

    Fix: ``get_export_history`` was annotated ``-> List[Dict[str, Any]]``,
    but this module only imports ``Dict, Any, Optional`` from ``typing`` —
    defining the class raised ``NameError`` at import time. The annotation
    now uses the builtin ``list``.
    """

    def __init__(self, export_dir: str = None):
        self.export_dir = Path(export_dir or Config.EXPORT_DIR)
        self.export_dir.mkdir(parents=True, exist_ok=True)

    def export_text(self, content: str, filename: str = None,
                    metadata: Dict[str, Any] = None) -> str:
        """Export content as a UTF-8 text file; return the written path.

        A ``.txt`` suffix is appended when absent; a timestamped name is
        generated when ``filename`` is not given.
        """
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"analysis_{timestamp}.txt"

        if not filename.endswith('.txt'):
            filename += '.txt'

        filepath = self.export_dir / filename

        # Prepend a human-readable metadata header when metadata is supplied.
        if metadata:
            header = self._format_metadata_header(metadata)
            content = f"{header}\n\n{content}"

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(content)

        return str(filepath)

    def export_json(self, data: Dict[str, Any], filename: str = None) -> str:
        """Export data as a JSON file wrapped in export metadata."""
        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"analysis_{timestamp}.json"

        if not filename.endswith('.json'):
            filename += '.json'

        filepath = self.export_dir / filename

        # Wrap the payload so exports are self-describing and versioned.
        export_data = {
            "exported_at": datetime.now().isoformat(),
            "export_version": "1.0",
            "data": data
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(export_data, f, indent=2, ensure_ascii=False)

        return str(filepath)

    def export_pdf(self, content: str, filename: str = None,
                   metadata: Dict[str, Any] = None) -> str:
        """Export content as a PDF.

        Raises:
            ImportError: if reportlab is not installed.
        """
        try:
            from reportlab.lib.pagesizes import letter
            from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
            from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
        except ImportError:
            raise ImportError("reportlab is required for PDF export. Install with: pip install reportlab")

        if not filename:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"analysis_{timestamp}.pdf"

        if not filename.endswith('.pdf'):
            filename += '.pdf'

        filepath = self.export_dir / filename

        # Create PDF
        doc = SimpleDocTemplate(str(filepath), pagesize=letter)
        styles = getSampleStyleSheet()

        # Custom style for body paragraphs.
        content_style = ParagraphStyle(
            'CustomContent',
            parent=styles['Normal'],
            fontSize=11,
            spaceAfter=12,
            leading=14
        )

        story = []

        # Add metadata header if provided
        if metadata:
            header_style = ParagraphStyle(
                'Header',
                parent=styles['Heading1'],
                fontSize=14,
                spaceAfter=20
            )
            story.append(Paragraph("Analysis Report", header_style))
            story.append(Spacer(1, 12))

            for key, value in metadata.items():
                story.append(Paragraph(f"<b>{key}:</b> {value}", styles['Normal']))
            story.append(Spacer(1, 20))

        # NOTE(review): Paragraph treats its text as XML-like markup, so raw
        # '<', '>' or '&' in content/metadata may raise — consider escaping.
        paragraphs = content.split('\n\n')
        for para in paragraphs:
            if para.strip():
                story.append(Paragraph(para.strip(), content_style))
                story.append(Spacer(1, 6))

        doc.build(story)
        return str(filepath)

    def _format_metadata_header(self, metadata: Dict[str, Any]) -> str:
        """Format metadata as a plain-text banner header."""
        lines = ["=" * 50, "ANALYSIS REPORT", "=" * 50]

        for key, value in metadata.items():
            lines.append(f"{key}: {value}")

        lines.append("=" * 50)
        return "\n".join(lines)

    def get_export_history(self, limit: int = 10) -> list:
        """Return up to ``limit`` recent exports, newest first.

        Each entry holds filename, full path, size in bytes, ISO creation
        time and the extension-derived format.
        """
        files = []
        for filepath in self.export_dir.glob("*"):
            if filepath.is_file():
                stat = filepath.stat()
                files.append({
                    "filename": filepath.name,
                    "filepath": str(filepath),
                    "size": stat.st_size,
                    "created": datetime.fromtimestamp(stat.st_ctime).isoformat(),
                    "format": filepath.suffix[1:] if filepath.suffix else "unknown"
                })

        # Sort by creation time, newest first
        files.sort(key=lambda x: x["created"], reverse=True)
        return files[:limit]

    def cleanup_old_exports(self, days: int = 7) -> int:
        """Delete exports older than ``days`` days; return the delete count.

        Deletion failures are ignored so cleanup never raises.
        """
        cutoff_time = datetime.now().timestamp() - (days * 24 * 60 * 60)
        deleted_count = 0

        for filepath in self.export_dir.glob("*"):
            if filepath.is_file() and filepath.stat().st_ctime < cutoff_time:
                try:
                    filepath.unlink()
                    deleted_count += 1
                except Exception:
                    pass

        return deleted_count
|
utils/prompts.py
ADDED
|
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/prompts.py - Custom prompt management for PDF Analysis & Orchestrator
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import Dict, List, Optional
|
| 6 |
+
from config import Config
|
| 7 |
+
|
| 8 |
+
class PromptManager:
    """Manage custom prompts for analysis.

    Prompts are persisted as JSON under ``prompts_dir`` (defaults to
    ``Config.PROMPTS_DIR``). Each prompt record carries ``name``,
    ``description``, ``template`` and ``category``.

    Fix: a corrupt/unreadable prompts file previously reset ``self.prompts``
    to ``{}``, losing even the built-in defaults; it now falls back to the
    default prompt set instead.
    """

    def __init__(self, prompts_dir: str = None):
        self.prompts_dir = Path(prompts_dir or Config.PROMPTS_DIR)
        self.prompts_dir.mkdir(parents=True, exist_ok=True)
        self.prompts_file = self.prompts_dir / "custom_prompts.json"
        self._load_prompts()

    def _load_prompts(self) -> None:
        """Load prompts from disk, falling back to the defaults on error."""
        if self.prompts_file.exists():
            try:
                with open(self.prompts_file, 'r', encoding='utf-8') as f:
                    self.prompts = json.load(f)
            except Exception:
                # Recover from a corrupt file with the default set rather
                # than leaving the manager with zero prompts.
                self.prompts = self._get_default_prompts()
        else:
            self.prompts = self._get_default_prompts()
            self._save_prompts()

    def _get_default_prompts(self) -> Dict[str, Dict[str, str]]:
        """Get default prompt templates"""
        return {
            "summarize": {
                "name": "Summarize Document",
                "description": "Create a concise summary of the document",
                "template": "Summarize this document in 3-5 key points, highlighting the main ideas and conclusions.",
                "category": "basic"
            },
            "explain_simple": {
                "name": "Explain Simply",
                "description": "Explain complex content for a general audience",
                "template": "Explain this document in simple terms that a 10-year-old could understand. Use analogies and examples where helpful.",
                "category": "explanation"
            },
            "executive_summary": {
                "name": "Executive Summary",
                "description": "Create an executive summary for decision makers",
                "template": "Create an executive summary of this document, focusing on key findings, recommendations, and business implications.",
                "category": "business"
            },
            "technical_analysis": {
                "name": "Technical Analysis",
                "description": "Provide detailed technical analysis",
                "template": "Provide a detailed technical analysis of this document, including methodology, data analysis, and technical conclusions.",
                "category": "technical"
            },
            "theme_segmentation": {
                "name": "Theme Segmentation",
                "description": "Break down document by themes and topics",
                "template": "Segment this document by main themes and topics. Identify key themes and provide a brief summary of each section.",
                "category": "organization"
            },
            "key_findings": {
                "name": "Key Findings",
                "description": "Extract key findings and insights",
                "template": "Extract and analyze the key findings, insights, and recommendations from this document. Highlight the most important points.",
                "category": "analysis"
            }
        }

    def _save_prompts(self) -> None:
        """Persist the in-memory prompts to disk (best-effort)."""
        try:
            with open(self.prompts_file, 'w', encoding='utf-8') as f:
                json.dump(self.prompts, f, indent=2, ensure_ascii=False)
        except Exception as e:
            print(f"Error saving prompts: {e}")

    def get_prompt(self, prompt_id: str) -> Optional[str]:
        """Return the template for ``prompt_id``, or None if unknown."""
        return self.prompts.get(prompt_id, {}).get("template")

    def get_all_prompts(self) -> Dict[str, Dict[str, str]]:
        """Return a shallow copy of all prompt records."""
        return self.prompts.copy()

    def get_prompts_by_category(self, category: str) -> Dict[str, Dict[str, str]]:
        """Return the subset of prompts whose category matches."""
        return {
            pid: prompt for pid, prompt in self.prompts.items()
            if prompt.get("category") == category
        }

    def add_prompt(self, prompt_id: str, name: str, description: str,
                   template: str, category: str = "custom") -> bool:
        """Add (or overwrite) a prompt; returns True on success."""
        try:
            self.prompts[prompt_id] = {
                "name": name,
                "description": description,
                "template": template,
                "category": category
            }
            self._save_prompts()
            return True
        except Exception:
            return False

    def update_prompt(self, prompt_id: str, **kwargs) -> bool:
        """Merge ``kwargs`` into an existing prompt; False if unknown."""
        if prompt_id not in self.prompts:
            return False

        try:
            self.prompts[prompt_id].update(kwargs)
            self._save_prompts()
            return True
        except Exception:
            return False

    def delete_prompt(self, prompt_id: str) -> bool:
        """Delete a custom prompt (cannot delete default prompts)."""
        if prompt_id in self.prompts and self.prompts[prompt_id].get("category") == "custom":
            try:
                del self.prompts[prompt_id]
                self._save_prompts()
                return True
            except Exception:
                return False
        return False

    def get_categories(self) -> List[str]:
        """Return all categories present, sorted alphabetically."""
        categories = set()
        for prompt in self.prompts.values():
            categories.add(prompt.get("category", "uncategorized"))
        return sorted(list(categories))
utils/session.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/session.py - Session management for PDF Analysis & Orchestrator
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import uuid
|
| 5 |
+
|
| 6 |
+
# Root directory for all per-user session folders; created at import time.
BASE = Path(os.environ.get("ANALYSIS_SESSION_DIR", "/tmp/analysis_sessions"))
BASE.mkdir(parents=True, exist_ok=True)

def _safe_component(username: str) -> str:
    """Reduce a raw username to a single, traversal-free path component.

    Alphanumerics, '-' and '_' pass through; everything else (including
    '/', '\\' and '.') becomes '_'. Empty or identity-free results fall
    back to "anonymous".
    """
    raw = (username or "").strip() or "anonymous"
    cleaned = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in raw)
    if not cleaned.strip("_-"):
        return "anonymous"
    return cleaned

def make_user_session(username: str):
    """Create a user session directory and return its path.

    Fix: the username was previously used verbatim as a path component,
    so crafted values such as "../../x" could escape the session base
    directory; it is now sanitized first. Normal usernames are unchanged.
    """
    sid = uuid.uuid4().hex
    user_dir = BASE / _safe_component(username) / sid
    user_dir.mkdir(parents=True, exist_ok=True)
    return str(user_dir)
|
utils/validation.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/validation.py - File validation for PDF Analysis & Orchestrator
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
MAX_MB = int(os.environ.get("ANALYSIS_MAX_UPLOAD_MB", 50))
|
| 6 |
+
|
| 7 |
+
def _get_size_bytes_from_uploaded(uploaded) -> int:
|
| 8 |
+
"""
|
| 9 |
+
Get file size from uploaded file object
|
| 10 |
+
uploaded may be a path (str), file-like object, or dict {'name': path}
|
| 11 |
+
"""
|
| 12 |
+
try:
|
| 13 |
+
if isinstance(uploaded, str) and os.path.exists(uploaded):
|
| 14 |
+
return Path(uploaded).stat().st_size
|
| 15 |
+
if isinstance(uploaded, dict) and "name" in uploaded and os.path.exists(uploaded["name"]):
|
| 16 |
+
return Path(uploaded["name"]).stat().st_size
|
| 17 |
+
if hasattr(uploaded, "seek") and hasattr(uploaded, "tell"):
|
| 18 |
+
current = uploaded.tell()
|
| 19 |
+
uploaded.seek(0, 2)
|
| 20 |
+
size = uploaded.tell()
|
| 21 |
+
uploaded.seek(current)
|
| 22 |
+
return size
|
| 23 |
+
except Exception:
|
| 24 |
+
pass
|
| 25 |
+
# Unknown size -> be conservative and allow it (or raise)
|
| 26 |
+
return 0
|
| 27 |
+
|
| 28 |
+
def validate_file_size(uploaded):
|
| 29 |
+
"""Validate uploaded file size"""
|
| 30 |
+
size_bytes = _get_size_bytes_from_uploaded(uploaded)
|
| 31 |
+
if size_bytes == 0:
|
| 32 |
+
# If unknown, skip (or you could raise). We'll allow but log in production.
|
| 33 |
+
return True
|
| 34 |
+
mb = size_bytes / (1024 * 1024)
|
| 35 |
+
if mb > MAX_MB:
|
| 36 |
+
raise ValueError(f"Uploaded file exceeds allowed size of {MAX_MB} MB (size: {mb:.2f} MB).")
|
| 37 |
+
return True
|