Commit
·
bb3909a
1
Parent(s):
624de5a
Major performance and UX improvements: caching, visual outputs, dynamic tokens, enhanced document processing
Browse files
- agents.py +74 -16
- app.py +26 -14
- utils/__init__.py +79 -8
- utils/visual_output.py +262 -0
agents.py
CHANGED
|
@@ -5,7 +5,8 @@ import logging
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
-
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata
|
|
|
|
| 9 |
from config import Config
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
|
@@ -33,9 +34,44 @@ class BaseAgent:
|
|
| 33 |
# Core Analysis Agent
|
| 34 |
# --------------------
|
| 35 |
class AnalysisAgent(BaseAgent):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 37 |
start_time = time.time()
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if file_path:
|
| 40 |
# Get document metadata
|
| 41 |
metadata = get_document_metadata(file_path)
|
|
@@ -45,29 +81,46 @@ class AnalysisAgent(BaseAgent):
|
|
| 45 |
|
| 46 |
# Check if document needs chunking
|
| 47 |
if len(text) > Config.CHUNK_SIZE:
|
| 48 |
-
|
| 49 |
else:
|
| 50 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
|
|
|
| 51 |
else:
|
| 52 |
content = f"User prompt: {prompt}"
|
| 53 |
metadata = {}
|
|
|
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
FORMATTING REQUIREMENTS:
|
| 58 |
- Use clear section headers with emojis (## 📋 Key Points, ## 🔍 Analysis, etc.)
|
| 59 |
-
-
|
|
|
|
| 60 |
- Include visual separators (---) between major sections
|
| 61 |
-
- Use bold
|
| 62 |
-
-
|
| 63 |
-
-
|
| 64 |
|
| 65 |
CONTENT REQUIREMENTS:
|
| 66 |
-
-
|
| 67 |
-
-
|
| 68 |
-
-
|
| 69 |
-
-
|
| 70 |
-
- Structure information
|
|
|
|
| 71 |
|
| 72 |
try:
|
| 73 |
response = await call_openai_chat(
|
|
@@ -75,23 +128,28 @@ CONTENT REQUIREMENTS:
|
|
| 75 |
messages=[{"role": "system", "content": system},
|
| 76 |
{"role": "user", "content": content}],
|
| 77 |
temperature=Config.OPENAI_TEMPERATURE,
|
| 78 |
-
max_tokens=
|
| 79 |
)
|
| 80 |
except Exception as e:
|
| 81 |
logger.exception("AnalysisAgent failed")
|
| 82 |
response = f"Error during analysis: {str(e)}"
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
self.tasks_completed += 1
|
| 85 |
|
| 86 |
# Add processing metadata
|
| 87 |
processing_time = time.time() - start_time
|
| 88 |
result = {
|
| 89 |
-
"analysis":
|
| 90 |
"metadata": {
|
| 91 |
"processing_time": round(processing_time, 2),
|
| 92 |
"document_metadata": metadata,
|
| 93 |
"agent": self.name,
|
| 94 |
-
"tasks_completed": self.tasks_completed
|
|
|
|
|
|
|
| 95 |
}
|
| 96 |
}
|
| 97 |
|
|
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
+
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis
|
| 9 |
+
from utils.visual_output import VisualOutputGenerator
|
| 10 |
from config import Config
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
|
|
|
| 34 |
# Core Analysis Agent
|
| 35 |
# --------------------
|
| 36 |
class AnalysisAgent(BaseAgent):
|
| 37 |
+
    def __init__(self, name: str, model: str, tasks_completed: int = 0):
        """Initialize the analysis agent and its visual formatter.

        Args:
            name: Agent display name (forwarded to BaseAgent).
            model: Model identifier (forwarded to BaseAgent).
            tasks_completed: Initial completed-task counter (forwarded to BaseAgent).
        """
        super().__init__(name, model, tasks_completed)
        # One generator instance, reused by _process_content to decorate responses.
        self.visual_generator = VisualOutputGenerator()
|
| 40 |
+
|
| 41 |
+
def _calculate_dynamic_tokens(self, prompt: str, text_length: int) -> int:
|
| 42 |
+
"""Calculate dynamic token allocation based on prompt complexity and text length"""
|
| 43 |
+
base_tokens = Config.OPENAI_MAX_TOKENS
|
| 44 |
+
|
| 45 |
+
# Increase tokens for complex prompts
|
| 46 |
+
complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete', 'extensive']
|
| 47 |
+
complexity_multiplier = 1.0
|
| 48 |
+
for keyword in complex_keywords:
|
| 49 |
+
if keyword.lower() in prompt.lower():
|
| 50 |
+
complexity_multiplier += 0.3
|
| 51 |
+
|
| 52 |
+
# Increase tokens for longer documents
|
| 53 |
+
length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
|
| 54 |
+
|
| 55 |
+
# Increase tokens for specific document types
|
| 56 |
+
doc_type_keywords = ['whitepaper', 'research', 'technical', 'financial', 'legal', 'academic']
|
| 57 |
+
doc_type_multiplier = 1.0
|
| 58 |
+
for keyword in doc_type_keywords:
|
| 59 |
+
if keyword.lower() in prompt.lower():
|
| 60 |
+
doc_type_multiplier += 0.2
|
| 61 |
+
|
| 62 |
+
final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
|
| 63 |
+
return min(final_tokens, 4000) # Cap at 4000 tokens
|
| 64 |
+
|
| 65 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 66 |
start_time = time.time()
|
| 67 |
|
| 68 |
+
# Check cache first
|
| 69 |
+
if file_path:
|
| 70 |
+
cached_result = get_cached_analysis(file_path, prompt)
|
| 71 |
+
if cached_result:
|
| 72 |
+
logger.info(f"Returning cached analysis for {file_path}")
|
| 73 |
+
return cached_result
|
| 74 |
+
|
| 75 |
if file_path:
|
| 76 |
# Get document metadata
|
| 77 |
metadata = get_document_metadata(file_path)
|
|
|
|
| 81 |
|
| 82 |
# Check if document needs chunking
|
| 83 |
if len(text) > Config.CHUNK_SIZE:
|
| 84 |
+
result = await self._handle_large_document(prompt, text, metadata)
|
| 85 |
else:
|
| 86 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
| 87 |
+
result = await self._process_content(prompt, content, metadata, text)
|
| 88 |
else:
|
| 89 |
content = f"User prompt: {prompt}"
|
| 90 |
metadata = {}
|
| 91 |
+
result = await self._process_content(prompt, content, metadata, "")
|
| 92 |
|
| 93 |
+
# Cache the result
|
| 94 |
+
if file_path:
|
| 95 |
+
cache_analysis(file_path, prompt, result)
|
| 96 |
+
|
| 97 |
+
return result
|
| 98 |
+
|
| 99 |
+
async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
|
| 100 |
+
"""Process content with dynamic token allocation and visual formatting"""
|
| 101 |
+
start_time = time.time()
|
| 102 |
+
|
| 103 |
+
# Calculate dynamic tokens
|
| 104 |
+
max_tokens = self._calculate_dynamic_tokens(prompt, len(text))
|
| 105 |
+
|
| 106 |
+
system = """You are AnalysisAgent: produce crisp, visually appealing, and highly readable insights.
|
| 107 |
|
| 108 |
+
CRITICAL FORMATTING REQUIREMENTS:
|
| 109 |
- Use clear section headers with emojis (## 📋 Key Points, ## 🔍 Analysis, etc.)
|
| 110 |
+
- Create concise bullet points (max 1 line each)
|
| 111 |
+
- Use tables for data comparison when appropriate
|
| 112 |
- Include visual separators (---) between major sections
|
| 113 |
+
- Use **bold** for key concepts and numbers
|
| 114 |
+
- Keep sections short and scannable
|
| 115 |
+
- Prioritize actionable insights over lengthy explanations
|
| 116 |
|
| 117 |
CONTENT REQUIREMENTS:
|
| 118 |
+
- Be concise and to the point
|
| 119 |
+
- Use simple language even for technical topics
|
| 120 |
+
- Include specific numbers, percentages, and metrics
|
| 121 |
+
- Provide clear next steps or recommendations
|
| 122 |
+
- Structure information for quick scanning
|
| 123 |
+
- Focus on what matters most to the user"""
|
| 124 |
|
| 125 |
try:
|
| 126 |
response = await call_openai_chat(
|
|
|
|
| 128 |
messages=[{"role": "system", "content": system},
|
| 129 |
{"role": "user", "content": content}],
|
| 130 |
temperature=Config.OPENAI_TEMPERATURE,
|
| 131 |
+
max_tokens=max_tokens
|
| 132 |
)
|
| 133 |
except Exception as e:
|
| 134 |
logger.exception("AnalysisAgent failed")
|
| 135 |
response = f"Error during analysis: {str(e)}"
|
| 136 |
|
| 137 |
+
# Enhance with visual formatting
|
| 138 |
+
visual_response = self.visual_generator.format_analysis_with_visuals(response, metadata)
|
| 139 |
+
|
| 140 |
self.tasks_completed += 1
|
| 141 |
|
| 142 |
# Add processing metadata
|
| 143 |
processing_time = time.time() - start_time
|
| 144 |
result = {
|
| 145 |
+
"analysis": visual_response,
|
| 146 |
"metadata": {
|
| 147 |
"processing_time": round(processing_time, 2),
|
| 148 |
"document_metadata": metadata,
|
| 149 |
"agent": self.name,
|
| 150 |
+
"tasks_completed": self.tasks_completed,
|
| 151 |
+
"tokens_used": max_tokens,
|
| 152 |
+
"cached": False
|
| 153 |
}
|
| 154 |
}
|
| 155 |
|
app.py
CHANGED
|
@@ -87,16 +87,30 @@ def handle_analysis(file, prompt, username="anonymous", use_streaming=False):
|
|
| 87 |
validate_file_size(file)
|
| 88 |
path = save_uploaded_file(file, username)
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
except Exception as e:
|
| 99 |
-
return f"Error during analysis: {str(e)}",
|
| 100 |
|
| 101 |
def handle_batch_analysis(files, prompt, username="anonymous"):
|
| 102 |
"""Handle batch analysis of multiple PDFs"""
|
|
@@ -240,11 +254,9 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
|
|
| 240 |
label="Analysis Result",
|
| 241 |
show_copy_button=True
|
| 242 |
)
|
| 243 |
-
status_box = gr.
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
interactive=False,
|
| 247 |
-
info="Current processing status will appear here"
|
| 248 |
)
|
| 249 |
|
| 250 |
with gr.Column(scale=1):
|
|
|
|
| 87 |
validate_file_size(file)
|
| 88 |
path = save_uploaded_file(file, username)
|
| 89 |
|
| 90 |
+
# Check if this is a cached result
|
| 91 |
+
from utils import get_cached_analysis
|
| 92 |
+
cached_result = get_cached_analysis(path, prompt)
|
| 93 |
+
|
| 94 |
+
if cached_result:
|
| 95 |
+
status = "⚡ **Cached Result** - Instant response from previous analysis"
|
| 96 |
+
result = cached_result.get("analysis", "No analysis result.")
|
| 97 |
+
metadata = cached_result.get("metadata", {})
|
| 98 |
+
else:
|
| 99 |
+
status = "🔄 **Processing** - Analyzing document with AI..."
|
| 100 |
+
result = run_async(
|
| 101 |
+
ORCHESTRATOR.handle_user_prompt,
|
| 102 |
+
user_id=username,
|
| 103 |
+
prompt=prompt,
|
| 104 |
+
file_path=path,
|
| 105 |
+
targets=["analysis"]
|
| 106 |
+
)
|
| 107 |
+
result = result.get("analysis", "No analysis result.")
|
| 108 |
+
metadata = result.get("metadata", {}) if isinstance(result, dict) else {}
|
| 109 |
+
status = "✅ **Analysis Complete** - Fresh analysis generated"
|
| 110 |
+
|
| 111 |
+
return result, status, metadata
|
| 112 |
except Exception as e:
|
| 113 |
+
return f"Error during analysis: {str(e)}", f"❌ **Error** - {str(e)}", None
|
| 114 |
|
| 115 |
def handle_batch_analysis(files, prompt, username="anonymous"):
|
| 116 |
"""Handle batch analysis of multiple PDFs"""
|
|
|
|
| 254 |
label="Analysis Result",
|
| 255 |
show_copy_button=True
|
| 256 |
)
|
| 257 |
+
status_box = gr.Markdown(
|
| 258 |
+
value="**🔄 Status:** Ready to analyze documents\n\n**💡 Tip:** Same document + same prompt = instant cached response!",
|
| 259 |
+
label="Status & Performance"
|
|
|
|
|
|
|
| 260 |
)
|
| 261 |
|
| 262 |
with gr.Column(scale=1):
|
utils/__init__.py
CHANGED
|
@@ -48,12 +48,46 @@ async def call_openai_chat(model: str, messages: list, temperature=0.2, max_toke
|
|
| 48 |
# PDF Utilities
|
| 49 |
# ------------------------
|
| 50 |
def load_pdf_text(path: str) -> str:
|
| 51 |
-
"""Extract
|
| 52 |
-
|
| 53 |
with pdfplumber.open(path) as pdf:
|
| 54 |
-
for
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def save_text_as_file(text: str, suffix=".txt") -> str:
|
| 59 |
"""Save text to a temporary file"""
|
|
@@ -111,15 +145,52 @@ def get_file_hash(file_path: str) -> str:
|
|
| 111 |
return hashlib.md5(f.read()).hexdigest()
|
| 112 |
|
| 113 |
# ------------------------
|
| 114 |
-
# Caching System
|
| 115 |
# ------------------------
|
| 116 |
CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
| 117 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 120 |
"""Retrieve cached PDF text if available"""
|
| 121 |
file_hash = get_file_hash(file_path)
|
| 122 |
-
cache_file = CACHE_DIR / f"{file_hash}.json"
|
| 123 |
|
| 124 |
if cache_file.exists():
|
| 125 |
try:
|
|
@@ -135,7 +206,7 @@ def get_cached_text(file_path: str) -> Optional[str]:
|
|
| 135 |
def cache_text(file_path: str, text: str) -> None:
|
| 136 |
"""Cache PDF text for future use"""
|
| 137 |
file_hash = get_file_hash(file_path)
|
| 138 |
-
cache_file = CACHE_DIR / f"{file_hash}.json"
|
| 139 |
|
| 140 |
try:
|
| 141 |
cache_data = {
|
|
|
|
| 48 |
# PDF Utilities
|
| 49 |
# ------------------------
|
| 50 |
def load_pdf_text(path: str) -> str:
    """Extract comprehensive content from PDF using pdfplumber"""
    sections = []
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, 1):
            parts = []

            # Page text, when extractable.
            page_text = page.extract_text()
            if page_text:
                parts.append(f"=== PAGE {page_no} TEXT ===")
                parts.append(page_text)

            # Tables, rendered as pipe-separated rows.
            extracted_tables = page.extract_tables()
            if extracted_tables:
                parts.append(f"\n=== PAGE {page_no} TABLES ===")
                for tbl_no, tbl in enumerate(extracted_tables, 1):
                    parts.append(f"\n--- TABLE {tbl_no} ---")
                    for row in tbl:
                        if not row:  # Skip empty rows
                            continue
                        # Normalize cells before joining.
                        parts.append(" | ".join(cell.strip() if cell else "" for cell in row))

            # Image metadata only (no pixel data is extracted).
            page_images = page.images
            if page_images:
                parts.append(f"\n=== PAGE {page_no} IMAGES ===")
                for img_no, img in enumerate(page_images, 1):
                    parts.append(f"Image {img_no}: {img.get('width', 'unknown')}x{img.get('height', 'unknown')} pixels")

            # Per-page geometry metadata is always recorded.
            parts.append(f"\n=== PAGE {page_no} METADATA ===")
            parts.append(f"Page size: {page.width}x{page.height}")
            parts.append(f"Rotation: {page.rotation}")

            if parts:
                sections.append("\n".join(parts))

    return "\n\n".join(sections)
|
| 91 |
|
| 92 |
def save_text_as_file(text: str, suffix=".txt") -> str:
|
| 93 |
"""Save text to a temporary file"""
|
|
|
|
| 145 |
return hashlib.md5(f.read()).hexdigest()
|
| 146 |
|
| 147 |
# ------------------------
|
| 148 |
+
# Enhanced Caching System
|
| 149 |
# ------------------------
|
| 150 |
CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
| 151 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 152 |
|
| 153 |
+
def get_cached_analysis(file_path: str, prompt: str) -> Optional[Dict[str, Any]]:
    """Retrieve a cached analysis for (file, prompt) if present and fresh.

    The cache key combines the file-content hash and the prompt hash, so a
    change to either yields a miss. Entries expire after 24 hours.

    Args:
        file_path: Path to the analyzed file.
        prompt: The analysis prompt used as part of the cache key.

    Returns:
        The cached analysis payload, or None on miss, expiry, or a
        corrupt/unreadable cache entry.
    """
    file_hash = get_file_hash(file_path)
    prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
    cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"

    if cache_file.exists():
        # Narrowed from a bare `except Exception`: only I/O failures, corrupt
        # JSON, or malformed payloads (e.g. non-numeric 'cached_at') are
        # treated as cache misses; programming errors now surface.
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            # Defensive re-check of both hashes (already encoded in the
            # filename) plus a 24-hour freshness window.
            if (cache_data.get('file_hash') == file_hash and
                    cache_data.get('prompt_hash') == prompt_hash and
                    time.time() - cache_data.get('cached_at', 0) < 86400):  # 24 hours
                return cache_data.get('analysis')
        except (OSError, json.JSONDecodeError, TypeError, AttributeError):
            pass
    return None
|
| 171 |
+
|
| 172 |
+
def cache_analysis(file_path: str, prompt: str, analysis: Dict[str, Any]) -> None:
    """Persist an analysis result keyed by file-content hash + prompt hash.

    Caching is deliberately best-effort: disk errors and non-serializable
    payloads are swallowed so the analysis itself still succeeds.

    Args:
        file_path: Path to the analyzed file.
        prompt: The analysis prompt used as part of the cache key.
        analysis: JSON-serializable analysis payload to store.
    """
    file_hash = get_file_hash(file_path)
    prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
    cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"

    # Building the dict cannot fail; only the write/serialization is guarded.
    cache_data = {
        'file_hash': file_hash,
        'prompt_hash': prompt_hash,
        'analysis': analysis,
        'cached_at': time.time()
    }
    # Narrowed from a bare `except Exception`: OSError covers disk/permission
    # failures; TypeError/ValueError cover non-serializable payloads.
    try:
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False)
    except (OSError, TypeError, ValueError):
        pass  # Fail silently if caching fails
|
| 189 |
+
|
| 190 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 191 |
"""Retrieve cached PDF text if available"""
|
| 192 |
file_hash = get_file_hash(file_path)
|
| 193 |
+
cache_file = CACHE_DIR / f"{file_hash}_text.json"
|
| 194 |
|
| 195 |
if cache_file.exists():
|
| 196 |
try:
|
|
|
|
| 206 |
def cache_text(file_path: str, text: str) -> None:
|
| 207 |
"""Cache PDF text for future use"""
|
| 208 |
file_hash = get_file_hash(file_path)
|
| 209 |
+
cache_file = CACHE_DIR / f"{file_hash}_text.json"
|
| 210 |
|
| 211 |
try:
|
| 212 |
cache_data = {
|
utils/visual_output.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/visual_output.py - Visual output generation for PDF Analysis & Orchestrator
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, List, Any, Optional
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
class VisualOutputGenerator:
    """Generate visual representations of analysis results.

    Every ``create_*`` method returns a Markdown/HTML snippet string; nothing
    is rendered here — callers embed the snippets in their own output.
    """

    def __init__(self):
        # Kept for interface compatibility; current methods do not read it.
        self.visual_elements = []

    def create_infographic(self, data: Dict[str, Any], title: str = "Analysis Summary") -> str:
        """Create an infographic-style summary from data['metrics'] (if present)."""
        visual = f"""
## 📊 {title}

<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; margin: 10px 0;">
"""

        # Key metrics
        if 'metrics' in data:
            visual += """
<div style="display: flex; justify-content: space-around; margin: 20px 0;">
"""
            for metric, value in data['metrics'].items():
                visual += f"""
<div style="text-align: center; background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; margin: 5px;">
<h3 style="margin: 0; font-size: 24px;">{value}</h3>
<p style="margin: 5px 0 0 0; font-size: 14px;">{metric}</p>
</div>
"""
            visual += "</div>"

        visual += "</div>"
        return visual

    def create_data_table(self, data: List[Dict[str, Any]], title: str = "Data Table") -> str:
        """Create a Markdown table from a list of uniform row dicts.

        Headers are taken from the first row; missing keys render as "".
        Returns "" for empty input.
        """
        if not data:
            return ""

        # Get headers from first row
        headers = list(data[0].keys())

        table = f"""
## 📋 {title}

| {' | '.join(headers)} |
| {' | '.join(['---'] * len(headers))} |
"""

        for row in data:
            values = [str(row.get(header, '')) for header in headers]
            table += f"| {' | '.join(values)} |\n"

        return table

    def create_progress_bar(self, value: float, max_value: float, label: str) -> str:
        """Create a progress bar visualization; non-positive max renders 0%."""
        percentage = min(100, (value / max_value) * 100) if max_value > 0 else 0

        return f"""
<div style="margin: 10px 0;">
<p style="margin: 5px 0; font-weight: bold;">{label}: {value:.1f}/{max_value:.1f} ({percentage:.1f}%)</p>
<div style="background: #e0e0e0; border-radius: 10px; height: 20px; overflow: hidden;">
<div style="background: linear-gradient(90deg, #4CAF50, #8BC34A); height: 100%; width: {percentage}%; transition: width 0.3s ease;"></div>
</div>
</div>
"""

    def create_timeline(self, events: List[Dict[str, str]], title: str = "Timeline") -> str:
        """Create a timeline visualization from {'title','description','date'} events."""
        timeline = f"""
## ⏰ {title}

<div style="position: relative; padding-left: 30px; margin: 20px 0;">
"""

        # (The original enumerated the events but never used the index.)
        for event in events:
            timeline += f"""
<div style="position: relative; margin-bottom: 20px;">
<div style="position: absolute; left: -25px; top: 5px; width: 12px; height: 12px; background: #4CAF50; border-radius: 50%; border: 3px solid white; box-shadow: 0 0 0 3px #4CAF50;"></div>
<div style="background: #f5f5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #4CAF50;">
<h4 style="margin: 0 0 5px 0; color: #333;">{event.get('title', 'Event')}</h4>
<p style="margin: 0; color: #666;">{event.get('description', '')}</p>
<small style="color: #999;">{event.get('date', '')}</small>
</div>
</div>
"""

        timeline += "</div>"
        return timeline

    def create_comparison_chart(self, data: Dict[str, float], title: str = "Comparison") -> str:
        """Create a comparison chart: horizontal bars scaled to the max value.

        Returns "" for empty input.
        """
        if not data:
            return ""

        max_value = max(data.values())

        chart = f"""
## 📈 {title}

<div style="margin: 20px 0;">
"""

        for label, value in data.items():
            # FIX: the original divided unconditionally and raised
            # ZeroDivisionError when every value was 0 (its `if data.values()`
            # guard is always true for a non-empty dict). Render 0% instead.
            percentage = (value / max_value) * 100 if max_value else 0.0
            chart += f"""
<div style="margin: 10px 0;">
<div style="display: flex; justify-content: space-between; margin-bottom: 5px;">
<span style="font-weight: bold;">{label}</span>
<span style="color: #666;">{value:.1f}</span>
</div>
<div style="background: #e0e0e0; border-radius: 5px; height: 20px; overflow: hidden;">
<div style="background: linear-gradient(90deg, #2196F3, #21CBF3); height: 100%; width: {percentage}%; transition: width 0.3s ease;"></div>
</div>
</div>
"""

        chart += "</div>"
        return chart

    def create_key_points(self, points: List[str], title: str = "Key Points") -> str:
        """Create a numbered, card-styled key points section; "" for no points."""
        if not points:
            return ""

        visual = f"""
## 💡 {title}

<div style="display: grid; gap: 15px; margin: 20px 0;">
"""

        for i, point in enumerate(points, 1):
            visual += f"""
<div style="background: #f8f9fa; border-left: 4px solid #007bff; padding: 15px; border-radius: 0 8px 8px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
<div style="display: flex; align-items: flex-start;">
<span style="background: #007bff; color: white; border-radius: 50%; width: 24px; height: 24px; display: flex; align-items: center; justify-content: center; font-weight: bold; margin-right: 12px; flex-shrink: 0;">{i}</span>
<p style="margin: 0; line-height: 1.5;">{point}</p>
</div>
</div>
"""

        visual += "</div>"
        return visual

    def create_alert_box(self, message: str, alert_type: str = "info") -> str:
        """Create an alert box; alert_type is one of info/success/warning/error
        (unknown types fall back to "info" styling)."""
        colors = {
            "info": "#2196F3",
            "success": "#4CAF50",
            "warning": "#FF9800",
            "error": "#F44336"
        }

        icons = {
            "info": "ℹ️",
            "success": "✅",
            "warning": "⚠️",
            "error": "❌"
        }

        color = colors.get(alert_type, colors["info"])
        icon = icons.get(alert_type, icons["info"])

        return f"""
<div style="background: {color}15; border: 1px solid {color}; border-radius: 8px; padding: 15px; margin: 15px 0; display: flex; align-items: flex-start;">
<span style="font-size: 20px; margin-right: 10px;">{icon}</span>
<p style="margin: 0; color: {color}; font-weight: 500;">{message}</p>
</div>
"""

    def create_metric_cards(self, metrics: Dict[str, Any], title: str = "Key Metrics") -> str:
        """Create a responsive grid of metric cards; "" for empty input."""
        if not metrics:
            return ""

        cards = f"""
## 📊 {title}

<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;">
"""

        for metric, value in metrics.items():
            cards += f"""
<div style="background: white; border: 1px solid #e0e0e0; border-radius: 8px; padding: 20px; text-align: center; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
<h3 style="margin: 0 0 10px 0; color: #333; font-size: 28px;">{value}</h3>
<p style="margin: 0; color: #666; font-size: 14px; text-transform: uppercase; letter-spacing: 0.5px;">{metric}</p>
</div>
"""

        cards += "</div>"
        return cards

    def format_analysis_with_visuals(self, analysis_text: str, document_metadata: Optional[Dict[str, Any]] = None) -> str:
        """Prepend auto-extracted visual elements to the analysis text.

        Builds document-info cards (when metadata is given), key-point cards,
        and metric cards scraped from the text; returns the text unchanged
        when nothing visual could be extracted.
        """
        # FIX: annotation was `Dict[str, Any] = None`; None is a valid default,
        # so the parameter is Optional.
        visual_elements = []

        # Add document info if available
        if document_metadata:
            visual_elements.append(self.create_metric_cards({
                "Pages": document_metadata.get('page_count', 'Unknown'),
                "File Size": f"{document_metadata.get('file_size', 0) / 1024:.1f} KB",
                "Processing Time": f"{document_metadata.get('processing_time', 0):.1f}s"
            }, "Document Information"))

        # Try to extract key points from analysis
        key_points = self._extract_key_points(analysis_text)
        if key_points:
            visual_elements.append(self.create_key_points(key_points))

        # Try to extract metrics
        metrics = self._extract_metrics(analysis_text)
        if metrics:
            visual_elements.append(self.create_metric_cards(metrics, "Key Metrics"))

        # Combine visual elements with analysis
        result = analysis_text

        if visual_elements:
            result = "\n\n".join(visual_elements) + "\n\n---\n\n" + analysis_text

        return result

    def _extract_key_points(self, text: str) -> List[str]:
        """Extract up to five bullet/numbered list items (>10 chars) from text."""
        points = []

        # Extract bullet points
        bullet_pattern = r'[-•*]\s+(.+?)(?=\n|$)'
        bullets = re.findall(bullet_pattern, text, re.MULTILINE)
        points.extend([bullet.strip() for bullet in bullets if len(bullet.strip()) > 10])

        # Extract numbered points
        number_pattern = r'\d+\.\s+(.+?)(?=\n|$)'
        numbers = re.findall(number_pattern, text, re.MULTILINE)
        points.extend([num.strip() for num in numbers if len(num.strip()) > 10])

        # Limit to top 5 points
        return points[:5]

    def _extract_metrics(self, text: str) -> Dict[str, str]:
        """Scrape simple metrics (first percentage, first counted quantity) from text.

        NOTE(review): the labels "Success Rate"/"Total Items" are heuristic —
        the first percentage in the text may not be a success rate.
        """
        metrics = {}

        # Look for percentage patterns
        percent_pattern = r'(\d+(?:\.\d+)?%)'
        percentages = re.findall(percent_pattern, text)
        if percentages:
            metrics["Success Rate"] = percentages[0]

        # Look for number patterns
        number_pattern = r'(\d+(?:,\d+)*(?:\.\d+)?)\s+(?:pages?|items?|points?|years?|months?)'
        numbers = re.findall(number_pattern, text, re.IGNORECASE)
        if numbers:
            metrics["Total Items"] = numbers[0]

        return metrics
|