Commit
·
bb3909a
1
Parent(s):
624de5a
Major performance and UX improvements: caching, visual outputs, dynamic tokens, enhanced document processing
Browse files
- agents.py +74 -16
- app.py +26 -14
- utils/__init__.py +79 -8
- utils/visual_output.py +262 -0
agents.py
CHANGED
|
@@ -5,7 +5,8 @@ import logging
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
-
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata
|
|
|
|
| 9 |
from config import Config
|
| 10 |
|
| 11 |
logger = logging.getLogger(__name__)
|
|
@@ -33,9 +34,44 @@ class BaseAgent:
|
|
| 33 |
# Core Analysis Agent
|
| 34 |
# --------------------
|
| 35 |
class AnalysisAgent(BaseAgent):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 37 |
start_time = time.time()
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
if file_path:
|
| 40 |
# Get document metadata
|
| 41 |
metadata = get_document_metadata(file_path)
|
|
@@ -45,29 +81,46 @@ class AnalysisAgent(BaseAgent):
|
|
| 45 |
|
| 46 |
# Check if document needs chunking
|
| 47 |
if len(text) > Config.CHUNK_SIZE:
|
| 48 |
-
|
| 49 |
else:
|
| 50 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
|
|
|
| 51 |
else:
|
| 52 |
content = f"User prompt: {prompt}"
|
| 53 |
metadata = {}
|
|
|
|
| 54 |
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
-
FORMATTING REQUIREMENTS:
|
| 58 |
- Use clear section headers with emojis (## 📋 Key Points, ## 🔍 Analysis, etc.)
|
| 59 |
-
-
|
|
|
|
| 60 |
- Include visual separators (---) between major sections
|
| 61 |
-
- Use bold
|
| 62 |
-
-
|
| 63 |
-
-
|
| 64 |
|
| 65 |
CONTENT REQUIREMENTS:
|
| 66 |
-
-
|
| 67 |
-
-
|
| 68 |
-
-
|
| 69 |
-
-
|
| 70 |
-
- Structure information
|
|
|
|
| 71 |
|
| 72 |
try:
|
| 73 |
response = await call_openai_chat(
|
|
@@ -75,23 +128,28 @@ CONTENT REQUIREMENTS:
|
|
| 75 |
messages=[{"role": "system", "content": system},
|
| 76 |
{"role": "user", "content": content}],
|
| 77 |
temperature=Config.OPENAI_TEMPERATURE,
|
| 78 |
-
max_tokens=
|
| 79 |
)
|
| 80 |
except Exception as e:
|
| 81 |
logger.exception("AnalysisAgent failed")
|
| 82 |
response = f"Error during analysis: {str(e)}"
|
| 83 |
|
|
|
|
|
|
|
|
|
|
| 84 |
self.tasks_completed += 1
|
| 85 |
|
| 86 |
# Add processing metadata
|
| 87 |
processing_time = time.time() - start_time
|
| 88 |
result = {
|
| 89 |
-
"analysis":
|
| 90 |
"metadata": {
|
| 91 |
"processing_time": round(processing_time, 2),
|
| 92 |
"document_metadata": metadata,
|
| 93 |
"agent": self.name,
|
| 94 |
-
"tasks_completed": self.tasks_completed
|
|
|
|
|
|
|
| 95 |
}
|
| 96 |
}
|
| 97 |
|
|
|
|
| 5 |
from typing import Optional, Dict, Any, List, AsyncGenerator
|
| 6 |
import time
|
| 7 |
|
| 8 |
+
from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis
|
| 9 |
+
from utils.visual_output import VisualOutputGenerator
|
| 10 |
from config import Config
|
| 11 |
|
| 12 |
logger = logging.getLogger(__name__)
|
|
|
|
| 34 |
# Core Analysis Agent
|
| 35 |
# --------------------
|
| 36 |
class AnalysisAgent(BaseAgent):
|
| 37 |
+
    def __init__(self, name: str, model: str, tasks_completed: int = 0):
        """Initialize the analysis agent and its visual formatter.

        Args:
            name: Agent display name (forwarded to BaseAgent).
            model: Model identifier (forwarded to BaseAgent).
            tasks_completed: Initial completed-task counter (forwarded to BaseAgent).
        """
        super().__init__(name, model, tasks_completed)
        # One generator instance, reused by _process_content to decorate responses.
        self.visual_generator = VisualOutputGenerator()
|
| 40 |
+
|
| 41 |
+
def _calculate_dynamic_tokens(self, prompt: str, text_length: int) -> int:
|
| 42 |
+
"""Calculate dynamic token allocation based on prompt complexity and text length"""
|
| 43 |
+
base_tokens = Config.OPENAI_MAX_TOKENS
|
| 44 |
+
|
| 45 |
+
# Increase tokens for complex prompts
|
| 46 |
+
complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete', 'extensive']
|
| 47 |
+
complexity_multiplier = 1.0
|
| 48 |
+
for keyword in complex_keywords:
|
| 49 |
+
if keyword.lower() in prompt.lower():
|
| 50 |
+
complexity_multiplier += 0.3
|
| 51 |
+
|
| 52 |
+
# Increase tokens for longer documents
|
| 53 |
+
length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
|
| 54 |
+
|
| 55 |
+
# Increase tokens for specific document types
|
| 56 |
+
doc_type_keywords = ['whitepaper', 'research', 'technical', 'financial', 'legal', 'academic']
|
| 57 |
+
doc_type_multiplier = 1.0
|
| 58 |
+
for keyword in doc_type_keywords:
|
| 59 |
+
if keyword.lower() in prompt.lower():
|
| 60 |
+
doc_type_multiplier += 0.2
|
| 61 |
+
|
| 62 |
+
final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
|
| 63 |
+
return min(final_tokens, 4000) # Cap at 4000 tokens
|
| 64 |
+
|
| 65 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 66 |
start_time = time.time()
|
| 67 |
|
| 68 |
+
# Check cache first
|
| 69 |
+
if file_path:
|
| 70 |
+
cached_result = get_cached_analysis(file_path, prompt)
|
| 71 |
+
if cached_result:
|
| 72 |
+
logger.info(f"Returning cached analysis for {file_path}")
|
| 73 |
+
return cached_result
|
| 74 |
+
|
| 75 |
if file_path:
|
| 76 |
# Get document metadata
|
| 77 |
metadata = get_document_metadata(file_path)
|
|
|
|
| 81 |
|
| 82 |
# Check if document needs chunking
|
| 83 |
if len(text) > Config.CHUNK_SIZE:
|
| 84 |
+
result = await self._handle_large_document(prompt, text, metadata)
|
| 85 |
else:
|
| 86 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
| 87 |
+
result = await self._process_content(prompt, content, metadata, text)
|
| 88 |
else:
|
| 89 |
content = f"User prompt: {prompt}"
|
| 90 |
metadata = {}
|
| 91 |
+
result = await self._process_content(prompt, content, metadata, "")
|
| 92 |
|
| 93 |
+
# Cache the result
|
| 94 |
+
if file_path:
|
| 95 |
+
cache_analysis(file_path, prompt, result)
|
| 96 |
+
|
| 97 |
+
return result
|
| 98 |
+
|
| 99 |
+
async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
|
| 100 |
+
"""Process content with dynamic token allocation and visual formatting"""
|
| 101 |
+
start_time = time.time()
|
| 102 |
+
|
| 103 |
+
# Calculate dynamic tokens
|
| 104 |
+
max_tokens = self._calculate_dynamic_tokens(prompt, len(text))
|
| 105 |
+
|
| 106 |
+
system = """You are AnalysisAgent: produce crisp, visually appealing, and highly readable insights.
|
| 107 |
|
| 108 |
+
CRITICAL FORMATTING REQUIREMENTS:
|
| 109 |
- Use clear section headers with emojis (## 📋 Key Points, ## 🔍 Analysis, etc.)
|
| 110 |
+
- Create concise bullet points (max 1 line each)
|
| 111 |
+
- Use tables for data comparison when appropriate
|
| 112 |
- Include visual separators (---) between major sections
|
| 113 |
+
- Use **bold** for key concepts and numbers
|
| 114 |
+
- Keep sections short and scannable
|
| 115 |
+
- Prioritize actionable insights over lengthy explanations
|
| 116 |
|
| 117 |
CONTENT REQUIREMENTS:
|
| 118 |
+
- Be concise and to the point
|
| 119 |
+
- Use simple language even for technical topics
|
| 120 |
+
- Include specific numbers, percentages, and metrics
|
| 121 |
+
- Provide clear next steps or recommendations
|
| 122 |
+
- Structure information for quick scanning
|
| 123 |
+
- Focus on what matters most to the user"""
|
| 124 |
|
| 125 |
try:
|
| 126 |
response = await call_openai_chat(
|
|
|
|
| 128 |
messages=[{"role": "system", "content": system},
|
| 129 |
{"role": "user", "content": content}],
|
| 130 |
temperature=Config.OPENAI_TEMPERATURE,
|
| 131 |
+
max_tokens=max_tokens
|
| 132 |
)
|
| 133 |
except Exception as e:
|
| 134 |
logger.exception("AnalysisAgent failed")
|
| 135 |
response = f"Error during analysis: {str(e)}"
|
| 136 |
|
| 137 |
+
# Enhance with visual formatting
|
| 138 |
+
visual_response = self.visual_generator.format_analysis_with_visuals(response, metadata)
|
| 139 |
+
|
| 140 |
self.tasks_completed += 1
|
| 141 |
|
| 142 |
# Add processing metadata
|
| 143 |
processing_time = time.time() - start_time
|
| 144 |
result = {
|
| 145 |
+
"analysis": visual_response,
|
| 146 |
"metadata": {
|
| 147 |
"processing_time": round(processing_time, 2),
|
| 148 |
"document_metadata": metadata,
|
| 149 |
"agent": self.name,
|
| 150 |
+
"tasks_completed": self.tasks_completed,
|
| 151 |
+
"tokens_used": max_tokens,
|
| 152 |
+
"cached": False
|
| 153 |
}
|
| 154 |
}
|
| 155 |
|
app.py
CHANGED
|
@@ -87,16 +87,30 @@ def handle_analysis(file, prompt, username="anonymous", use_streaming=False):
|
|
| 87 |
validate_file_size(file)
|
| 88 |
path = save_uploaded_file(file, username)
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
except Exception as e:
|
| 99 |
-
return f"Error during analysis: {str(e)}",
|
| 100 |
|
| 101 |
def handle_batch_analysis(files, prompt, username="anonymous"):
|
| 102 |
"""Handle batch analysis of multiple PDFs"""
|
|
@@ -240,11 +254,9 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
|
|
| 240 |
label="Analysis Result",
|
| 241 |
show_copy_button=True
|
| 242 |
)
|
| 243 |
-
status_box = gr.
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
interactive=False,
|
| 247 |
-
info="Current processing status will appear here"
|
| 248 |
)
|
| 249 |
|
| 250 |
with gr.Column(scale=1):
|
|
|
|
| 87 |
validate_file_size(file)
|
| 88 |
path = save_uploaded_file(file, username)
|
| 89 |
|
| 90 |
+
# Check if this is a cached result
|
| 91 |
+
from utils import get_cached_analysis
|
| 92 |
+
cached_result = get_cached_analysis(path, prompt)
|
| 93 |
+
|
| 94 |
+
if cached_result:
|
| 95 |
+
status = "⚡ **Cached Result** - Instant response from previous analysis"
|
| 96 |
+
result = cached_result.get("analysis", "No analysis result.")
|
| 97 |
+
metadata = cached_result.get("metadata", {})
|
| 98 |
+
else:
|
| 99 |
+
status = "🔄 **Processing** - Analyzing document with AI..."
|
| 100 |
+
result = run_async(
|
| 101 |
+
ORCHESTRATOR.handle_user_prompt,
|
| 102 |
+
user_id=username,
|
| 103 |
+
prompt=prompt,
|
| 104 |
+
file_path=path,
|
| 105 |
+
targets=["analysis"]
|
| 106 |
+
)
|
| 107 |
+
result = result.get("analysis", "No analysis result.")
|
| 108 |
+
metadata = result.get("metadata", {}) if isinstance(result, dict) else {}
|
| 109 |
+
status = "✅ **Analysis Complete** - Fresh analysis generated"
|
| 110 |
+
|
| 111 |
+
return result, status, metadata
|
| 112 |
except Exception as e:
|
| 113 |
+
return f"Error during analysis: {str(e)}", f"❌ **Error** - {str(e)}", None
|
| 114 |
|
| 115 |
def handle_batch_analysis(files, prompt, username="anonymous"):
|
| 116 |
"""Handle batch analysis of multiple PDFs"""
|
|
|
|
| 254 |
label="Analysis Result",
|
| 255 |
show_copy_button=True
|
| 256 |
)
|
| 257 |
+
status_box = gr.Markdown(
|
| 258 |
+
value="**🔄 Status:** Ready to analyze documents\n\n**💡 Tip:** Same document + same prompt = instant cached response!",
|
| 259 |
+
label="Status & Performance"
|
|
|
|
|
|
|
| 260 |
)
|
| 261 |
|
| 262 |
with gr.Column(scale=1):
|
utils/__init__.py
CHANGED
|
@@ -48,12 +48,46 @@ async def call_openai_chat(model: str, messages: list, temperature=0.2, max_toke
|
|
| 48 |
# PDF Utilities
|
| 49 |
# ------------------------
|
| 50 |
def load_pdf_text(path: str) -> str:
|
| 51 |
-
"""Extract
|
| 52 |
-
|
| 53 |
with pdfplumber.open(path) as pdf:
|
| 54 |
-
for
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def save_text_as_file(text: str, suffix=".txt") -> str:
|
| 59 |
"""Save text to a temporary file"""
|
|
@@ -111,15 +145,52 @@ def get_file_hash(file_path: str) -> str:
|
|
| 111 |
return hashlib.md5(f.read()).hexdigest()
|
| 112 |
|
| 113 |
# ------------------------
|
| 114 |
-
# Caching System
|
| 115 |
# ------------------------
|
| 116 |
CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
| 117 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 120 |
"""Retrieve cached PDF text if available"""
|
| 121 |
file_hash = get_file_hash(file_path)
|
| 122 |
-
cache_file = CACHE_DIR / f"{file_hash}.json"
|
| 123 |
|
| 124 |
if cache_file.exists():
|
| 125 |
try:
|
|
@@ -135,7 +206,7 @@ def get_cached_text(file_path: str) -> Optional[str]:
|
|
| 135 |
def cache_text(file_path: str, text: str) -> None:
|
| 136 |
"""Cache PDF text for future use"""
|
| 137 |
file_hash = get_file_hash(file_path)
|
| 138 |
-
cache_file = CACHE_DIR / f"{file_hash}.json"
|
| 139 |
|
| 140 |
try:
|
| 141 |
cache_data = {
|
|
|
|
| 48 |
# PDF Utilities
|
| 49 |
# ------------------------
|
| 50 |
def load_pdf_text(path: str) -> str:
    """Extract comprehensive content from PDF using pdfplumber"""
    sections = []
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, 1):
            parts = []

            # Page text, when extractable.
            page_text = page.extract_text()
            if page_text:
                parts.append(f"=== PAGE {page_no} TEXT ===")
                parts.append(page_text)

            # Tables, rendered as pipe-separated rows.
            extracted_tables = page.extract_tables()
            if extracted_tables:
                parts.append(f"\n=== PAGE {page_no} TABLES ===")
                for tbl_no, tbl in enumerate(extracted_tables, 1):
                    parts.append(f"\n--- TABLE {tbl_no} ---")
                    for row in tbl:
                        if not row:  # Skip empty rows
                            continue
                        # Normalize cells before joining.
                        parts.append(" | ".join(cell.strip() if cell else "" for cell in row))

            # Image metadata only (no pixel data is extracted).
            page_images = page.images
            if page_images:
                parts.append(f"\n=== PAGE {page_no} IMAGES ===")
                for img_no, img in enumerate(page_images, 1):
                    parts.append(f"Image {img_no}: {img.get('width', 'unknown')}x{img.get('height', 'unknown')} pixels")

            # Per-page geometry metadata is always recorded.
            parts.append(f"\n=== PAGE {page_no} METADATA ===")
            parts.append(f"Page size: {page.width}x{page.height}")
            parts.append(f"Rotation: {page.rotation}")

            if parts:
                sections.append("\n".join(parts))

    return "\n\n".join(sections)
|
| 91 |
|
| 92 |
def save_text_as_file(text: str, suffix=".txt") -> str:
|
| 93 |
"""Save text to a temporary file"""
|
|
|
|
| 145 |
return hashlib.md5(f.read()).hexdigest()
|
| 146 |
|
| 147 |
# ------------------------
|
| 148 |
+
# Enhanced Caching System
|
| 149 |
# ------------------------
|
| 150 |
CACHE_DIR = Path(tempfile.gettempdir()) / "pdf_analysis_cache"
|
| 151 |
CACHE_DIR.mkdir(exist_ok=True)
|
| 152 |
|
| 153 |
+
def get_cached_analysis(file_path: str, prompt: str) -> Optional[Dict[str, Any]]:
    """Retrieve a cached analysis for (file, prompt) if present and fresh.

    The cache key combines the file-content hash and the prompt hash, so a
    change to either yields a miss. Entries expire after 24 hours.

    Args:
        file_path: Path to the analyzed file.
        prompt: The analysis prompt used as part of the cache key.

    Returns:
        The cached analysis payload, or None on miss, expiry, or a
        corrupt/unreadable cache entry.
    """
    file_hash = get_file_hash(file_path)
    prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
    cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"

    if cache_file.exists():
        # Narrowed from a bare `except Exception`: only I/O failures, corrupt
        # JSON, or malformed payloads (e.g. non-numeric 'cached_at') are
        # treated as cache misses; programming errors now surface.
        try:
            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)
            # Defensive re-check of both hashes (already encoded in the
            # filename) plus a 24-hour freshness window.
            if (cache_data.get('file_hash') == file_hash and
                    cache_data.get('prompt_hash') == prompt_hash and
                    time.time() - cache_data.get('cached_at', 0) < 86400):  # 24 hours
                return cache_data.get('analysis')
        except (OSError, json.JSONDecodeError, TypeError, AttributeError):
            pass
    return None
|
| 171 |
+
|
| 172 |
+
def cache_analysis(file_path: str, prompt: str, analysis: Dict[str, Any]) -> None:
    """Persist an analysis result keyed by file-content hash + prompt hash.

    Caching is deliberately best-effort: disk errors and non-serializable
    payloads are swallowed so the analysis itself still succeeds.

    Args:
        file_path: Path to the analyzed file.
        prompt: The analysis prompt used as part of the cache key.
        analysis: JSON-serializable analysis payload to store.
    """
    file_hash = get_file_hash(file_path)
    prompt_hash = hashlib.md5(prompt.encode()).hexdigest()
    cache_file = CACHE_DIR / f"{file_hash}_{prompt_hash}.json"

    # Building the dict cannot fail; only the write/serialization is guarded.
    cache_data = {
        'file_hash': file_hash,
        'prompt_hash': prompt_hash,
        'analysis': analysis,
        'cached_at': time.time()
    }
    # Narrowed from a bare `except Exception`: OSError covers disk/permission
    # failures; TypeError/ValueError cover non-serializable payloads.
    try:
        with open(cache_file, 'w', encoding='utf-8') as f:
            json.dump(cache_data, f, ensure_ascii=False)
    except (OSError, TypeError, ValueError):
        pass  # Fail silently if caching fails
|
| 189 |
+
|
| 190 |
def get_cached_text(file_path: str) -> Optional[str]:
|
| 191 |
"""Retrieve cached PDF text if available"""
|
| 192 |
file_hash = get_file_hash(file_path)
|
| 193 |
+
cache_file = CACHE_DIR / f"{file_hash}_text.json"
|
| 194 |
|
| 195 |
if cache_file.exists():
|
| 196 |
try:
|
|
|
|
| 206 |
def cache_text(file_path: str, text: str) -> None:
|
| 207 |
"""Cache PDF text for future use"""
|
| 208 |
file_hash = get_file_hash(file_path)
|
| 209 |
+
cache_file = CACHE_DIR / f"{file_hash}_text.json"
|
| 210 |
|
| 211 |
try:
|
| 212 |
cache_data = {
|
utils/visual_output.py
ADDED
|
@@ -0,0 +1,262 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# utils/visual_output.py - Visual output generation for PDF Analysis & Orchestrator
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
from typing import Dict, List, Any, Optional
|
| 5 |
+
from datetime import datetime
|
| 6 |
+
|
| 7 |
+
class VisualOutputGenerator:
    """Generate visual representations of analysis results.

    Every ``create_*`` method returns a Markdown/HTML snippet string; nothing
    is rendered here — callers embed the snippets in their own output.
    """

    def __init__(self):
        # Kept for interface compatibility; current methods do not read it.
        self.visual_elements = []

    def create_infographic(self, data: Dict[str, Any], title: str = "Analysis Summary") -> str:
        """Create an infographic-style summary from data['metrics'] (if present)."""
        visual = f"""
## 📊 {title}

<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; margin: 10px 0;">
"""

        # Key metrics
        if 'metrics' in data:
            visual += """
<div style="display: flex; justify-content: space-around; margin: 20px 0;">
"""
            for metric, value in data['metrics'].items():
                visual += f"""
<div style="text-align: center; background: rgba(255,255,255,0.2); padding: 15px; border-radius: 8px; margin: 5px;">
<h3 style="margin: 0; font-size: 24px;">{value}</h3>
<p style="margin: 5px 0 0 0; font-size: 14px;">{metric}</p>
</div>
"""
            visual += "</div>"

        visual += "</div>"
        return visual

    def create_data_table(self, data: List[Dict[str, Any]], title: str = "Data Table") -> str:
        """Create a Markdown table from a list of uniform row dicts.

        Headers are taken from the first row; missing keys render as "".
        Returns "" for empty input.
        """
        if not data:
            return ""

        # Get headers from first row
        headers = list(data[0].keys())

        table = f"""
## 📋 {title}

| {' | '.join(headers)} |
| {' | '.join(['---'] * len(headers))} |
"""

        for row in data:
            values = [str(row.get(header, '')) for header in headers]
            table += f"| {' | '.join(values)} |\n"

        return table

    def create_progress_bar(self, value: float, max_value: float, label: str) -> str:
        """Create a progress bar visualization; non-positive max renders 0%."""
        percentage = min(100, (value / max_value) * 100) if max_value > 0 else 0

        return f"""
<div style="margin: 10px 0;">
<p style="margin: 5px 0; font-weight: bold;">{label}: {value:.1f}/{max_value:.1f} ({percentage:.1f}%)</p>
<div style="background: #e0e0e0; border-radius: 10px; height: 20px; overflow: hidden;">
<div style="background: linear-gradient(90deg, #4CAF50, #8BC34A); height: 100%; width: {percentage}%; transition: width 0.3s ease;"></div>
</div>
</div>
"""

    def create_timeline(self, events: List[Dict[str, str]], title: str = "Timeline") -> str:
        """Create a timeline visualization from {'title','description','date'} events."""
        timeline = f"""
## ⏰ {title}

<div style="position: relative; padding-left: 30px; margin: 20px 0;">
"""

        # (The original enumerated the events but never used the index.)
        for event in events:
            timeline += f"""
<div style="position: relative; margin-bottom: 20px;">
<div style="position: absolute; left: -25px; top: 5px; width: 12px; height: 12px; background: #4CAF50; border-radius: 50%; border: 3px solid white; box-shadow: 0 0 0 3px #4CAF50;"></div>
<div style="background: #f5f5f5; padding: 15px; border-radius: 8px; border-left: 4px solid #4CAF50;">
<h4 style="margin: 0 0 5px 0; color: #333;">{event.get('title', 'Event')}</h4>
<p style="margin: 0; color: #666;">{event.get('description', '')}</p>
<small style="color: #999;">{event.get('date', '')}</small>
</div>
</div>
"""

        timeline += "</div>"
        return timeline

    def create_comparison_chart(self, data: Dict[str, float], title: str = "Comparison") -> str:
        """Create a comparison chart: horizontal bars scaled to the max value.

        Returns "" for empty input.
        """
        if not data:
            return ""

        max_value = max(data.values())

        chart = f"""
## 📈 {title}

<div style="margin: 20px 0;">
"""

        for label, value in data.items():
            # FIX: the original divided unconditionally and raised
            # ZeroDivisionError when every value was 0 (its `if data.values()`
            # guard is always true for a non-empty dict). Render 0% instead.
            percentage = (value / max_value) * 100 if max_value else 0.0
            chart += f"""
<div style="margin: 10px 0;">
<div style="display: flex; justify-content: space-between; margin-bottom: 5px;">
<span style="font-weight: bold;">{label}</span>
<span style="color: #666;">{value:.1f}</span>
</div>
<div style="background: #e0e0e0; border-radius: 5px; height: 20px; overflow: hidden;">
<div style="background: linear-gradient(90deg, #2196F3, #21CBF3); height: 100%; width: {percentage}%; transition: width 0.3s ease;"></div>
</div>
</div>
"""

        chart += "</div>"
        return chart

    def create_key_points(self, points: List[str], title: str = "Key Points") -> str:
        """Create a numbered, card-styled key points section; "" for no points."""
        if not points:
            return ""

        visual = f"""
## 💡 {title}

<div style="display: grid; gap: 15px; margin: 20px 0;">
"""

        for i, point in enumerate(points, 1):
            visual += f"""
<div style="background: #f8f9fa; border-left: 4px solid #007bff; padding: 15px; border-radius: 0 8px 8px 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
<div style="display: flex; align-items: flex-start;">
<span style="background: #007bff; color: white; border-radius: 50%; width: 24px; height: 24px; display: flex; align-items: center; justify-content: center; font-weight: bold; margin-right: 12px; flex-shrink: 0;">{i}</span>
<p style="margin: 0; line-height: 1.5;">{point}</p>
</div>
</div>
"""

        visual += "</div>"
        return visual

    def create_alert_box(self, message: str, alert_type: str = "info") -> str:
        """Create an alert box; alert_type is one of info/success/warning/error
        (unknown types fall back to "info" styling)."""
        colors = {
            "info": "#2196F3",
            "success": "#4CAF50",
            "warning": "#FF9800",
            "error": "#F44336"
        }

        icons = {
            "info": "ℹ️",
            "success": "✅",
            "warning": "⚠️",
            "error": "❌"
        }

        color = colors.get(alert_type, colors["info"])
        icon = icons.get(alert_type, icons["info"])

        return f"""
<div style="background: {color}15; border: 1px solid {color}; border-radius: 8px; padding: 15px; margin: 15px 0; display: flex; align-items: flex-start;">
<span style="font-size: 20px; margin-right: 10px;">{icon}</span>
<p style="margin: 0; color: {color}; font-weight: 500;">{message}</p>
</div>
"""

    def create_metric_cards(self, metrics: Dict[str, Any], title: str = "Key Metrics") -> str:
        """Create a responsive grid of metric cards; "" for empty input."""
        if not metrics:
            return ""

        cards = f"""
## 📊 {title}

<div style="display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 15px; margin: 20px 0;">
"""

        for metric, value in metrics.items():
            cards += f"""
<div style="background: white; border: 1px solid #e0e0e0; border-radius: 8px; padding: 20px; text-align: center; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
<h3 style="margin: 0 0 10px 0; color: #333; font-size: 28px;">{value}</h3>
<p style="margin: 0; color: #666; font-size: 14px; text-transform: uppercase; letter-spacing: 0.5px;">{metric}</p>
</div>
"""

        cards += "</div>"
        return cards

    def format_analysis_with_visuals(self, analysis_text: str, document_metadata: Optional[Dict[str, Any]] = None) -> str:
        """Prepend auto-extracted visual elements to the analysis text.

        Builds document-info cards (when metadata is given), key-point cards,
        and metric cards scraped from the text; returns the text unchanged
        when nothing visual could be extracted.
        """
        # FIX: annotation was `Dict[str, Any] = None`; None is a valid default,
        # so the parameter is Optional.
        visual_elements = []

        # Add document info if available
        if document_metadata:
            visual_elements.append(self.create_metric_cards({
                "Pages": document_metadata.get('page_count', 'Unknown'),
                "File Size": f"{document_metadata.get('file_size', 0) / 1024:.1f} KB",
                "Processing Time": f"{document_metadata.get('processing_time', 0):.1f}s"
            }, "Document Information"))

        # Try to extract key points from analysis
        key_points = self._extract_key_points(analysis_text)
        if key_points:
            visual_elements.append(self.create_key_points(key_points))

        # Try to extract metrics
        metrics = self._extract_metrics(analysis_text)
        if metrics:
            visual_elements.append(self.create_metric_cards(metrics, "Key Metrics"))

        # Combine visual elements with analysis
        result = analysis_text

        if visual_elements:
            result = "\n\n".join(visual_elements) + "\n\n---\n\n" + analysis_text

        return result

    def _extract_key_points(self, text: str) -> List[str]:
        """Extract up to five bullet/numbered list items (>10 chars) from text."""
        points = []

        # Extract bullet points
        bullet_pattern = r'[-•*]\s+(.+?)(?=\n|$)'
        bullets = re.findall(bullet_pattern, text, re.MULTILINE)
        points.extend([bullet.strip() for bullet in bullets if len(bullet.strip()) > 10])

        # Extract numbered points
        number_pattern = r'\d+\.\s+(.+?)(?=\n|$)'
        numbers = re.findall(number_pattern, text, re.MULTILINE)
        points.extend([num.strip() for num in numbers if len(num.strip()) > 10])

        # Limit to top 5 points
        return points[:5]

    def _extract_metrics(self, text: str) -> Dict[str, str]:
        """Scrape simple metrics (first percentage, first counted quantity) from text.

        NOTE(review): the labels "Success Rate"/"Total Items" are heuristic —
        the first percentage in the text may not be a success rate.
        """
        metrics = {}

        # Look for percentage patterns
        percent_pattern = r'(\d+(?:\.\d+)?%)'
        percentages = re.findall(percent_pattern, text)
        if percentages:
            metrics["Success Rate"] = percentages[0]

        # Look for number patterns
        number_pattern = r'(\d+(?:,\d+)*(?:\.\d+)?)\s+(?:pages?|items?|points?|years?|months?)'
        numbers = re.findall(number_pattern, text, re.IGNORECASE)
        if numbers:
            metrics["Total Items"] = numbers[0]

        return metrics
|