JatsTheAIGen commited on
Commit
a929e66
·
1 Parent(s): ffff3e5

Simplify PDF Analysis Orchestrator: Remove Document Type Analysis and fix context length errors

Browse files

- Removed complex Document Type Analysis feature for better usability
- Implemented hierarchical summarization to handle large documents
- Added token counting utilities to prevent context length exceeded errors
- Simplified UI by removing document type selection
- Streamlined agent processing without dynamic token calculation
- Maintained all core analysis functionality with improved reliability

Files changed (3) hide show
  1. agents.py +20 -92
  2. app.py +4 -29
  3. utils/__init__.py +77 -51
agents.py CHANGED
@@ -38,70 +38,6 @@ class AnalysisAgent(BaseAgent):
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
41
- def _detect_document_type(self, text: str, prompt: str) -> str:
42
- """Detect document type based on content and prompt"""
43
- text_lower = text.lower()
44
- prompt_lower = prompt.lower()
45
-
46
- # Technical documents
47
- if any(keyword in text_lower for keyword in ['api', 'function', 'method', 'class', 'code', 'implementation', 'technical specification']):
48
- return "technical"
49
-
50
- # Financial documents
51
- if any(keyword in text_lower for keyword in ['revenue', 'profit', 'financial', 'balance sheet', 'income statement', 'cash flow', 'budget']):
52
- return "financial"
53
-
54
- # Legal documents
55
- if any(keyword in text_lower for keyword in ['agreement', 'contract', 'terms', 'conditions', 'liability', 'legal', 'jurisdiction']):
56
- return "legal"
57
-
58
- # Academic papers
59
- if any(keyword in text_lower for keyword in ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'research', 'study']):
60
- return "academic"
61
-
62
- # Business documents
63
- if any(keyword in text_lower for keyword in ['business plan', 'strategy', 'market', 'customer', 'product', 'service']):
64
- return "business"
65
-
66
- # Creative content
67
- if any(keyword in text_lower for keyword in ['creative', 'design', 'marketing', 'brand', 'advertising']):
68
- return "creative"
69
-
70
- # Check prompt for hints
71
- if any(keyword in prompt_lower for keyword in ['technical', 'financial', 'legal', 'academic', 'business', 'creative']):
72
- return prompt_lower.split()[0] # Use first keyword from prompt
73
-
74
- return "general"
75
-
76
- def _calculate_dynamic_tokens(self, prompt: str, text_length: int, document_type: str = "general") -> int:
77
- """Calculate dynamic token allocation based on prompt complexity, text length, and document type"""
78
- base_tokens = Config.OPENAI_MAX_TOKENS
79
-
80
- # Increase tokens for complex prompts
81
- complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete', 'extensive']
82
- complexity_multiplier = 1.0
83
- for keyword in complex_keywords:
84
- if keyword.lower() in prompt.lower():
85
- complexity_multiplier += 0.3
86
-
87
- # Increase tokens for longer documents
88
- length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
89
-
90
- # Increase tokens for specific document types
91
- doc_type_multipliers = {
92
- "technical": 1.3,
93
- "financial": 1.4,
94
- "legal": 1.5,
95
- "academic": 1.2,
96
- "business": 1.1,
97
- "creative": 1.0,
98
- "general": 1.0
99
- }
100
- doc_type_multiplier = doc_type_multipliers.get(document_type, 1.0)
101
-
102
- final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
103
- return min(final_tokens, 4000) # Cap at 4000 tokens
104
-
105
  async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
106
  start_time = time.time()
107
 
@@ -119,20 +55,16 @@ class AnalysisAgent(BaseAgent):
119
  # Load text with caching
120
  text = load_pdf_text_cached(file_path)
121
 
122
- # Detect document type
123
- document_type = self._detect_document_type(text, prompt)
124
- metadata['document_type'] = document_type
125
-
126
  # Check if document needs chunking
127
  if len(text) > Config.CHUNK_SIZE:
128
- result = await self._handle_large_document(prompt, text, metadata, document_type)
129
  else:
130
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
131
- result = await self._process_content(prompt, content, metadata, text, document_type)
132
  else:
133
  content = f"User prompt: {prompt}"
134
  metadata = {}
135
- result = await self._process_content(prompt, content, metadata, "", "general")
136
 
137
  # Cache the result
138
  if file_path:
@@ -140,12 +72,12 @@ class AnalysisAgent(BaseAgent):
140
 
141
  return result
142
 
143
- async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str, document_type: str = "general") -> Dict[str, Any]:
144
- """Process content with dynamic token allocation and visual formatting"""
145
  start_time = time.time()
146
 
147
- # Calculate dynamic tokens
148
- max_tokens = self._calculate_dynamic_tokens(prompt, len(text), document_type)
149
 
150
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
151
 
@@ -210,15 +142,13 @@ VISUAL ELEMENTS TO USE:
210
 
211
  return result
212
 
213
- async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any], document_type: str = "general") -> Dict[str, Any]:
214
- """Handle large documents by processing in smart chunks"""
215
- # Use smart chunking based on document type and content
216
- chunks = smart_chunk_text(text, prompt, document_type)
217
-
218
- # Get optimal chunk size for display
219
- optimal_size, optimal_overlap = get_optimal_chunk_size(text, prompt, document_type)
220
- metadata['chunk_size'] = optimal_size
221
- metadata['chunk_overlap'] = optimal_overlap
222
  metadata['total_chunks'] = len(chunks)
223
  chunk_results = []
224
 
@@ -243,16 +173,14 @@ VISUAL ELEMENTS TO USE:
243
  # Combine chunk results
244
  combined_analysis = "\n\n".join(chunk_results)
245
 
246
- # Create final summary
247
- summary_prompt = f"Please provide a comprehensive summary that combines insights from all chunks of this large document. Original prompt: {prompt}\n\nChunk analyses:\n{combined_analysis}"
248
-
249
  try:
250
- final_summary = await call_openai_chat(
 
 
 
251
  model=self.model,
252
- messages=[{"role": "system", "content": "You are AnalysisAgent: create comprehensive summaries from multiple document chunks."},
253
- {"role": "user", "content": summary_prompt}],
254
- temperature=Config.OPENAI_TEMPERATURE,
255
- max_tokens=Config.OPENAI_MAX_TOKENS
256
  )
257
  except Exception as e:
258
  logger.exception("AnalysisAgent failed on final summary")
 
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
42
  start_time = time.time()
43
 
 
55
  # Load text with caching
56
  text = load_pdf_text_cached(file_path)
57
 
 
 
 
 
58
  # Check if document needs chunking
59
  if len(text) > Config.CHUNK_SIZE:
60
+ result = await self._handle_large_document(prompt, text, metadata)
61
  else:
62
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
63
+ result = await self._process_content(prompt, content, metadata, text)
64
  else:
65
  content = f"User prompt: {prompt}"
66
  metadata = {}
67
+ result = await self._process_content(prompt, content, metadata, "")
68
 
69
  # Cache the result
70
  if file_path:
 
72
 
73
  return result
74
 
75
+ async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
76
+ """Process content with visual formatting"""
77
  start_time = time.time()
78
 
79
+ # Use standard token allocation
80
+ max_tokens = Config.OPENAI_MAX_TOKENS
81
 
82
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
83
 
 
142
 
143
  return result
144
 
145
+ async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
146
+ """Handle large documents by processing in chunks"""
147
+ # Use standard chunking
148
+ from utils import chunk_text
149
+ chunks = chunk_text(text, Config.CHUNK_SIZE)
150
+ metadata['chunk_size'] = Config.CHUNK_SIZE
151
+ metadata['chunk_overlap'] = 1000
 
 
152
  metadata['total_chunks'] = len(chunks)
153
  chunk_results = []
154
 
 
173
  # Combine chunk results
174
  combined_analysis = "\n\n".join(chunk_results)
175
 
176
+ # Create final summary using hierarchical approach to avoid token limits
 
 
177
  try:
178
+ from utils import create_hierarchical_summary
179
+ final_summary = await create_hierarchical_summary(
180
+ chunk_results=chunk_results,
181
+ prompt=prompt,
182
  model=self.model,
183
+ max_tokens=6000 # Conservative limit to avoid context length errors
 
 
 
184
  )
185
  except Exception as e:
186
  logger.exception("AnalysisAgent failed on final summary")
app.py CHANGED
@@ -240,38 +240,13 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
240
  username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
241
 
242
  # Custom Prompts Section
243
- with gr.Accordion("🎯 Document Type Analysis", open=True):
244
- gr.Markdown("**Choose a document type for specialized analysis:**")
245
  prompt_dropdown = gr.Dropdown(
246
  choices=get_custom_prompts(),
247
- label="📋 Select Document Type",
248
- value=None,
249
- info="Choose the type of document you're analyzing for better results"
250
  )
251
- load_prompt_btn = gr.Button("📥 Load Analysis Template", size="sm", variant="secondary")
252
-
253
- # Document type categories
254
- with gr.Row():
255
- gr.Markdown("**Quick Categories:**")
256
- with gr.Row():
257
- gr.Markdown("📄 **Business:** Whitepapers, Business Plans")
258
- gr.Markdown("⚙️ **Technical:** User Manuals, Specs")
259
- with gr.Row():
260
- gr.Markdown("💰 **Financial:** Reports, Bank Statements")
261
- gr.Markdown("🎓 **Academic:** Research Papers")
262
- with gr.Row():
263
- gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
264
- gr.Markdown("🎨 **Creative:** Briefs, Marketing")
265
-
266
- # Smart processing info
267
- gr.Markdown("**🧠 Smart Processing:**")
268
- gr.Markdown("• **Auto-optimized chunk sizes** based on document type")
269
- gr.Markdown("• **Technical docs**: 8K chars (dense content)")
270
- gr.Markdown("• **Financial docs**: 6K chars (precise data)")
271
- gr.Markdown("• **Legal docs**: 5K chars (detailed terms)")
272
- gr.Markdown("• **Academic papers**: 10K chars (research)")
273
- gr.Markdown("• **Business docs**: 12K chars (standard)")
274
- gr.Markdown("• **Creative content**: 18K chars (narrative)")
275
 
276
  with gr.Column(scale=2):
277
  gr.Markdown("### Analysis Instructions")
 
240
  username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
241
 
242
  # Custom Prompts Section
243
+ with gr.Accordion("🎯 Custom Prompts", open=False):
 
244
  prompt_dropdown = gr.Dropdown(
245
  choices=get_custom_prompts(),
246
+ label="Select Custom Prompt",
247
+ value=None
 
248
  )
249
+ load_prompt_btn = gr.Button("Load Prompt", size="sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  with gr.Column(scale=2):
252
  gr.Markdown("### Analysis Instructions")
utils/__init__.py CHANGED
@@ -139,65 +139,91 @@ def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[
139
 
140
  return chunks
141
 
142
- def get_optimal_chunk_size(text: str, prompt: str, document_type: str = "general") -> tuple[int, int]:
143
- """
144
- Calculate optimal chunk size and overlap based on content and analysis type
145
- """
146
- base_chunk_size = 15000
147
- base_overlap = 1000
148
-
149
- # Adjust based on document type
150
- type_adjustments = {
151
- "technical": (8000, 1200), # Smaller chunks for technical docs
152
- "financial": (6000, 1000), # Even smaller for financial data
153
- "legal": (5000, 800), # Small chunks for legal precision
154
- "academic": (10000, 1500), # Medium chunks for academic papers
155
- "business": (12000, 1000), # Standard for business docs
156
- "creative": (18000, 1500), # Larger for creative content
157
- "general": (15000, 1000) # Default
158
- }
 
 
 
 
159
 
160
- chunk_size, overlap = type_adjustments.get(document_type, (base_chunk_size, base_overlap))
 
 
 
 
 
 
 
 
161
 
162
- # Adjust based on prompt complexity
163
- complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete']
164
- if any(keyword in prompt.lower() for keyword in complex_keywords):
165
- chunk_size = int(chunk_size * 0.7) # Smaller chunks for complex analysis
166
- overlap = int(overlap * 1.2) # More overlap for better context
167
 
168
- # Adjust based on text length
169
- if len(text) > 100000: # Very long documents
170
- chunk_size = int(chunk_size * 0.8) # Smaller chunks
171
- overlap = int(overlap * 1.3) # More overlap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- # Adjust based on content density
174
- avg_sentence_length = len(text) / text.count('.') if text.count('.') > 0 else 100
175
- if avg_sentence_length > 200: # Dense technical content
176
- chunk_size = int(chunk_size * 0.6) # Much smaller chunks
177
- overlap = int(overlap * 1.5) # Much more overlap
178
 
179
- # Ensure minimum and maximum bounds
180
- chunk_size = max(3000, min(chunk_size, 20000))
181
- overlap = max(500, min(overlap, chunk_size // 3))
182
 
183
- return chunk_size, overlap
184
-
185
- def smart_chunk_text(text: str, prompt: str, document_type: str = "general") -> List[str]:
186
- """
187
- Smart chunking that adapts to content and analysis needs
188
- """
189
- if len(text) <= 15000: # Small documents don't need chunking
190
- return [text]
191
 
192
- chunk_size, overlap = get_optimal_chunk_size(text, prompt, document_type)
193
 
194
- # Use the optimized chunking
195
- return chunk_text(text, chunk_size, overlap)
196
-
197
- def get_file_hash(file_path: str) -> str:
198
- """Generate hash for file caching"""
199
- with open(file_path, 'rb') as f:
200
- return hashlib.md5(f.read()).hexdigest()
 
 
 
 
 
 
201
 
202
  # ------------------------
203
  # Enhanced Caching System
 
139
 
140
  return chunks
141
 
142
+
143
+ def get_file_hash(file_path: str) -> str:
144
+ """Generate hash for file caching"""
145
+ with open(file_path, 'rb') as f:
146
+ return hashlib.md5(f.read()).hexdigest()
147
+
148
+ # ------------------------
149
+ # Token Counting Utilities
150
+ # ------------------------
151
+ def estimate_tokens(text: str) -> int:
152
+ """Rough estimation of token count (1 token ≈ 4 characters for English)"""
153
+ return len(text) // 4
154
+
155
+ def is_within_token_limit(text: str, max_tokens: int = 6000) -> bool:
156
+ """Check if text is within token limit for API calls"""
157
+ return estimate_tokens(text) <= max_tokens
158
+
159
+ def truncate_to_token_limit(text: str, max_tokens: int = 6000) -> str:
160
+ """Truncate text to fit within token limit"""
161
+ if is_within_token_limit(text, max_tokens):
162
+ return text
163
 
164
+ # Rough character limit based on token estimation
165
+ char_limit = max_tokens * 4
166
+ return text[:char_limit] + "\n\n[Content truncated due to length...]"
167
+
168
+ # ------------------------
169
+ # Hierarchical Summarization
170
+ # ------------------------
171
+ async def create_hierarchical_summary(chunk_results: List[str], prompt: str, model: str, max_tokens: int = 6000) -> str:
172
+ """Create a summary using hierarchical approach to avoid token limits"""
173
 
174
+ # First, create intermediate summaries of groups of chunks
175
+ intermediate_summaries = []
176
+ group_size = 3 # Process 3 chunks at a time
 
 
177
 
178
+ for i in range(0, len(chunk_results), group_size):
179
+ group = chunk_results[i:i + group_size]
180
+ group_text = "\n\n".join(group)
181
+
182
+ # Truncate if too long
183
+ if not is_within_token_limit(group_text, max_tokens):
184
+ group_text = truncate_to_token_limit(group_text, max_tokens)
185
+
186
+ group_prompt = f"Summarize the following chunk analyses, focusing on key insights and findings:\n\n{group_text}"
187
+
188
+ try:
189
+ summary = await call_openai_chat(
190
+ model=model,
191
+ messages=[
192
+ {"role": "system", "content": "You are a summarization expert. Create concise summaries that capture the most important insights."},
193
+ {"role": "user", "content": group_prompt}
194
+ ],
195
+ temperature=0.2,
196
+ max_tokens=800
197
+ )
198
+ intermediate_summaries.append(f"Group {i//group_size + 1} Summary:\n{summary}")
199
+ except Exception as e:
200
+ intermediate_summaries.append(f"Group {i//group_size + 1} Summary:\nError: {str(e)}")
201
 
202
+ # Now create final summary from intermediate summaries
203
+ if len(intermediate_summaries) == 1:
204
+ return intermediate_summaries[0]
 
 
205
 
206
+ final_text = "\n\n".join(intermediate_summaries)
 
 
207
 
208
+ # If still too long, create another level of summarization
209
+ if not is_within_token_limit(final_text, max_tokens):
210
+ final_text = truncate_to_token_limit(final_text, max_tokens)
 
 
 
 
 
211
 
212
+ final_prompt = f"Create a comprehensive final summary based on the following intermediate summaries. Original prompt: {prompt}\n\n{final_text}"
213
 
214
+ try:
215
+ final_summary = await call_openai_chat(
216
+ model=model,
217
+ messages=[
218
+ {"role": "system", "content": "You are an expert at creating comprehensive summaries from multiple sources. Synthesize the key insights into a coherent final summary."},
219
+ {"role": "user", "content": final_prompt}
220
+ ],
221
+ temperature=0.2,
222
+ max_tokens=1000
223
+ )
224
+ return final_summary
225
+ except Exception as e:
226
+ return f"Error creating final summary: {str(e)}\n\nIntermediate summaries:\n{final_text}"
227
 
228
  # ------------------------
229
  # Enhanced Caching System