Spaces:

JatinAutonomousLabs
/

PDF_analyst

Paused

App Files Files Community

JatsTheAIGen committed on Oct 19, 2025

Commit

ffff3e5

1 Parent(s): 73f15b1

Implement smart chunking: adaptive chunk sizes based on document type and content complexity

Browse files

Files changed (3) hide show

agents.py +67 -17
app.py +10 -0
utils/__init__.py +55 -0

agents.py CHANGED Viewed

@@ -5,7 +5,7 @@ import logging
 from typing import Optional, Dict, Any, List, AsyncGenerator
 import time
-from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis
 from utils.visual_output import VisualOutputGenerator
 from config import Config
@@ -38,8 +38,43 @@ class AnalysisAgent(BaseAgent):
         super().__init__(name, model, tasks_completed)
         self.visual_generator = VisualOutputGenerator()
-    def _calculate_dynamic_tokens(self, prompt: str, text_length: int) -> int:
-        """Calculate dynamic token allocation based on prompt complexity and text length"""
         base_tokens = Config.OPENAI_MAX_TOKENS
         # Increase tokens for complex prompts
@@ -53,11 +88,16 @@ class AnalysisAgent(BaseAgent):
         length_multiplier = min(2.0, 1.0 + (text_length / 50000))  # Cap at 2x for very long docs
         # Increase tokens for specific document types
-        doc_type_keywords = ['whitepaper', 'research', 'technical', 'financial', 'legal', 'academic']
-        doc_type_multiplier = 1.0
-        for keyword in doc_type_keywords:
-            if keyword.lower() in prompt.lower():
-                doc_type_multiplier += 0.2
         final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
         return min(final_tokens, 4000)  # Cap at 4000 tokens
@@ -79,16 +119,20 @@ class AnalysisAgent(BaseAgent):
             # Load text with caching
             text = load_pdf_text_cached(file_path)
             # Check if document needs chunking
             if len(text) > Config.CHUNK_SIZE:
-                result = await self._handle_large_document(prompt, text, metadata)
             else:
                 content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
-                result = await self._process_content(prompt, content, metadata, text)
         else:
             content = f"User prompt: {prompt}"
             metadata = {}
-            result = await self._process_content(prompt, content, metadata, "")
         # Cache the result
         if file_path:
@@ -96,12 +140,12 @@ class AnalysisAgent(BaseAgent):
         return result
-    async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
         """Process content with dynamic token allocation and visual formatting"""
         start_time = time.time()
         # Calculate dynamic tokens
-        max_tokens = self._calculate_dynamic_tokens(prompt, len(text))
         system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
@@ -166,10 +210,16 @@ VISUAL ELEMENTS TO USE:
         return result
-    async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
-        """Handle large documents by processing in chunks"""
-        from utils import chunk_text
-        chunks = chunk_text(text, Config.CHUNK_SIZE)
         chunk_results = []
         system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."

 from typing import Optional, Dict, Any, List, AsyncGenerator
 import time
+from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis, smart_chunk_text, get_optimal_chunk_size
 from utils.visual_output import VisualOutputGenerator
 from config import Config
         super().__init__(name, model, tasks_completed)
         self.visual_generator = VisualOutputGenerator()
+    def _detect_document_type(self, text: str, prompt: str) -> str:
+        """Classify a document into a coarse type label.
+
+        The document text is scanned category by category in a fixed priority
+        order (technical, financial, legal, academic, business, creative); the
+        first category with any keyword present wins. Only when the text
+        matches no category is the prompt checked for an explicit type name.
+
+        Matching is a case-insensitive substring test, so a keyword also
+        matches inside a longer word (e.g. 'api' inside 'rapid').
+
+        Args:
+            text: Full document text.
+            prompt: User prompt accompanying the document.
+
+        Returns:
+            One of "technical", "financial", "legal", "academic", "business",
+            "creative", or "general" when nothing matches.
+        """
+        text_lower = text.lower()
+        prompt_lower = prompt.lower()
+        # Ordered (label, keywords) pairs: earlier categories take precedence.
+        content_keywords = (
+            ("technical", ('api', 'function', 'method', 'class', 'code', 'implementation', 'technical specification')),
+            ("financial", ('revenue', 'profit', 'financial', 'balance sheet', 'income statement', 'cash flow', 'budget')),
+            ("legal", ('agreement', 'contract', 'terms', 'conditions', 'liability', 'legal', 'jurisdiction')),
+            ("academic", ('abstract', 'introduction', 'methodology', 'conclusion', 'references', 'research', 'study')),
+            ("business", ('business plan', 'strategy', 'market', 'customer', 'product', 'service')),
+            ("creative", ('creative', 'design', 'marketing', 'brand', 'advertising')),
+        )
+        for doc_type, keywords in content_keywords:
+            if any(keyword in text_lower for keyword in keywords):
+                return doc_type
+        # Check prompt for hints. Return the type name that actually matched:
+        # the prompt's first word (e.g. "please") is not a document type and
+        # would miss every per-type lookup keyed on the returned label.
+        for doc_type, _ in content_keywords:
+            if doc_type in prompt_lower:
+                return doc_type
+        return "general"
+    def _calculate_dynamic_tokens(self, prompt: str, text_length: int, document_type: str = "general") -> int:
+        """Calculate dynamic token allocation based on prompt complexity, text length, and document type"""
         base_tokens = Config.OPENAI_MAX_TOKENS
         # Increase tokens for complex prompts
         length_multiplier = min(2.0, 1.0 + (text_length / 50000))  # Cap at 2x for very long docs
         # Increase tokens for specific document types
+        doc_type_multipliers = {
+            "technical": 1.3,
+            "financial": 1.4,
+            "legal": 1.5,
+            "academic": 1.2,
+            "business": 1.1,
+            "creative": 1.0,
+            "general": 1.0
+        }
+        doc_type_multiplier = doc_type_multipliers.get(document_type, 1.0)
         final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
         return min(final_tokens, 4000)  # Cap at 4000 tokens
             # Load text with caching
             text = load_pdf_text_cached(file_path)
+            # Detect document type
+            document_type = self._detect_document_type(text, prompt)
+            metadata['document_type'] = document_type
             # Check if document needs chunking
             if len(text) > Config.CHUNK_SIZE:
+                result = await self._handle_large_document(prompt, text, metadata, document_type)
             else:
                 content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
+                result = await self._process_content(prompt, content, metadata, text, document_type)
         else:
             content = f"User prompt: {prompt}"
             metadata = {}
+            result = await self._process_content(prompt, content, metadata, "", "general")
         # Cache the result
         if file_path:
         return result
+    async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str, document_type: str = "general") -> Dict[str, Any]:
         """Process content with dynamic token allocation and visual formatting"""
         start_time = time.time()
         # Calculate dynamic tokens
+        max_tokens = self._calculate_dynamic_tokens(prompt, len(text), document_type)
         system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
         return result
+    async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any], document_type: str = "general") -> Dict[str, Any]:
+        """Handle large documents by processing in smart chunks"""
+        # Use smart chunking based on document type and content
+        chunks = smart_chunk_text(text, prompt, document_type)
+        # Get optimal chunk size for display
+        optimal_size, optimal_overlap = get_optimal_chunk_size(text, prompt, document_type)
+        metadata['chunk_size'] = optimal_size
+        metadata['chunk_overlap'] = optimal_overlap
+        metadata['total_chunks'] = len(chunks)
         chunk_results = []
         system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."

app.py CHANGED Viewed

@@ -262,6 +262,16 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
                         with gr.Row():
                             gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
                             gr.Markdown("🎨 **Creative:** Briefs, Marketing")
                 with gr.Column(scale=2):
                     gr.Markdown("### Analysis Instructions")

                         with gr.Row():
                             gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
                             gr.Markdown("🎨 **Creative:** Briefs, Marketing")
+                        # Smart processing info
+                        gr.Markdown("**🧠 Smart Processing:**")
+                        gr.Markdown("• **Auto-optimized chunk sizes** based on document type")
+                        gr.Markdown("• **Technical docs**: 8K chars (dense content)")
+                        gr.Markdown("• **Financial docs**: 6K chars (precise data)")
+                        gr.Markdown("• **Legal docs**: 5K chars (detailed terms)")
+                        gr.Markdown("• **Academic papers**: 10K chars (research)")
+                        gr.Markdown("• **Business docs**: 12K chars (standard)")
+                        gr.Markdown("• **Creative content**: 18K chars (narrative)")
                 with gr.Column(scale=2):
                     gr.Markdown("### Analysis Instructions")

utils/__init__.py CHANGED Viewed

@@ -139,6 +139,61 @@ def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[
     return chunks
 def get_file_hash(file_path: str) -> str:
     """Generate hash for file caching"""
     with open(file_path, 'rb') as f:

     return chunks
+def get_optimal_chunk_size(text: str, prompt: str, document_type: str = "general") -> tuple[int, int]:
+    """
+    Pick a (chunk_size, overlap) pair, in characters, for splitting ``text``.
+
+    Starts from a per-document-type preset (unknown types fall back to
+    15000/1000) and then shrinks the chunk size while growing the overlap for
+    each condition that holds, in this order:
+
+    * the prompt contains one of 'analyze', 'comprehensive', 'detailed',
+      'thorough', 'complete' (case-insensitive substring match);
+    * the text is longer than 100000 characters;
+    * the average number of characters per '.' exceeds 200 (a text with no
+      '.' is treated as averaging 100).
+
+    Values are truncated to int after every step. Finally the chunk size is
+    clamped to [3000, 20000] and the overlap to at least 500 and at most one
+    third of the chunk size.
+    """
+    presets = {
+        "technical": (8000, 1200),
+        "financial": (6000, 1000),
+        "legal": (5000, 800),
+        "academic": (10000, 1500),
+        "business": (12000, 1000),
+        "creative": (18000, 1500),
+        "general": (15000, 1000),
+    }
+    size, lap = presets.get(document_type, (15000, 1000))
+    prompt_lc = prompt.lower()
+    wants_depth = any(
+        word in prompt_lc
+        for word in ('analyze', 'comprehensive', 'detailed', 'thorough', 'complete')
+    )
+    period_count = text.count('.')
+    chars_per_sentence = len(text) / period_count if period_count > 0 else 100
+    # (condition, chunk-size factor, overlap factor); applied strictly in order
+    # because each step truncates to int before the next one multiplies.
+    adjustments = (
+        (wants_depth, 0.7, 1.2),               # complex analysis requested
+        (len(text) > 100000, 0.8, 1.3),        # very long document
+        (chars_per_sentence > 200, 0.6, 1.5),  # dense content
+    )
+    for applies, size_factor, lap_factor in adjustments:
+        if applies:
+            size = int(size * size_factor)
+            lap = int(lap * lap_factor)
+    # Enforce bounds; the overlap ceiling depends on the already-clamped size.
+    size = max(3000, min(size, 20000))
+    lap = max(500, min(lap, size // 3))
+    return size, lap
+def smart_chunk_text(text: str, prompt: str, document_type: str = "general") -> List[str]:
+    """
+    Split ``text`` into chunks sized for its document type and the prompt.
+
+    Texts of at most 15000 characters are returned whole as a single-item
+    list; this threshold is fixed and does not depend on ``document_type``.
+    Longer texts are passed to ``chunk_text`` with the size and overlap
+    chosen by ``get_optimal_chunk_size``.
+    """
+    if len(text) > 15000:
+        # Size and overlap are unpacked straight into chunk_text's arguments.
+        return chunk_text(text, *get_optimal_chunk_size(text, prompt, document_type))
+    # Small documents don't need chunking
+    return [text]
 def get_file_hash(file_path: str) -> str:
     """Generate hash for file caching"""
     with open(file_path, 'rb') as f: