JatsTheAIGen commited on
Commit
ffff3e5
·
1 Parent(s): 73f15b1

Implement smart chunking: adaptive chunk sizes based on document type and content complexity

Browse files
Files changed (3) hide show
  1. agents.py +67 -17
  2. app.py +10 -0
  3. utils/__init__.py +55 -0
agents.py CHANGED
@@ -5,7 +5,7 @@ import logging
5
  from typing import Optional, Dict, Any, List, AsyncGenerator
6
  import time
7
 
8
- from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis
9
  from utils.visual_output import VisualOutputGenerator
10
  from config import Config
11
 
@@ -38,8 +38,43 @@ class AnalysisAgent(BaseAgent):
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
41
- def _calculate_dynamic_tokens(self, prompt: str, text_length: int) -> int:
42
- """Calculate dynamic token allocation based on prompt complexity and text length"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  base_tokens = Config.OPENAI_MAX_TOKENS
44
 
45
  # Increase tokens for complex prompts
@@ -53,11 +88,16 @@ class AnalysisAgent(BaseAgent):
53
  length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
54
 
55
  # Increase tokens for specific document types
56
- doc_type_keywords = ['whitepaper', 'research', 'technical', 'financial', 'legal', 'academic']
57
- doc_type_multiplier = 1.0
58
- for keyword in doc_type_keywords:
59
- if keyword.lower() in prompt.lower():
60
- doc_type_multiplier += 0.2
 
 
 
 
 
61
 
62
  final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
63
  return min(final_tokens, 4000) # Cap at 4000 tokens
@@ -79,16 +119,20 @@ class AnalysisAgent(BaseAgent):
79
  # Load text with caching
80
  text = load_pdf_text_cached(file_path)
81
 
 
 
 
 
82
  # Check if document needs chunking
83
  if len(text) > Config.CHUNK_SIZE:
84
- result = await self._handle_large_document(prompt, text, metadata)
85
  else:
86
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
87
- result = await self._process_content(prompt, content, metadata, text)
88
  else:
89
  content = f"User prompt: {prompt}"
90
  metadata = {}
91
- result = await self._process_content(prompt, content, metadata, "")
92
 
93
  # Cache the result
94
  if file_path:
@@ -96,12 +140,12 @@ class AnalysisAgent(BaseAgent):
96
 
97
  return result
98
 
99
- async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
100
  """Process content with dynamic token allocation and visual formatting"""
101
  start_time = time.time()
102
 
103
  # Calculate dynamic tokens
104
- max_tokens = self._calculate_dynamic_tokens(prompt, len(text))
105
 
106
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
107
 
@@ -166,10 +210,16 @@ VISUAL ELEMENTS TO USE:
166
 
167
  return result
168
 
169
- async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
170
- """Handle large documents by processing in chunks"""
171
- from utils import chunk_text
172
- chunks = chunk_text(text, Config.CHUNK_SIZE)
 
 
 
 
 
 
173
  chunk_results = []
174
 
175
  system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."
 
5
  from typing import Optional, Dict, Any, List, AsyncGenerator
6
  import time
7
 
8
+ from utils import call_openai_chat, load_pdf_text_cached, load_pdf_text_chunked, get_document_metadata, get_cached_analysis, cache_analysis, smart_chunk_text, get_optimal_chunk_size
9
  from utils.visual_output import VisualOutputGenerator
10
  from config import Config
11
 
 
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
41
+ def _detect_document_type(self, text: str, prompt: str) -> str:
42
+ """Detect document type based on content and prompt"""
43
+ text_lower = text.lower()
44
+ prompt_lower = prompt.lower()
45
+
46
+ # Technical documents
47
+ if any(keyword in text_lower for keyword in ['api', 'function', 'method', 'class', 'code', 'implementation', 'technical specification']):
48
+ return "technical"
49
+
50
+ # Financial documents
51
+ if any(keyword in text_lower for keyword in ['revenue', 'profit', 'financial', 'balance sheet', 'income statement', 'cash flow', 'budget']):
52
+ return "financial"
53
+
54
+ # Legal documents
55
+ if any(keyword in text_lower for keyword in ['agreement', 'contract', 'terms', 'conditions', 'liability', 'legal', 'jurisdiction']):
56
+ return "legal"
57
+
58
+ # Academic papers
59
+ if any(keyword in text_lower for keyword in ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'research', 'study']):
60
+ return "academic"
61
+
62
+ # Business documents
63
+ if any(keyword in text_lower for keyword in ['business plan', 'strategy', 'market', 'customer', 'product', 'service']):
64
+ return "business"
65
+
66
+ # Creative content
67
+ if any(keyword in text_lower for keyword in ['creative', 'design', 'marketing', 'brand', 'advertising']):
68
+ return "creative"
69
+
70
+ # Check prompt for hints
71
+ if any(keyword in prompt_lower for keyword in ['technical', 'financial', 'legal', 'academic', 'business', 'creative']):
72
+ return prompt_lower.split()[0] # Use first keyword from prompt
73
+
74
+ return "general"
75
+
76
+ def _calculate_dynamic_tokens(self, prompt: str, text_length: int, document_type: str = "general") -> int:
77
+ """Calculate dynamic token allocation based on prompt complexity, text length, and document type"""
78
  base_tokens = Config.OPENAI_MAX_TOKENS
79
 
80
  # Increase tokens for complex prompts
 
88
  length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
89
 
90
  # Increase tokens for specific document types
91
+ doc_type_multipliers = {
92
+ "technical": 1.3,
93
+ "financial": 1.4,
94
+ "legal": 1.5,
95
+ "academic": 1.2,
96
+ "business": 1.1,
97
+ "creative": 1.0,
98
+ "general": 1.0
99
+ }
100
+ doc_type_multiplier = doc_type_multipliers.get(document_type, 1.0)
101
 
102
  final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
103
  return min(final_tokens, 4000) # Cap at 4000 tokens
 
119
  # Load text with caching
120
  text = load_pdf_text_cached(file_path)
121
 
122
+ # Detect document type
123
+ document_type = self._detect_document_type(text, prompt)
124
+ metadata['document_type'] = document_type
125
+
126
  # Check if document needs chunking
127
  if len(text) > Config.CHUNK_SIZE:
128
+ result = await self._handle_large_document(prompt, text, metadata, document_type)
129
  else:
130
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
131
+ result = await self._process_content(prompt, content, metadata, text, document_type)
132
  else:
133
  content = f"User prompt: {prompt}"
134
  metadata = {}
135
+ result = await self._process_content(prompt, content, metadata, "", "general")
136
 
137
  # Cache the result
138
  if file_path:
 
140
 
141
  return result
142
 
143
+ async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str, document_type: str = "general") -> Dict[str, Any]:
144
  """Process content with dynamic token allocation and visual formatting"""
145
  start_time = time.time()
146
 
147
  # Calculate dynamic tokens
148
+ max_tokens = self._calculate_dynamic_tokens(prompt, len(text), document_type)
149
 
150
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
151
 
 
210
 
211
  return result
212
 
213
+ async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any], document_type: str = "general") -> Dict[str, Any]:
214
+ """Handle large documents by processing in smart chunks"""
215
+ # Use smart chunking based on document type and content
216
+ chunks = smart_chunk_text(text, prompt, document_type)
217
+
218
+ # Get optimal chunk size for display
219
+ optimal_size, optimal_overlap = get_optimal_chunk_size(text, prompt, document_type)
220
+ metadata['chunk_size'] = optimal_size
221
+ metadata['chunk_overlap'] = optimal_overlap
222
+ metadata['total_chunks'] = len(chunks)
223
  chunk_results = []
224
 
225
  system = "You are AnalysisAgent: produce concise insights and structured summaries. Adapt your language and complexity to the target audience. Provide clear, actionable insights with appropriate examples and analogies for complex topics."
app.py CHANGED
@@ -262,6 +262,16 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
262
  with gr.Row():
263
  gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
264
  gr.Markdown("🎨 **Creative:** Briefs, Marketing")
 
 
 
 
 
 
 
 
 
 
265
 
266
  with gr.Column(scale=2):
267
  gr.Markdown("### Analysis Instructions")
 
262
  with gr.Row():
263
  gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
264
  gr.Markdown("🎨 **Creative:** Briefs, Marketing")
265
+
266
+ # Smart processing info
267
+ gr.Markdown("**🧠 Smart Processing:**")
268
+ gr.Markdown("• **Auto-optimized chunk sizes** based on document type")
269
+ gr.Markdown("• **Technical docs**: 8K chars (dense content)")
270
+ gr.Markdown("• **Financial docs**: 6K chars (precise data)")
271
+ gr.Markdown("• **Legal docs**: 5K chars (detailed terms)")
272
+ gr.Markdown("• **Academic papers**: 10K chars (research)")
273
+ gr.Markdown("• **Business docs**: 12K chars (standard)")
274
+ gr.Markdown("• **Creative content**: 18K chars (narrative)")
275
 
276
  with gr.Column(scale=2):
277
  gr.Markdown("### Analysis Instructions")
utils/__init__.py CHANGED
@@ -139,6 +139,61 @@ def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[
139
 
140
  return chunks
141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
142
  def get_file_hash(file_path: str) -> str:
143
  """Generate hash for file caching"""
144
  with open(file_path, 'rb') as f:
 
139
 
140
  return chunks
141
 
142
def get_optimal_chunk_size(text: str, prompt: str, document_type: str = "general") -> tuple[int, int]:
    """
    Pick a (chunk_size, overlap) pair tuned to the document type, the
    analysis prompt, and characteristics of the text itself.
    """
    # Per-type starting points: precision-heavy types (legal, financial)
    # get smaller chunks; narrative types (creative) get larger ones.
    type_profiles = {
        "technical": (8000, 1200),
        "financial": (6000, 1000),
        "legal": (5000, 800),
        "academic": (10000, 1500),
        "business": (12000, 1000),
        "creative": (18000, 1500),
        "general": (15000, 1000),
    }
    chunk_size, overlap = type_profiles.get(document_type, (15000, 1000))

    # Prompts asking for deep analysis: shrink chunks, widen overlap so
    # each piece keeps more surrounding context.
    prompt_lower = prompt.lower()
    depth_markers = ('analyze', 'comprehensive', 'detailed', 'thorough', 'complete')
    if any(marker in prompt_lower for marker in depth_markers):
        chunk_size = int(chunk_size * 0.7)
        overlap = int(overlap * 1.2)

    # Very long documents also get smaller chunks with extra overlap.
    if len(text) > 100000:
        chunk_size = int(chunk_size * 0.8)
        overlap = int(overlap * 1.3)

    # Dense content (long average sentences) gets much smaller chunks.
    period_count = text.count('.')
    avg_sentence_length = len(text) / period_count if period_count > 0 else 100
    if avg_sentence_length > 200:
        chunk_size = int(chunk_size * 0.6)
        overlap = int(overlap * 1.5)

    # Clamp to sane bounds; overlap never exceeds a third of the chunk.
    chunk_size = max(3000, min(chunk_size, 20000))
    overlap = max(500, min(overlap, chunk_size // 3))

    return chunk_size, overlap
184
+
185
def smart_chunk_text(text: str, prompt: str, document_type: str = "general") -> List[str]:
    """
    Adaptive chunking front-end: short texts pass through as a single
    chunk; longer texts are split with a type- and prompt-aware size
    and overlap.
    """
    # Documents at or under 15K chars fit in one chunk — no splitting.
    if len(text) <= 15000:
        return [text]

    size, overlap = get_optimal_chunk_size(text, prompt, document_type)
    return chunk_text(text, size, overlap)
196
+
197
  def get_file_hash(file_path: str) -> str:
198
  """Generate hash for file caching"""
199
  with open(file_path, 'rb') as f: