JatsTheAIGen commited on
Commit
a929e66
·
1 Parent(s): ffff3e5

Simplify PDF Analysis Orchestrator: Remove Document Type Analysis and fix context length errors

Browse files

- Removed complex Document Type Analysis feature for better usability
- Implemented hierarchical summarization to handle large documents
- Added token counting utilities to prevent context length exceeded errors
- Simplified UI by removing document type selection
- Streamlined agent processing without dynamic token calculation
- Maintained all core analysis functionality with improved reliability

Files changed (3) hide show
  1. agents.py +20 -92
  2. app.py +4 -29
  3. utils/__init__.py +77 -51
agents.py CHANGED
@@ -38,70 +38,6 @@ class AnalysisAgent(BaseAgent):
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
41
- def _detect_document_type(self, text: str, prompt: str) -> str:
42
- """Detect document type based on content and prompt"""
43
- text_lower = text.lower()
44
- prompt_lower = prompt.lower()
45
-
46
- # Technical documents
47
- if any(keyword in text_lower for keyword in ['api', 'function', 'method', 'class', 'code', 'implementation', 'technical specification']):
48
- return "technical"
49
-
50
- # Financial documents
51
- if any(keyword in text_lower for keyword in ['revenue', 'profit', 'financial', 'balance sheet', 'income statement', 'cash flow', 'budget']):
52
- return "financial"
53
-
54
- # Legal documents
55
- if any(keyword in text_lower for keyword in ['agreement', 'contract', 'terms', 'conditions', 'liability', 'legal', 'jurisdiction']):
56
- return "legal"
57
-
58
- # Academic papers
59
- if any(keyword in text_lower for keyword in ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'research', 'study']):
60
- return "academic"
61
-
62
- # Business documents
63
- if any(keyword in text_lower for keyword in ['business plan', 'strategy', 'market', 'customer', 'product', 'service']):
64
- return "business"
65
-
66
- # Creative content
67
- if any(keyword in text_lower for keyword in ['creative', 'design', 'marketing', 'brand', 'advertising']):
68
- return "creative"
69
-
70
- # Check prompt for hints
71
- if any(keyword in prompt_lower for keyword in ['technical', 'financial', 'legal', 'academic', 'business', 'creative']):
72
- return prompt_lower.split()[0] # Use first keyword from prompt
73
-
74
- return "general"
75
-
76
- def _calculate_dynamic_tokens(self, prompt: str, text_length: int, document_type: str = "general") -> int:
77
- """Calculate dynamic token allocation based on prompt complexity, text length, and document type"""
78
- base_tokens = Config.OPENAI_MAX_TOKENS
79
-
80
- # Increase tokens for complex prompts
81
- complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete', 'extensive']
82
- complexity_multiplier = 1.0
83
- for keyword in complex_keywords:
84
- if keyword.lower() in prompt.lower():
85
- complexity_multiplier += 0.3
86
-
87
- # Increase tokens for longer documents
88
- length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
89
-
90
- # Increase tokens for specific document types
91
- doc_type_multipliers = {
92
- "technical": 1.3,
93
- "financial": 1.4,
94
- "legal": 1.5,
95
- "academic": 1.2,
96
- "business": 1.1,
97
- "creative": 1.0,
98
- "general": 1.0
99
- }
100
- doc_type_multiplier = doc_type_multipliers.get(document_type, 1.0)
101
-
102
- final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
103
- return min(final_tokens, 4000) # Cap at 4000 tokens
104
-
105
  async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
106
  start_time = time.time()
107
 
@@ -119,20 +55,16 @@ class AnalysisAgent(BaseAgent):
119
  # Load text with caching
120
  text = load_pdf_text_cached(file_path)
121
 
122
- # Detect document type
123
- document_type = self._detect_document_type(text, prompt)
124
- metadata['document_type'] = document_type
125
-
126
  # Check if document needs chunking
127
  if len(text) > Config.CHUNK_SIZE:
128
- result = await self._handle_large_document(prompt, text, metadata, document_type)
129
  else:
130
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
131
- result = await self._process_content(prompt, content, metadata, text, document_type)
132
  else:
133
  content = f"User prompt: {prompt}"
134
  metadata = {}
135
- result = await self._process_content(prompt, content, metadata, "", "general")
136
 
137
  # Cache the result
138
  if file_path:
@@ -140,12 +72,12 @@ class AnalysisAgent(BaseAgent):
140
 
141
  return result
142
 
143
- async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str, document_type: str = "general") -> Dict[str, Any]:
144
- """Process content with dynamic token allocation and visual formatting"""
145
  start_time = time.time()
146
 
147
- # Calculate dynamic tokens
148
- max_tokens = self._calculate_dynamic_tokens(prompt, len(text), document_type)
149
 
150
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
151
 
@@ -210,15 +142,13 @@ VISUAL ELEMENTS TO USE:
210
 
211
  return result
212
 
213
- async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any], document_type: str = "general") -> Dict[str, Any]:
214
- """Handle large documents by processing in smart chunks"""
215
- # Use smart chunking based on document type and content
216
- chunks = smart_chunk_text(text, prompt, document_type)
217
-
218
- # Get optimal chunk size for display
219
- optimal_size, optimal_overlap = get_optimal_chunk_size(text, prompt, document_type)
220
- metadata['chunk_size'] = optimal_size
221
- metadata['chunk_overlap'] = optimal_overlap
222
  metadata['total_chunks'] = len(chunks)
223
  chunk_results = []
224
 
@@ -243,16 +173,14 @@ VISUAL ELEMENTS TO USE:
243
  # Combine chunk results
244
  combined_analysis = "\n\n".join(chunk_results)
245
 
246
- # Create final summary
247
- summary_prompt = f"Please provide a comprehensive summary that combines insights from all chunks of this large document. Original prompt: {prompt}\n\nChunk analyses:\n{combined_analysis}"
248
-
249
  try:
250
- final_summary = await call_openai_chat(
 
 
 
251
  model=self.model,
252
- messages=[{"role": "system", "content": "You are AnalysisAgent: create comprehensive summaries from multiple document chunks."},
253
- {"role": "user", "content": summary_prompt}],
254
- temperature=Config.OPENAI_TEMPERATURE,
255
- max_tokens=Config.OPENAI_MAX_TOKENS
256
  )
257
  except Exception as e:
258
  logger.exception("AnalysisAgent failed on final summary")
 
38
  super().__init__(name, model, tasks_completed)
39
  self.visual_generator = VisualOutputGenerator()
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
42
  start_time = time.time()
43
 
 
55
  # Load text with caching
56
  text = load_pdf_text_cached(file_path)
57
 
 
 
 
 
58
  # Check if document needs chunking
59
  if len(text) > Config.CHUNK_SIZE:
60
+ result = await self._handle_large_document(prompt, text, metadata)
61
  else:
62
  content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
63
+ result = await self._process_content(prompt, content, metadata, text)
64
  else:
65
  content = f"User prompt: {prompt}"
66
  metadata = {}
67
+ result = await self._process_content(prompt, content, metadata, "")
68
 
69
  # Cache the result
70
  if file_path:
 
72
 
73
  return result
74
 
75
+ async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
76
+ """Process content with visual formatting"""
77
  start_time = time.time()
78
 
79
+ # Use standard token allocation
80
+ max_tokens = Config.OPENAI_MAX_TOKENS
81
 
82
  system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
83
 
 
142
 
143
  return result
144
 
145
+ async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
146
+ """Handle large documents by processing in chunks"""
147
+ # Use standard chunking
148
+ from utils import chunk_text
149
+ chunks = chunk_text(text, Config.CHUNK_SIZE)
150
+ metadata['chunk_size'] = Config.CHUNK_SIZE
151
+ metadata['chunk_overlap'] = 1000
 
 
152
  metadata['total_chunks'] = len(chunks)
153
  chunk_results = []
154
 
 
173
  # Combine chunk results
174
  combined_analysis = "\n\n".join(chunk_results)
175
 
176
+ # Create final summary using hierarchical approach to avoid token limits
 
 
177
  try:
178
+ from utils import create_hierarchical_summary
179
+ final_summary = await create_hierarchical_summary(
180
+ chunk_results=chunk_results,
181
+ prompt=prompt,
182
  model=self.model,
183
+ max_tokens=6000 # Conservative limit to avoid context length errors
 
 
 
184
  )
185
  except Exception as e:
186
  logger.exception("AnalysisAgent failed on final summary")
app.py CHANGED
@@ -240,38 +240,13 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
240
  username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
241
 
242
  # Custom Prompts Section
243
- with gr.Accordion("🎯 Document Type Analysis", open=True):
244
- gr.Markdown("**Choose a document type for specialized analysis:**")
245
  prompt_dropdown = gr.Dropdown(
246
  choices=get_custom_prompts(),
247
- label="📋 Select Document Type",
248
- value=None,
249
- info="Choose the type of document you're analyzing for better results"
250
  )
251
- load_prompt_btn = gr.Button("📥 Load Analysis Template", size="sm", variant="secondary")
252
-
253
- # Document type categories
254
- with gr.Row():
255
- gr.Markdown("**Quick Categories:**")
256
- with gr.Row():
257
- gr.Markdown("📄 **Business:** Whitepapers, Business Plans")
258
- gr.Markdown("⚙️ **Technical:** User Manuals, Specs")
259
- with gr.Row():
260
- gr.Markdown("💰 **Financial:** Reports, Bank Statements")
261
- gr.Markdown("🎓 **Academic:** Research Papers")
262
- with gr.Row():
263
- gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
264
- gr.Markdown("🎨 **Creative:** Briefs, Marketing")
265
-
266
- # Smart processing info
267
- gr.Markdown("**🧠 Smart Processing:**")
268
- gr.Markdown("• **Auto-optimized chunk sizes** based on document type")
269
- gr.Markdown("• **Technical docs**: 8K chars (dense content)")
270
- gr.Markdown("• **Financial docs**: 6K chars (precise data)")
271
- gr.Markdown("• **Legal docs**: 5K chars (detailed terms)")
272
- gr.Markdown("• **Academic papers**: 10K chars (research)")
273
- gr.Markdown("• **Business docs**: 12K chars (standard)")
274
- gr.Markdown("• **Creative content**: 18K chars (narrative)")
275
 
276
  with gr.Column(scale=2):
277
  gr.Markdown("### Analysis Instructions")
 
240
  username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
241
 
242
  # Custom Prompts Section
243
+ with gr.Accordion("🎯 Custom Prompts", open=False):
 
244
  prompt_dropdown = gr.Dropdown(
245
  choices=get_custom_prompts(),
246
+ label="Select Custom Prompt",
247
+ value=None
 
248
  )
249
+ load_prompt_btn = gr.Button("Load Prompt", size="sm")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
 
251
  with gr.Column(scale=2):
252
  gr.Markdown("### Analysis Instructions")
utils/__init__.py CHANGED
@@ -139,65 +139,91 @@ def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[
139
 
140
  return chunks
141
 
142
- def get_optimal_chunk_size(text: str, prompt: str, document_type: str = "general") -> tuple[int, int]:
143
- """
144
- Calculate optimal chunk size and overlap based on content and analysis type
145
- """
146
- base_chunk_size = 15000
147
- base_overlap = 1000
148
-
149
- # Adjust based on document type
150
- type_adjustments = {
151
- "technical": (8000, 1200), # Smaller chunks for technical docs
152
- "financial": (6000, 1000), # Even smaller for financial data
153
- "legal": (5000, 800), # Small chunks for legal precision
154
- "academic": (10000, 1500), # Medium chunks for academic papers
155
- "business": (12000, 1000), # Standard for business docs
156
- "creative": (18000, 1500), # Larger for creative content
157
- "general": (15000, 1000) # Default
158
- }
 
 
 
 
159
 
160
- chunk_size, overlap = type_adjustments.get(document_type, (base_chunk_size, base_overlap))
 
 
 
 
 
 
 
 
161
 
162
- # Adjust based on prompt complexity
163
- complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete']
164
- if any(keyword in prompt.lower() for keyword in complex_keywords):
165
- chunk_size = int(chunk_size * 0.7) # Smaller chunks for complex analysis
166
- overlap = int(overlap * 1.2) # More overlap for better context
167
 
168
- # Adjust based on text length
169
- if len(text) > 100000: # Very long documents
170
- chunk_size = int(chunk_size * 0.8) # Smaller chunks
171
- overlap = int(overlap * 1.3) # More overlap
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
- # Adjust based on content density
174
- avg_sentence_length = len(text) / text.count('.') if text.count('.') > 0 else 100
175
- if avg_sentence_length > 200: # Dense technical content
176
- chunk_size = int(chunk_size * 0.6) # Much smaller chunks
177
- overlap = int(overlap * 1.5) # Much more overlap
178
 
179
- # Ensure minimum and maximum bounds
180
- chunk_size = max(3000, min(chunk_size, 20000))
181
- overlap = max(500, min(overlap, chunk_size // 3))
182
 
183
- return chunk_size, overlap
184
-
185
- def smart_chunk_text(text: str, prompt: str, document_type: str = "general") -> List[str]:
186
- """
187
- Smart chunking that adapts to content and analysis needs
188
- """
189
- if len(text) <= 15000: # Small documents don't need chunking
190
- return [text]
191
 
192
- chunk_size, overlap = get_optimal_chunk_size(text, prompt, document_type)
193
 
194
- # Use the optimized chunking
195
- return chunk_text(text, chunk_size, overlap)
196
-
197
- def get_file_hash(file_path: str) -> str:
198
- """Generate hash for file caching"""
199
- with open(file_path, 'rb') as f:
200
- return hashlib.md5(f.read()).hexdigest()
 
 
 
 
 
 
201
 
202
  # ------------------------
203
  # Enhanced Caching System
 
139
 
140
  return chunks
141
 
142
+
143
+ def get_file_hash(file_path: str) -> str:
144
+ """Generate hash for file caching"""
145
+ with open(file_path, 'rb') as f:
146
+ return hashlib.md5(f.read()).hexdigest()
147
+
148
+ # ------------------------
149
+ # Token Counting Utilities
150
+ # ------------------------
151
+ def estimate_tokens(text: str) -> int:
152
+ """Rough estimation of token count (1 token ≈ 4 characters for English)"""
153
+ return len(text) // 4
154
+
155
+ def is_within_token_limit(text: str, max_tokens: int = 6000) -> bool:
156
+ """Check if text is within token limit for API calls"""
157
+ return estimate_tokens(text) <= max_tokens
158
+
159
+ def truncate_to_token_limit(text: str, max_tokens: int = 6000) -> str:
160
+ """Truncate text to fit within token limit"""
161
+ if is_within_token_limit(text, max_tokens):
162
+ return text
163
 
164
+ # Rough character limit based on token estimation
165
+ char_limit = max_tokens * 4
166
+ return text[:char_limit] + "\n\n[Content truncated due to length...]"
167
+
168
+ # ------------------------
169
+ # Hierarchical Summarization
170
+ # ------------------------
171
+ async def create_hierarchical_summary(chunk_results: List[str], prompt: str, model: str, max_tokens: int = 6000) -> str:
172
+ """Create a summary using hierarchical approach to avoid token limits"""
173
 
174
+ # First, create intermediate summaries of groups of chunks
175
+ intermediate_summaries = []
176
+ group_size = 3 # Process 3 chunks at a time
 
 
177
 
178
+ for i in range(0, len(chunk_results), group_size):
179
+ group = chunk_results[i:i + group_size]
180
+ group_text = "\n\n".join(group)
181
+
182
+ # Truncate if too long
183
+ if not is_within_token_limit(group_text, max_tokens):
184
+ group_text = truncate_to_token_limit(group_text, max_tokens)
185
+
186
+ group_prompt = f"Summarize the following chunk analyses, focusing on key insights and findings:\n\n{group_text}"
187
+
188
+ try:
189
+ summary = await call_openai_chat(
190
+ model=model,
191
+ messages=[
192
+ {"role": "system", "content": "You are a summarization expert. Create concise summaries that capture the most important insights."},
193
+ {"role": "user", "content": group_prompt}
194
+ ],
195
+ temperature=0.2,
196
+ max_tokens=800
197
+ )
198
+ intermediate_summaries.append(f"Group {i//group_size + 1} Summary:\n{summary}")
199
+ except Exception as e:
200
+ intermediate_summaries.append(f"Group {i//group_size + 1} Summary:\nError: {str(e)}")
201
 
202
+ # Now create final summary from intermediate summaries
203
+ if len(intermediate_summaries) == 1:
204
+ return intermediate_summaries[0]
 
 
205
 
206
+ final_text = "\n\n".join(intermediate_summaries)
 
 
207
 
208
+ # If still too long, create another level of summarization
209
+ if not is_within_token_limit(final_text, max_tokens):
210
+ final_text = truncate_to_token_limit(final_text, max_tokens)
 
 
 
 
 
211
 
212
+ final_prompt = f"Create a comprehensive final summary based on the following intermediate summaries. Original prompt: {prompt}\n\n{final_text}"
213
 
214
+ try:
215
+ final_summary = await call_openai_chat(
216
+ model=model,
217
+ messages=[
218
+ {"role": "system", "content": "You are an expert at creating comprehensive summaries from multiple sources. Synthesize the key insights into a coherent final summary."},
219
+ {"role": "user", "content": final_prompt}
220
+ ],
221
+ temperature=0.2,
222
+ max_tokens=1000
223
+ )
224
+ return final_summary
225
+ except Exception as e:
226
+ return f"Error creating final summary: {str(e)}\n\nIntermediate summaries:\n{final_text}"
227
 
228
  # ------------------------
229
  # Enhanced Caching System