Commit ·
a929e66
1
Parent(s): ffff3e5
Simplify PDF Analysis Orchestrator: Remove Document Type Analysis and fix context length errors
Browse files

- Removed complex Document Type Analysis feature for better usability
- Implemented hierarchical summarization to handle large documents
- Added token counting utilities to prevent context length exceeded errors
- Simplified UI by removing document type selection
- Streamlined agent processing without dynamic token calculation
- Maintained all core analysis functionality with improved reliability
- agents.py +20 -92
- app.py +4 -29
- utils/__init__.py +77 -51
agents.py
CHANGED
|
@@ -38,70 +38,6 @@ class AnalysisAgent(BaseAgent):
|
|
| 38 |
super().__init__(name, model, tasks_completed)
|
| 39 |
self.visual_generator = VisualOutputGenerator()
|
| 40 |
|
| 41 |
-
def _detect_document_type(self, text: str, prompt: str) -> str:
|
| 42 |
-
"""Detect document type based on content and prompt"""
|
| 43 |
-
text_lower = text.lower()
|
| 44 |
-
prompt_lower = prompt.lower()
|
| 45 |
-
|
| 46 |
-
# Technical documents
|
| 47 |
-
if any(keyword in text_lower for keyword in ['api', 'function', 'method', 'class', 'code', 'implementation', 'technical specification']):
|
| 48 |
-
return "technical"
|
| 49 |
-
|
| 50 |
-
# Financial documents
|
| 51 |
-
if any(keyword in text_lower for keyword in ['revenue', 'profit', 'financial', 'balance sheet', 'income statement', 'cash flow', 'budget']):
|
| 52 |
-
return "financial"
|
| 53 |
-
|
| 54 |
-
# Legal documents
|
| 55 |
-
if any(keyword in text_lower for keyword in ['agreement', 'contract', 'terms', 'conditions', 'liability', 'legal', 'jurisdiction']):
|
| 56 |
-
return "legal"
|
| 57 |
-
|
| 58 |
-
# Academic papers
|
| 59 |
-
if any(keyword in text_lower for keyword in ['abstract', 'introduction', 'methodology', 'conclusion', 'references', 'research', 'study']):
|
| 60 |
-
return "academic"
|
| 61 |
-
|
| 62 |
-
# Business documents
|
| 63 |
-
if any(keyword in text_lower for keyword in ['business plan', 'strategy', 'market', 'customer', 'product', 'service']):
|
| 64 |
-
return "business"
|
| 65 |
-
|
| 66 |
-
# Creative content
|
| 67 |
-
if any(keyword in text_lower for keyword in ['creative', 'design', 'marketing', 'brand', 'advertising']):
|
| 68 |
-
return "creative"
|
| 69 |
-
|
| 70 |
-
# Check prompt for hints
|
| 71 |
-
if any(keyword in prompt_lower for keyword in ['technical', 'financial', 'legal', 'academic', 'business', 'creative']):
|
| 72 |
-
return prompt_lower.split()[0] # Use first keyword from prompt
|
| 73 |
-
|
| 74 |
-
return "general"
|
| 75 |
-
|
| 76 |
-
def _calculate_dynamic_tokens(self, prompt: str, text_length: int, document_type: str = "general") -> int:
|
| 77 |
-
"""Calculate dynamic token allocation based on prompt complexity, text length, and document type"""
|
| 78 |
-
base_tokens = Config.OPENAI_MAX_TOKENS
|
| 79 |
-
|
| 80 |
-
# Increase tokens for complex prompts
|
| 81 |
-
complex_keywords = ['analyze', 'comprehensive', 'detailed', 'thorough', 'complete', 'extensive']
|
| 82 |
-
complexity_multiplier = 1.0
|
| 83 |
-
for keyword in complex_keywords:
|
| 84 |
-
if keyword.lower() in prompt.lower():
|
| 85 |
-
complexity_multiplier += 0.3
|
| 86 |
-
|
| 87 |
-
# Increase tokens for longer documents
|
| 88 |
-
length_multiplier = min(2.0, 1.0 + (text_length / 50000)) # Cap at 2x for very long docs
|
| 89 |
-
|
| 90 |
-
# Increase tokens for specific document types
|
| 91 |
-
doc_type_multipliers = {
|
| 92 |
-
"technical": 1.3,
|
| 93 |
-
"financial": 1.4,
|
| 94 |
-
"legal": 1.5,
|
| 95 |
-
"academic": 1.2,
|
| 96 |
-
"business": 1.1,
|
| 97 |
-
"creative": 1.0,
|
| 98 |
-
"general": 1.0
|
| 99 |
-
}
|
| 100 |
-
doc_type_multiplier = doc_type_multipliers.get(document_type, 1.0)
|
| 101 |
-
|
| 102 |
-
final_tokens = int(base_tokens * complexity_multiplier * length_multiplier * doc_type_multiplier)
|
| 103 |
-
return min(final_tokens, 4000) # Cap at 4000 tokens
|
| 104 |
-
|
| 105 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 106 |
start_time = time.time()
|
| 107 |
|
|
@@ -119,20 +55,16 @@ class AnalysisAgent(BaseAgent):
|
|
| 119 |
# Load text with caching
|
| 120 |
text = load_pdf_text_cached(file_path)
|
| 121 |
|
| 122 |
-
# Detect document type
|
| 123 |
-
document_type = self._detect_document_type(text, prompt)
|
| 124 |
-
metadata['document_type'] = document_type
|
| 125 |
-
|
| 126 |
# Check if document needs chunking
|
| 127 |
if len(text) > Config.CHUNK_SIZE:
|
| 128 |
-
result = await self._handle_large_document(prompt, text, metadata
|
| 129 |
else:
|
| 130 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
| 131 |
-
result = await self._process_content(prompt, content, metadata, text
|
| 132 |
else:
|
| 133 |
content = f"User prompt: {prompt}"
|
| 134 |
metadata = {}
|
| 135 |
-
result = await self._process_content(prompt, content, metadata, ""
|
| 136 |
|
| 137 |
# Cache the result
|
| 138 |
if file_path:
|
|
@@ -140,12 +72,12 @@ class AnalysisAgent(BaseAgent):
|
|
| 140 |
|
| 141 |
return result
|
| 142 |
|
| 143 |
-
async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str
|
| 144 |
-
"""Process content with
|
| 145 |
start_time = time.time()
|
| 146 |
|
| 147 |
-
#
|
| 148 |
-
max_tokens =
|
| 149 |
|
| 150 |
system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
|
| 151 |
|
|
@@ -210,15 +142,13 @@ VISUAL ELEMENTS TO USE:
|
|
| 210 |
|
| 211 |
return result
|
| 212 |
|
| 213 |
-
async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]
|
| 214 |
-
"""Handle large documents by processing in
|
| 215 |
-
# Use
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
metadata['chunk_size'] = optimal_size
|
| 221 |
-
metadata['chunk_overlap'] = optimal_overlap
|
| 222 |
metadata['total_chunks'] = len(chunks)
|
| 223 |
chunk_results = []
|
| 224 |
|
|
@@ -243,16 +173,14 @@ VISUAL ELEMENTS TO USE:
|
|
| 243 |
# Combine chunk results
|
| 244 |
combined_analysis = "\n\n".join(chunk_results)
|
| 245 |
|
| 246 |
-
# Create final summary
|
| 247 |
-
summary_prompt = f"Please provide a comprehensive summary that combines insights from all chunks of this large document. Original prompt: {prompt}\n\nChunk analyses:\n{combined_analysis}"
|
| 248 |
-
|
| 249 |
try:
|
| 250 |
-
|
|
|
|
|
|
|
|
|
|
| 251 |
model=self.model,
|
| 252 |
-
|
| 253 |
-
{"role": "user", "content": summary_prompt}],
|
| 254 |
-
temperature=Config.OPENAI_TEMPERATURE,
|
| 255 |
-
max_tokens=Config.OPENAI_MAX_TOKENS
|
| 256 |
)
|
| 257 |
except Exception as e:
|
| 258 |
logger.exception("AnalysisAgent failed on final summary")
|
|
|
|
| 38 |
super().__init__(name, model, tasks_completed)
|
| 39 |
self.visual_generator = VisualOutputGenerator()
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
async def handle(self, user_id: str, prompt: str, file_path: Optional[str] = None, context: Optional[Dict[str, Any]] = None):
|
| 42 |
start_time = time.time()
|
| 43 |
|
|
|
|
| 55 |
# Load text with caching
|
| 56 |
text = load_pdf_text_cached(file_path)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# Check if document needs chunking
|
| 59 |
if len(text) > Config.CHUNK_SIZE:
|
| 60 |
+
result = await self._handle_large_document(prompt, text, metadata)
|
| 61 |
else:
|
| 62 |
content = f"User prompt: {prompt}\n\nDocument text:\n{text}"
|
| 63 |
+
result = await self._process_content(prompt, content, metadata, text)
|
| 64 |
else:
|
| 65 |
content = f"User prompt: {prompt}"
|
| 66 |
metadata = {}
|
| 67 |
+
result = await self._process_content(prompt, content, metadata, "")
|
| 68 |
|
| 69 |
# Cache the result
|
| 70 |
if file_path:
|
|
|
|
| 72 |
|
| 73 |
return result
|
| 74 |
|
| 75 |
+
async def _process_content(self, prompt: str, content: str, metadata: Dict[str, Any], text: str) -> Dict[str, Any]:
|
| 76 |
+
"""Process content with visual formatting"""
|
| 77 |
start_time = time.time()
|
| 78 |
|
| 79 |
+
# Use standard token allocation
|
| 80 |
+
max_tokens = Config.OPENAI_MAX_TOKENS
|
| 81 |
|
| 82 |
system = """You are AnalysisAgent: produce stunning, visually rich, and highly engaging insights.
|
| 83 |
|
|
|
|
| 142 |
|
| 143 |
return result
|
| 144 |
|
| 145 |
+
async def _handle_large_document(self, prompt: str, text: str, metadata: Dict[str, Any]) -> Dict[str, Any]:
|
| 146 |
+
"""Handle large documents by processing in chunks"""
|
| 147 |
+
# Use standard chunking
|
| 148 |
+
from utils import chunk_text
|
| 149 |
+
chunks = chunk_text(text, Config.CHUNK_SIZE)
|
| 150 |
+
metadata['chunk_size'] = Config.CHUNK_SIZE
|
| 151 |
+
metadata['chunk_overlap'] = 1000
|
|
|
|
|
|
|
| 152 |
metadata['total_chunks'] = len(chunks)
|
| 153 |
chunk_results = []
|
| 154 |
|
|
|
|
| 173 |
# Combine chunk results
|
| 174 |
combined_analysis = "\n\n".join(chunk_results)
|
| 175 |
|
| 176 |
+
# Create final summary using hierarchical approach to avoid token limits
|
|
|
|
|
|
|
| 177 |
try:
|
| 178 |
+
from utils import create_hierarchical_summary
|
| 179 |
+
final_summary = await create_hierarchical_summary(
|
| 180 |
+
chunk_results=chunk_results,
|
| 181 |
+
prompt=prompt,
|
| 182 |
model=self.model,
|
| 183 |
+
max_tokens=6000 # Conservative limit to avoid context length errors
|
|
|
|
|
|
|
|
|
|
| 184 |
)
|
| 185 |
except Exception as e:
|
| 186 |
logger.exception("AnalysisAgent failed on final summary")
|
app.py
CHANGED
|
@@ -240,38 +240,13 @@ with gr.Blocks(title="PDF Analysis & Orchestrator", theme=gr.themes.Soft()) as d
|
|
| 240 |
username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
|
| 241 |
|
| 242 |
# Custom Prompts Section
|
| 243 |
-
with gr.Accordion("🎯
|
| 244 |
-
gr.Markdown("**Choose a document type for specialized analysis:**")
|
| 245 |
prompt_dropdown = gr.Dropdown(
|
| 246 |
choices=get_custom_prompts(),
|
| 247 |
-
label="
|
| 248 |
-
value=None
|
| 249 |
-
info="Choose the type of document you're analyzing for better results"
|
| 250 |
)
|
| 251 |
-
load_prompt_btn = gr.Button("
|
| 252 |
-
|
| 253 |
-
# Document type categories
|
| 254 |
-
with gr.Row():
|
| 255 |
-
gr.Markdown("**Quick Categories:**")
|
| 256 |
-
with gr.Row():
|
| 257 |
-
gr.Markdown("📄 **Business:** Whitepapers, Business Plans")
|
| 258 |
-
gr.Markdown("⚙️ **Technical:** User Manuals, Specs")
|
| 259 |
-
with gr.Row():
|
| 260 |
-
gr.Markdown("💰 **Financial:** Reports, Bank Statements")
|
| 261 |
-
gr.Markdown("🎓 **Academic:** Research Papers")
|
| 262 |
-
with gr.Row():
|
| 263 |
-
gr.Markdown("⚖️ **Legal:** Contracts, Agreements")
|
| 264 |
-
gr.Markdown("🎨 **Creative:** Briefs, Marketing")
|
| 265 |
-
|
| 266 |
-
# Smart processing info
|
| 267 |
-
gr.Markdown("**🧠 Smart Processing:**")
|
| 268 |
-
gr.Markdown("• **Auto-optimized chunk sizes** based on document type")
|
| 269 |
-
gr.Markdown("• **Technical docs**: 8K chars (dense content)")
|
| 270 |
-
gr.Markdown("• **Financial docs**: 6K chars (precise data)")
|
| 271 |
-
gr.Markdown("• **Legal docs**: 5K chars (detailed terms)")
|
| 272 |
-
gr.Markdown("• **Academic papers**: 10K chars (research)")
|
| 273 |
-
gr.Markdown("• **Business docs**: 12K chars (standard)")
|
| 274 |
-
gr.Markdown("• **Creative content**: 18K chars (narrative)")
|
| 275 |
|
| 276 |
with gr.Column(scale=2):
|
| 277 |
gr.Markdown("### Analysis Instructions")
|
|
|
|
| 240 |
username_input = gr.Textbox(label="Username (optional)", placeholder="anonymous", elem_id="username")
|
| 241 |
|
| 242 |
# Custom Prompts Section
|
| 243 |
+
with gr.Accordion("🎯 Custom Prompts", open=False):
|
|
|
|
| 244 |
prompt_dropdown = gr.Dropdown(
|
| 245 |
choices=get_custom_prompts(),
|
| 246 |
+
label="Select Custom Prompt",
|
| 247 |
+
value=None
|
|
|
|
| 248 |
)
|
| 249 |
+
load_prompt_btn = gr.Button("Load Prompt", size="sm")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
with gr.Column(scale=2):
|
| 252 |
gr.Markdown("### Analysis Instructions")
|
utils/__init__.py
CHANGED
|
@@ -139,65 +139,91 @@ def chunk_text(text: str, chunk_size: int = 15000, overlap: int = 1000) -> List[
|
|
| 139 |
|
| 140 |
return chunks
|
| 141 |
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
chunk_size = int(chunk_size * 0.7) # Smaller chunks for complex analysis
|
| 166 |
-
overlap = int(overlap * 1.2) # More overlap for better context
|
| 167 |
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
|
| 173 |
-
#
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
chunk_size = int(chunk_size * 0.6) # Much smaller chunks
|
| 177 |
-
overlap = int(overlap * 1.5) # Much more overlap
|
| 178 |
|
| 179 |
-
|
| 180 |
-
chunk_size = max(3000, min(chunk_size, 20000))
|
| 181 |
-
overlap = max(500, min(overlap, chunk_size // 3))
|
| 182 |
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
"""
|
| 187 |
-
Smart chunking that adapts to content and analysis needs
|
| 188 |
-
"""
|
| 189 |
-
if len(text) <= 15000: # Small documents don't need chunking
|
| 190 |
-
return [text]
|
| 191 |
|
| 192 |
-
|
| 193 |
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
|
| 202 |
# ------------------------
|
| 203 |
# Enhanced Caching System
|
|
|
|
| 139 |
|
| 140 |
return chunks
|
| 141 |
|
| 142 |
+
|
| 143 |
+
def get_file_hash(file_path: str) -> str:
    """Generate hash for file caching"""
    # MD5 is acceptable here: the digest is only a cache key, not a
    # security boundary.
    digest = hashlib.md5()
    with open(file_path, 'rb') as fh:
        digest.update(fh.read())
    return digest.hexdigest()
|
| 147 |
+
|
| 148 |
+
# ------------------------
|
| 149 |
+
# Token Counting Utilities
|
| 150 |
+
# ------------------------
|
| 151 |
+
def estimate_tokens(text: str) -> int:
    """Rough estimation of token count (1 token ≈ 4 characters for English)"""
    # Integer division: roughly four English characters per model token.
    char_count = len(text)
    return char_count // 4
|
| 154 |
+
|
| 155 |
+
def is_within_token_limit(text: str, max_tokens: int = 6000) -> bool:
    """Check if text is within token limit for API calls"""
    # Delegate to the shared 4-chars-per-token heuristic so both helpers
    # always agree on what "fits".
    estimated = estimate_tokens(text)
    return estimated <= max_tokens
|
| 158 |
+
|
| 159 |
+
def truncate_to_token_limit(text: str, max_tokens: int = 6000) -> str:
    """Truncate text to fit within token limit.

    Uses the same heuristic as ``estimate_tokens`` (1 token ≈ 4 characters),
    so the character budget is ``max_tokens * 4``.

    Fix: the previous version cut the text at the character budget and THEN
    appended the truncation marker, so the returned string exceeded the very
    limit this function promises to enforce. The marker's length is now
    reserved inside the budget.

    Args:
        text: The text to bound.
        max_tokens: Approximate token budget for the result.

    Returns:
        ``text`` unchanged when it already fits; otherwise a prefix of
        ``text`` plus a truncation marker, together within the budget
        (unless the budget is smaller than the marker itself, in which
        case only the marker is returned).
    """
    char_limit = max_tokens * 4
    if len(text) <= char_limit:
        return text

    marker = "\n\n[Content truncated due to length...]"
    # Reserve room for the marker so the final string stays within budget.
    keep = max(0, char_limit - len(marker))
    return text[:keep] + marker
|
| 167 |
+
|
| 168 |
+
# ------------------------
|
| 169 |
+
# Hierarchical Summarization
|
| 170 |
+
# ------------------------
|
| 171 |
+
async def create_hierarchical_summary(chunk_results: List[str], prompt: str, model: str, max_tokens: int = 6000) -> str:
    """Create a summary using hierarchical approach to avoid token limits"""
    # Stage 1: condense the per-chunk analyses in fixed-size groups so no
    # single request blows past the model's context window.
    group_size = 3  # Process 3 chunks at a time
    intermediate_summaries = []

    for start in range(0, len(chunk_results), group_size):
        group_text = "\n\n".join(chunk_results[start:start + group_size])

        # Keep each group request inside the token budget.
        if not is_within_token_limit(group_text, max_tokens):
            group_text = truncate_to_token_limit(group_text, max_tokens)

        group_prompt = f"Summarize the following chunk analyses, focusing on key insights and findings:\n\n{group_text}"
        label = f"Group {start // group_size + 1} Summary:"

        try:
            summary = await call_openai_chat(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a summarization expert. Create concise summaries that capture the most important insights."},
                    {"role": "user", "content": group_prompt}
                ],
                temperature=0.2,
                max_tokens=800
            )
            intermediate_summaries.append(f"{label}\n{summary}")
        except Exception as e:
            # A failed group is recorded inline so the final summary still
            # covers the remaining groups.
            intermediate_summaries.append(f"{label}\nError: {str(e)}")

    # Stage 2: a single group needs no further synthesis.
    if len(intermediate_summaries) == 1:
        return intermediate_summaries[0]

    final_text = "\n\n".join(intermediate_summaries)

    # NOTE(review): oversized intermediate text is truncated here rather than
    # re-summarized in another pass — some detail can be dropped.
    if not is_within_token_limit(final_text, max_tokens):
        final_text = truncate_to_token_limit(final_text, max_tokens)

    final_prompt = f"Create a comprehensive final summary based on the following intermediate summaries. Original prompt: {prompt}\n\n{final_text}"

    try:
        final_summary = await call_openai_chat(
            model=model,
            messages=[
                {"role": "system", "content": "You are an expert at creating comprehensive summaries from multiple sources. Synthesize the key insights into a coherent final summary."},
                {"role": "user", "content": final_prompt}
            ],
            temperature=0.2,
            max_tokens=1000
        )
        return final_summary
    except Exception as e:
        return f"Error creating final summary: {str(e)}\n\nIntermediate summaries:\n{final_text}"
|
| 227 |
|
| 228 |
# ------------------------
|
| 229 |
# Enhanced Caching System
|