Navada25 committed on
Commit
0a0ec2a
·
verified ·
1 Parent(s): 149e33a

Update document_intelligence.py with stock analysis features

Browse files
Files changed (1) hide show
  1. document_intelligence.py +569 -0
document_intelligence.py ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AI-Powered Document Intelligence System for NAVADA
2
+ """
3
+ Advanced document intelligence system providing:
4
+ - Smart content suggestions while editing documents
5
+ - Auto-completion of financial projections based on industry data
6
+ - Compliance checking for regulatory requirements
7
+ - Risk assessment with real-time scoring
8
+ - Version control with diff tracking
9
+ """
10
+
11
+ import json
12
+ import re
13
+ from datetime import datetime
14
+ from typing import Dict, List, Optional, Any, Tuple
15
+ import pandas as pd
16
+ import numpy as np
17
+ from openai import OpenAI
18
+ import asyncio
19
+ import logging
20
+ from difflib import SequenceMatcher
21
+ import hashlib
22
+
23
class DocumentIntelligenceEngine:
    """AI-powered document intelligence and assistance system.

    Combines keyword heuristics (compliance, risk coverage, readability,
    version tracking) with OpenAI chat-completion calls (quality analysis,
    improvement suggestions, next-content suggestions). Every AI-backed method
    falls back to a purely heuristic result when the API call or the parsing
    of its reply fails, so callers always receive a usable value.
    """

    # Number of leading characters of each version stored in its history record.
    _SNIPPET_LENGTH = 1000

    # (keyword that must appear in the rule, terms that count as evidence in
    # the document). Checked in order; the first matching keyword wins.
    _COMPLIANCE_CHECKS = (
        ('disclaimer', ('disclaimer', 'forward-looking', 'risk', 'projection')),
        ('source', ('source', 'reference', 'data from', 'according to')),
        ('assumption', ('assumption', 'estimate', 'projection', 'forecast')),
        ('risk', ('risk', 'uncertainty', 'challenge', 'limitation')),
    )

    # Whole-word matches only: a plain substring test for "early" also fires
    # on "clearly", "yearly" and "nearly", and "tam" fires on "contamination".
    _EARLY_STAGE_PATTERN = re.compile(r'\b(?:startups?|early)\b')
    _TAM_PATTERN = re.compile(r'\btam\b')

    # A sentence ends at a run of terminators that is followed (optionally
    # after closing quotes/brackets) by whitespace or the end of the text, so
    # "3.5%" and "..." are not counted as several sentences.
    _SENTENCE_END_PATTERN = re.compile(r'[.!?]+(?=["\')\]]*(?:\s|$))')

    def __init__(self, openai_client: "OpenAI"):
        """Create an engine bound to an OpenAI client.

        Args:
            openai_client: Client whose ``chat.completions.create`` method is
                used for all AI-backed analysis.
        """
        self.openai_client = openai_client
        # document_id -> list of version records (oldest first).
        self.document_versions = {}
        # document_id -> full text of the most recent version. The history
        # records only keep a snippet, which is not enough for a correct diff.
        self._latest_content = {}
        self.compliance_rules = self._load_compliance_rules()
        self.industry_benchmarks = self._load_industry_benchmarks()
        self.risk_factors = self._load_risk_factors()

    def _load_compliance_rules(self) -> Dict[str, List[str]]:
        """Load regulatory compliance rules by document type."""
        return {
            'business_case': [
                'Include forward-looking statement disclaimers',
                'Verify market size claims with sources',
                'Ensure financial projections include assumptions',
                'Include risk disclosures for material factors'
            ],
            'investor_memo': [
                'Include securities law disclaimers',
                'Verify accredited investor requirements',
                'Ensure material risk disclosures',
                'Include subscription agreement references'
            ],
            'term_sheet': [
                'Verify liquidation preference terms',
                'Include anti-dilution provisions',
                'Specify board composition clearly',
                'Include standard protective provisions'
            ],
            'executive_summary': [
                'Include company formation jurisdiction',
                'Verify intellectual property claims',
                'Include material contract disclosures',
                'Ensure competitive landscape accuracy'
            ]
        }

    def _load_industry_benchmarks(self) -> Dict[str, Dict[str, Any]]:
        """Load industry benchmark data for auto-completion."""
        return {
            'saas': {
                'gross_margin': {'min': 65, 'median': 75, 'max': 85},
                'churn_rate': {'min': 3, 'median': 7, 'max': 15},
                'cac_ltv_ratio': {'min': 3, 'median': 5, 'max': 8},
                'growth_rate': {'min': 20, 'median': 50, 'max': 100},
                'burn_multiple': {'min': 1.2, 'median': 2.0, 'max': 3.5}
            },
            'fintech': {
                'gross_margin': {'min': 45, 'median': 60, 'max': 80},
                'customer_acquisition_cost': {'min': 50, 'median': 200, 'max': 500},
                'transaction_volume_growth': {'min': 30, 'median': 80, 'max': 150},
                'regulatory_capital_ratio': {'min': 8, 'median': 12, 'max': 20}
            },
            'ecommerce': {
                'gross_margin': {'min': 20, 'median': 35, 'max': 60},
                'conversion_rate': {'min': 1, 'median': 3, 'max': 8},
                'average_order_value': {'min': 25, 'median': 75, 'max': 200},
                'customer_lifetime_value': {'min': 100, 'median': 300, 'max': 800}
            },
            'biotech': {
                'rd_expense_ratio': {'min': 40, 'median': 70, 'max': 90},
                'clinical_trial_success_rate': {'min': 10, 'median': 25, 'max': 45},
                'time_to_market': {'min': 5, 'median': 8, 'max': 12},
                'patent_portfolio_size': {'min': 5, 'median': 25, 'max': 100}
            }
        }

    def _load_risk_factors(self) -> Dict[str, List[Dict[str, Any]]]:
        """Load common risk factors by company stage."""
        return {
            'early_stage': [
                {'risk': 'Market Risk', 'probability': 0.7, 'impact': 'high',
                 'description': 'Unproven market demand for product/service'},
                {'risk': 'Execution Risk', 'probability': 0.6, 'impact': 'high',
                 'description': 'Team may lack experience in scaling operations'},
                {'risk': 'Funding Risk', 'probability': 0.5, 'impact': 'critical',
                 'description': 'Difficulty raising subsequent funding rounds'},
                {'risk': 'Technology Risk', 'probability': 0.4, 'impact': 'medium',
                 'description': 'Technical challenges in product development'}
            ],
            'growth_stage': [
                {'risk': 'Competition Risk', 'probability': 0.8, 'impact': 'high',
                 'description': 'Increased competition from established players'},
                {'risk': 'Scaling Risk', 'probability': 0.6, 'impact': 'high',
                 'description': 'Challenges in scaling operations efficiently'},
                {'risk': 'Regulatory Risk', 'probability': 0.4, 'impact': 'medium',
                 'description': 'Changing regulatory environment'},
                {'risk': 'Key Person Risk', 'probability': 0.3, 'impact': 'high',
                 'description': 'Dependence on key management personnel'}
            ]
        }

    @staticmethod
    def _parse_json_object(raw: Any) -> Dict[str, Any]:
        """Parse a model reply into a dict, tolerating Markdown code fences.

        Args:
            raw: Text returned by the model.

        Returns:
            The decoded JSON object.

        Raises:
            ValueError: If ``raw`` is not a string, is not valid JSON
                (``json.JSONDecodeError`` is a ``ValueError``), or decodes to
                something other than a JSON object.
        """
        if not isinstance(raw, str):
            raise ValueError('model reply is not text')
        text = raw.strip()
        # Chat models frequently wrap JSON in ```json ... ``` fences.
        fenced = re.match(r'^```(?:json)?\s*(.*?)\s*```$', text,
                          re.DOTALL | re.IGNORECASE)
        if fenced:
            text = fenced.group(1)
        parsed = json.loads(text)
        if not isinstance(parsed, dict):
            raise ValueError('model reply is not a JSON object')
        return parsed

    async def analyze_document_content(self, content: str, document_type: str,
                                       industry: Optional[str] = None) -> Dict[str, Any]:
        """Analyze document content and provide intelligent suggestions.

        Args:
            content: Full document text.
            document_type: Key such as ``'business_case'`` or ``'term_sheet'``.
            industry: Optional industry key used for benchmark suggestions.

        Returns:
            A dict with ``content_analysis``, ``compliance_check``,
            ``risk_assessment``, ``completion_suggestions``,
            ``improvement_suggestions``, ``readability_score`` and
            ``timestamp``; or ``{'error': <message>}`` if analysis fails.
        """
        try:
            # The three coroutines are independent of each other, so run them
            # concurrently instead of paying the API latency twice in series.
            # NOTE(review): assumes the OpenAI client may be used from two
            # worker threads at once - confirm for the SDK version in use.
            content_analysis, completion, improvements = await asyncio.gather(
                self._analyze_content_quality(content, document_type),
                self._generate_completion_suggestions(content, document_type, industry),
                self._generate_improvement_suggestions(content, document_type),
            )

            return {
                'content_analysis': content_analysis,
                'compliance_check': self._check_compliance(content, document_type),
                'risk_assessment': self._assess_risks(content, industry),
                'completion_suggestions': completion,
                'improvement_suggestions': improvements,
                'readability_score': self._calculate_readability_score(content),
                'timestamp': datetime.now().isoformat()
            }

        except Exception as e:
            logging.error(f"Document analysis error: {e}")
            return {'error': str(e)}

    async def _analyze_content_quality(self, content: str, document_type: str) -> Dict[str, Any]:
        """Analyze content quality using AI.

        Only the first 3000 characters of ``content`` are sent to the model.
        Falls back to :meth:`_basic_content_analysis` when the API call fails
        or the reply is not a JSON object.
        """
        try:
            prompt = f"""
            Analyze this {document_type} document content for quality, completeness, and professionalism.

            Content: {content[:3000]}...

            Provide analysis in this JSON format:
            {{
                "completeness_score": 0.85,
                "professionalism_score": 0.92,
                "clarity_score": 0.78,
                "missing_sections": ["Financial Projections", "Risk Analysis"],
                "strengths": ["Clear problem statement", "Strong market analysis"],
                "weaknesses": ["Vague revenue model", "Limited competitive analysis"],
                "overall_score": 0.85
            }}
            """

            # The client call is blocking; run it in a worker thread so the
            # event loop stays responsive.
            response = await asyncio.to_thread(
                self.openai_client.chat.completions.create,
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3
            )

            try:
                return self._parse_json_object(response.choices[0].message.content)
            except ValueError:
                # Fallback to basic analysis
                return self._basic_content_analysis(content, document_type)

        except Exception as e:
            logging.error(f"AI content analysis error: {e}")
            return self._basic_content_analysis(content, document_type)

    def _basic_content_analysis(self, content: str, document_type: str) -> Dict[str, Any]:
        """Basic content analysis without AI.

        Scores completeness by word count (2000 words counts as complete) and
        clarity by the share of expected section keywords present.
        """
        word_count = len(content.split())
        completeness_score = min(1.0, word_count / 2000)  # Assume 2000 words is complete

        key_sections = {
            'business_case': ['executive summary', 'problem', 'solution', 'market', 'financial'],
            'investor_memo': ['investment', 'team', 'market', 'traction', 'financial'],
            'term_sheet': ['valuation', 'investment', 'liquidation', 'board', 'rights']
        }

        sections = key_sections.get(document_type, [])
        content_lower = content.lower()
        missing_sections = [s for s in sections if s not in content_lower]
        # Unknown document types have no expected sections: use a neutral 0.5.
        section_score = (
            (len(sections) - len(missing_sections)) / len(sections) if sections else 0.5
        )

        return {
            'completeness_score': completeness_score,
            'professionalism_score': 0.7,  # Default
            'clarity_score': section_score,
            'missing_sections': missing_sections,
            'strengths': ['Document structure present'],
            'weaknesses': ['Needs AI analysis for detailed feedback'],
            'overall_score': (completeness_score + section_score) / 2
        }

    def _check_compliance(self, content: str, document_type: str) -> Dict[str, Any]:
        """Check document compliance with regulatory requirements.

        Each rule is mapped to a keyword check via ``_COMPLIANCE_CHECKS``.
        Rules with no automated check still count as compliant (so the score
        is unchanged for existing callers) but are listed under ``warnings``
        so the caller can see they were not actually verified.

        Returns:
            Dict with ``total_rules``, ``compliant_count``, ``violations``,
            ``warnings`` and ``compliance_score`` (1.0 when there are no rules).
        """
        rules = self.compliance_rules.get(document_type, [])
        compliance_results = {
            'total_rules': len(rules),
            'compliant_count': 0,
            'violations': [],
            'warnings': [],
            'compliance_score': 0.0
        }

        content_lower = content.lower()

        for rule in rules:
            rule_lower = rule.lower()
            evidence_terms = next(
                (terms for trigger, terms in self._COMPLIANCE_CHECKS
                 if trigger in rule_lower),
                None
            )

            if evidence_terms is None:
                compliance_results['compliant_count'] += 1
                compliance_results['warnings'].append(f"Manual review required: {rule}")
            elif any(term in content_lower for term in evidence_terms):
                compliance_results['compliant_count'] += 1
            else:
                compliance_results['violations'].append(rule)

        compliance_results['compliance_score'] = (
            compliance_results['compliant_count'] / compliance_results['total_rules']
            if compliance_results['total_rules'] > 0 else 1.0
        )

        return compliance_results

    def _assess_risks(self, content: str, industry: Optional[str] = None) -> Dict[str, Any]:
        """Assess risks mentioned in document and suggest additional ones.

        Args:
            content: Document text.
            industry: Accepted for interface compatibility; not used by the
                current heuristics.

        Returns:
            Dict with ``mentioned_risks``, ``suggested_additional_risks`` (at
            most three), ``risk_coverage_score``, ``risk_level`` and
            ``recommendations``.
        """
        content_lower = content.lower()

        risk_keywords = {
            'market risk': ['market', 'demand', 'customer', 'competition'],
            'technology risk': ['technology', 'technical', 'development', 'infrastructure'],
            'financial risk': ['financial', 'funding', 'cash', 'revenue'],
            'regulatory risk': ['regulatory', 'compliance', 'legal', 'policy'],
            'execution risk': ['execution', 'operational', 'management', 'team'],
            'competitive risk': ['competitive', 'competition', 'competitor', 'market share']
        }

        mentioned_risks = [
            risk_type for risk_type, keywords in risk_keywords.items()
            if any(keyword in content_lower for keyword in keywords)
        ]

        # Suggest additional risks based on the stage the text implies.
        is_early = self._EARLY_STAGE_PATTERN.search(content_lower) is not None
        stage = 'early_stage' if is_early else 'growth_stage'
        suggested_risks = self.risk_factors.get(stage, [])

        risk_coverage = len(mentioned_risks) / len(risk_keywords)

        return {
            'mentioned_risks': mentioned_risks,
            'suggested_additional_risks': suggested_risks[:3],  # Top 3 suggestions
            'risk_coverage_score': risk_coverage,
            # Low coverage of risk topics means the document itself is risky.
            'risk_level': 'high' if risk_coverage < 0.4 else 'medium' if risk_coverage < 0.7 else 'low',
            'recommendations': self._generate_risk_recommendations(mentioned_risks, suggested_risks)
        }

    def _generate_risk_recommendations(self, mentioned_risks: List[str],
                                       suggested_risks: List[Dict]) -> List[str]:
        """Generate risk-related recommendations.

        Args:
            mentioned_risks: Risk categories detected in the document.
            suggested_risks: Candidate risk records with ``risk`` and
                ``probability`` keys.
        """
        recommendations = []

        if len(mentioned_risks) < 3:
            recommendations.append("Consider adding more comprehensive risk analysis")

        if 'financial risk' not in mentioned_risks:
            recommendations.append("Include financial and funding risks in your analysis")

        if 'regulatory risk' not in mentioned_risks:
            recommendations.append("Assess potential regulatory and compliance risks")

        # Highlight the first suggested risk whose probability exceeds 0.6.
        high_prob_risks = [r for r in suggested_risks if r['probability'] > 0.6]
        if high_prob_risks:
            recommendations.append(f"Pay special attention to {high_prob_risks[0]['risk'].lower()}")

        return recommendations

    async def _generate_completion_suggestions(self, content: str, document_type: str,
                                               industry: Optional[str] = None) -> Dict[str, Any]:
        """Generate smart completion suggestions based on industry benchmarks.

        Returns:
            Dict with list values under ``financial_metrics``,
            ``market_sizing``, ``competitive_analysis`` and
            ``growth_projections``.
        """
        suggestions = {
            'financial_metrics': [],
            'market_sizing': [],
            'competitive_analysis': [],
            'growth_projections': []
        }
        content_lower = content.lower()

        if industry and industry.lower() in self.industry_benchmarks:
            benchmarks = self.industry_benchmarks[industry.lower()]

            # Offer a gross-margin range only if the document lacks one.
            margin_data = benchmarks.get('gross_margin', {})
            if margin_data and 'gross margin' not in content_lower:
                suggestions['financial_metrics'].append({
                    'metric': 'Gross Margin',
                    'suggested_range': f"{margin_data['min']}-{margin_data['max']}%",
                    'industry_median': f"{margin_data['median']}%",
                    'context': f"Typical for {industry} companies"
                })

            # Offer growth benchmarks when the document discusses growth.
            growth_data = benchmarks.get('growth_rate', {})
            if growth_data and 'growth' in content_lower:
                suggestions['growth_projections'].append({
                    'metric': 'Annual Growth Rate',
                    'suggested_range': f"{growth_data['min']}-{growth_data['max']}%",
                    'industry_median': f"{growth_data['median']}%",
                    'context': f"Based on {industry} industry benchmarks"
                })

        # Suggest TAM/SAM/SOM when the market is discussed without "TAM".
        if 'market' in content_lower and not self._TAM_PATTERN.search(content_lower):
            suggestions['market_sizing'].append({
                'suggestion': 'Include TAM/SAM/SOM analysis',
                'template': 'Total Addressable Market (TAM): $X billion\nServiceable Addressable Market (SAM): $Y billion\nServiceable Obtainable Market (SOM): $Z million',
                'priority': 'high'
            })

        return suggestions

    async def _generate_improvement_suggestions(self, content: str,
                                                document_type: str) -> List[Dict[str, Any]]:
        """Generate AI-powered improvement suggestions.

        Only the first 2000 characters of ``content`` are sent to the model.
        Falls back to :meth:`_basic_improvement_suggestions` when the API call
        fails or the reply is not a JSON object.
        """
        try:
            prompt = f"""
            Review this {document_type} content and suggest 3-5 specific improvements.
            Focus on structure, clarity, persuasiveness, and completeness.

            Content: {content[:2000]}...

            Provide suggestions in this JSON format:
            {{
                "suggestions": [
                    {{
                        "category": "Structure",
                        "suggestion": "Add executive summary at the beginning",
                        "priority": "high",
                        "rationale": "Investors typically read executive summary first"
                    }}
                ]
            }}
            """

            response = await asyncio.to_thread(
                self.openai_client.chat.completions.create,
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.3
            )

            try:
                result = self._parse_json_object(response.choices[0].message.content)
                return result.get('suggestions', [])
            except ValueError:
                return self._basic_improvement_suggestions(content, document_type)

        except Exception as e:
            logging.error(f"AI improvement suggestions error: {e}")
            return self._basic_improvement_suggestions(content, document_type)

    def _basic_improvement_suggestions(self, content: str, document_type: str) -> List[Dict[str, Any]]:
        """Generate basic improvement suggestions without AI."""
        suggestions = []

        word_count = len(content.split())

        if word_count < 500:
            suggestions.append({
                'category': 'Content',
                'suggestion': 'Expand content with more detailed analysis',
                'priority': 'high',
                'rationale': 'Document appears too brief for comprehensive evaluation'
            })

        if 'financial' not in content.lower() and document_type != 'term_sheet':
            suggestions.append({
                'category': 'Financial Analysis',
                'suggestion': 'Include financial projections and metrics',
                'priority': 'high',
                'rationale': 'Financial data is critical for investor evaluation'
            })

        return suggestions

    def _calculate_readability_score(self, content: str) -> Dict[str, Any]:
        """Calculate readability metrics for the document.

        The score is a coarse three-band rating driven only by the average
        number of words per sentence.

        Returns:
            ``{'score': 0, 'level': 'unclear'}`` when the text has no words or
            no sentence terminator; otherwise a dict with ``score``,
            ``level``, ``avg_words_per_sentence``, ``total_words`` and
            ``total_sentences``.
        """
        words = content.split()
        sentences = len(self._SENTENCE_END_PATTERN.findall(content))

        if not words or not sentences:
            return {'score': 0, 'level': 'unclear'}

        avg_words_per_sentence = len(words) / sentences

        if avg_words_per_sentence < 15:
            score, level = 85, 'easy'
        elif avg_words_per_sentence < 20:
            score, level = 70, 'moderate'
        else:
            score, level = 50, 'difficult'

        return {
            'score': score,
            'level': level,
            'avg_words_per_sentence': avg_words_per_sentence,
            'total_words': len(words),
            'total_sentences': sentences
        }

    def track_document_version(self, document_id: str, content: str,
                               author: str = 'user') -> Dict[str, Any]:
        """Track document versions and changes.

        Args:
            document_id: Identifier of the document being edited.
            content: Full current text of the document.
            author: Label recorded on the version.

        Returns:
            ``{'message': 'No changes detected'}`` when ``content`` hashes the
            same as the latest version; otherwise a dict with
            ``version_created``, ``timestamp`` and ``changes_detected`` (True
            when more than 10% of the text changed against the previous
            version).
        """
        # MD5 is used only as a cheap change fingerprint, not for security.
        content_hash = hashlib.md5(content.encode()).hexdigest()
        timestamp = datetime.now().isoformat()

        history = self.document_versions.setdefault(document_id, [])

        if history and history[-1]['content_hash'] == content_hash:
            return {'message': 'No changes detected'}

        version_number = len(history) + 1

        version_info = {
            'version': version_number,
            'content_hash': content_hash,
            'author': author,
            'timestamp': timestamp,
            'content_length': len(content),
            'word_count': len(content.split())
        }

        # Diff against the FULL previous text. Comparing the stored snippet
        # with the full new text would report a large change for any document
        # longer than the snippet, even if only one character was edited.
        previous_content = self._latest_content.get(document_id)
        if previous_content is not None:
            diff_ratio = SequenceMatcher(None, previous_content, content).ratio()
            version_info['change_ratio'] = 1 - diff_ratio
            version_info['changes'] = self._calculate_changes(previous_content, content)

        # The history record keeps only a snippet; the full text of the latest
        # version is held separately so memory stays at one copy per document.
        version_info['content'] = content[:self._SNIPPET_LENGTH]
        history.append(version_info)
        self._latest_content[document_id] = content

        return {
            'version_created': version_number,
            'timestamp': timestamp,
            'changes_detected': version_info.get('change_ratio', 0) > 0.1
        }

    def _calculate_changes(self, old_content: str, new_content: str) -> Dict[str, Any]:
        """Calculate specific changes between document versions.

        Works on the sets of distinct whitespace-separated words; the sample
        lists are sorted so the result is deterministic.
        """
        old_words = set(old_content.split())
        new_words = set(new_content.split())

        added_words = new_words - old_words
        removed_words = old_words - new_words

        return {
            'words_added': len(added_words),
            'words_removed': len(removed_words),
            'new_words': sorted(added_words)[:10],  # First 10 new words
            'removed_words': sorted(removed_words)[:10]  # First 10 removed words
        }

    def get_version_history(self, document_id: str) -> List[Dict[str, Any]]:
        """Get version history for a document (empty list if unknown)."""
        return self.document_versions.get(document_id, [])

    async def suggest_next_content(self, current_content: str, cursor_position: int,
                                   document_type: str,
                                   industry: Optional[str] = None) -> List[str]:
        """Suggest next content based on current context.

        Sends up to 200 characters before and 50 characters after the cursor
        to the model and parses a numbered list from the reply.

        Args:
            current_content: Full document text.
            cursor_position: Character offset of the cursor; clamped to the
                bounds of ``current_content``.
            document_type: Document type inserted into the prompt.
            industry: Accepted for interface compatibility; not used.

        Returns:
            Up to three suggestions. Heuristic suggestions are returned when
            the API call fails or its reply has no numbered lines.
        """
        try:
            cursor = min(max(cursor_position, 0), len(current_content))
            context_start = max(0, cursor - 200)
            context_end = min(len(current_content), cursor + 50)
            context = current_content[context_start:context_end]

            prompt = f"""
            Given this document context for a {document_type}, suggest 3 possible next sentences or phrases:

            Context: ...{context}...

            Provide 3 suggestions that would logically continue this content:
            1. [suggestion 1]
            2. [suggestion 2]
            3. [suggestion 3]
            """

            response = await asyncio.to_thread(
                self.openai_client.chat.completions.create,
                model="gpt-4",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.7,
                max_tokens=200
            )

            reply = response.choices[0].message.content or ''
            suggestions = []

            # Parse numbered suggestions ("1. text").
            for line in reply.splitlines():
                stripped = line.strip()
                if re.match(r'^\d+\.', stripped):
                    suggestion = re.sub(r'^\d+\.\s*', '', stripped)
                    if suggestion:
                        suggestions.append(suggestion)

            if suggestions:
                return suggestions[:3]
            # A reply with no parseable lines falls through to the heuristic
            # suggestions rather than returning an empty list.

        except Exception as e:
            logging.error(f"Content suggestion error: {e}")

        return self._basic_content_suggestions(current_content, document_type)

    def _basic_content_suggestions(self, current_content: str, document_type: str) -> List[str]:
        """Generate basic content suggestions without AI."""
        content_lower = current_content.lower()

        if 'market' in content_lower:
            return [
                "Our target market consists of...",
                "Market research indicates that...",
                "The competitive landscape shows...",
            ]
        if 'financial' in content_lower:
            return [
                "Revenue projections for the next 3 years...",
                "Our unit economics demonstrate...",
                "Key financial metrics include...",
            ]
        return [
            "Additionally, it's important to note that...",
            "This approach provides several benefits...",
            "The strategic implications include...",
        ]
566
+
567
+
568
# Public API of this module: only the engine class is exported.
__all__ = ["DocumentIntelligenceEngine"]