Spaces:

Navada25
/

Start-Up_Viability_Agent

Build error

App Files Files Community

Navada25 committed on Oct 20, 2025

Commit

0a0ec2a

verified ·

1 Parent(s): 149e33a

Update document_intelligence.py with stock analysis features

Browse files

Files changed (1) hide show

document_intelligence.py +569 -0

document_intelligence.py ADDED Viewed

	@@ -0,0 +1,569 @@

+# AI-Powered Document Intelligence System for NAVADA
+"""
+Advanced document intelligence system providing:
+- Smart content suggestions while editing documents
+- Auto-completion of financial projections based on industry data
+- Compliance checking for regulatory requirements
+- Risk assessment with real-time scoring
+- Version control with diff tracking
+"""
+import json
+import re
+from datetime import datetime
+from typing import Dict, List, Optional, Any, Tuple
+import pandas as pd
+import numpy as np
+from openai import OpenAI
+import asyncio
+import logging
+from difflib import SequenceMatcher
+import hashlib
+class DocumentIntelligenceEngine:
+    """AI-powered document intelligence and assistance system."""
+    def __init__(self, openai_client: OpenAI):
+        self.openai_client = openai_client
+        self.document_versions = {}
+        self.compliance_rules = self._load_compliance_rules()
+        self.industry_benchmarks = self._load_industry_benchmarks()
+        self.risk_factors = self._load_risk_factors()
+    def _load_compliance_rules(self) -> Dict[str, List[str]]:
+        """Load regulatory compliance rules by document type."""
+        return {
+            'business_case': [
+                'Include forward-looking statement disclaimers',
+                'Verify market size claims with sources',
+                'Ensure financial projections include assumptions',
+                'Include risk disclosures for material factors'
+            ],
+            'investor_memo': [
+                'Include securities law disclaimers',
+                'Verify accredited investor requirements',
+                'Ensure material risk disclosures',
+                'Include subscription agreement references'
+            ],
+            'term_sheet': [
+                'Verify liquidation preference terms',
+                'Include anti-dilution provisions',
+                'Specify board composition clearly',
+                'Include standard protective provisions'
+            ],
+            'executive_summary': [
+                'Include company formation jurisdiction',
+                'Verify intellectual property claims',
+                'Include material contract disclosures',
+                'Ensure competitive landscape accuracy'
+            ]
+        }
+    def _load_industry_benchmarks(self) -> Dict[str, Dict[str, Any]]:
+        """Load industry benchmark data for auto-completion."""
+        return {
+            'saas': {
+                'gross_margin': {'min': 65, 'median': 75, 'max': 85},
+                'churn_rate': {'min': 3, 'median': 7, 'max': 15},
+                'cac_ltv_ratio': {'min': 3, 'median': 5, 'max': 8},
+                'growth_rate': {'min': 20, 'median': 50, 'max': 100},
+                'burn_multiple': {'min': 1.2, 'median': 2.0, 'max': 3.5}
+            },
+            'fintech': {
+                'gross_margin': {'min': 45, 'median': 60, 'max': 80},
+                'customer_acquisition_cost': {'min': 50, 'median': 200, 'max': 500},
+                'transaction_volume_growth': {'min': 30, 'median': 80, 'max': 150},
+                'regulatory_capital_ratio': {'min': 8, 'median': 12, 'max': 20}
+            },
+            'ecommerce': {
+                'gross_margin': {'min': 20, 'median': 35, 'max': 60},
+                'conversion_rate': {'min': 1, 'median': 3, 'max': 8},
+                'average_order_value': {'min': 25, 'median': 75, 'max': 200},
+                'customer_lifetime_value': {'min': 100, 'median': 300, 'max': 800}
+            },
+            'biotech': {
+                'rd_expense_ratio': {'min': 40, 'median': 70, 'max': 90},
+                'clinical_trial_success_rate': {'min': 10, 'median': 25, 'max': 45},
+                'time_to_market': {'min': 5, 'median': 8, 'max': 12},
+                'patent_portfolio_size': {'min': 5, 'median': 25, 'max': 100}
+            }
+        }
+    def _load_risk_factors(self) -> Dict[str, List[Dict[str, Any]]]:
+        """Load common risk factors by industry/stage."""
+        return {
+            'early_stage': [
+                {'risk': 'Market Risk', 'probability': 0.7, 'impact': 'high',
+                 'description': 'Unproven market demand for product/service'},
+                {'risk': 'Execution Risk', 'probability': 0.6, 'impact': 'high',
+                 'description': 'Team may lack experience in scaling operations'},
+                {'risk': 'Funding Risk', 'probability': 0.5, 'impact': 'critical',
+                 'description': 'Difficulty raising subsequent funding rounds'},
+                {'risk': 'Technology Risk', 'probability': 0.4, 'impact': 'medium',
+                 'description': 'Technical challenges in product development'}
+            ],
+            'growth_stage': [
+                {'risk': 'Competition Risk', 'probability': 0.8, 'impact': 'high',
+                 'description': 'Increased competition from established players'},
+                {'risk': 'Scaling Risk', 'probability': 0.6, 'impact': 'high',
+                 'description': 'Challenges in scaling operations efficiently'},
+                {'risk': 'Regulatory Risk', 'probability': 0.4, 'impact': 'medium',
+                 'description': 'Changing regulatory environment'},
+                {'risk': 'Key Person Risk', 'probability': 0.3, 'impact': 'high',
+                 'description': 'Dependence on key management personnel'}
+            ]
+        }
+    async def analyze_document_content(self, content: str, document_type: str,
+                                     industry: str = None) -> Dict[str, Any]:
+        """Analyze document content and provide intelligent suggestions."""
+        try:
+            analysis_results = {
+                'content_analysis': await self._analyze_content_quality(content, document_type),
+                'compliance_check': self._check_compliance(content, document_type),
+                'risk_assessment': self._assess_risks(content, industry),
+                'completion_suggestions': await self._generate_completion_suggestions(content, document_type, industry),
+                'improvement_suggestions': await self._generate_improvement_suggestions(content, document_type),
+                'readability_score': self._calculate_readability_score(content),
+                'timestamp': datetime.now().isoformat()
+            }
+            return analysis_results
+        except Exception as e:
+            logging.error(f"Document analysis error: {e}")
+            return {'error': str(e)}
+    async def _analyze_content_quality(self, content: str, document_type: str) -> Dict[str, Any]:
+        """Analyze content quality using AI."""
+        try:
+            prompt = f"""
+            Analyze this {document_type} document content for quality, completeness, and professionalism.
+            Content: {content[:3000]}...
+            Provide analysis in this JSON format:
+            {{
+                "completeness_score": 0.85,
+                "professionalism_score": 0.92,
+                "clarity_score": 0.78,
+                "missing_sections": ["Financial Projections", "Risk Analysis"],
+                "strengths": ["Clear problem statement", "Strong market analysis"],
+                "weaknesses": ["Vague revenue model", "Limited competitive analysis"],
+                "overall_score": 0.85
+            }}
+            """
+            response = await asyncio.to_thread(
+                self.openai_client.chat.completions.create,
+                model="gpt-4",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3
+            )
+            try:
+                analysis = json.loads(response.choices[0].message.content)
+                return analysis
+            except json.JSONDecodeError:
+                # Fallback to basic analysis
+                return self._basic_content_analysis(content, document_type)
+        except Exception as e:
+            logging.error(f"AI content analysis error: {e}")
+            return self._basic_content_analysis(content, document_type)
+    def _basic_content_analysis(self, content: str, document_type: str) -> Dict[str, Any]:
+        """Basic content analysis without AI."""
+        word_count = len(content.split())
+        # Basic scoring based on content length and structure
+        completeness_score = min(1.0, word_count / 2000)  # Assume 2000 words is complete
+        # Check for key sections
+        key_sections = {
+            'business_case': ['executive summary', 'problem', 'solution', 'market', 'financial'],
+            'investor_memo': ['investment', 'team', 'market', 'traction', 'financial'],
+            'term_sheet': ['valuation', 'investment', 'liquidation', 'board', 'rights']
+        }
+        sections = key_sections.get(document_type, [])
+        found_sections = sum(1 for section in sections if section in content.lower())
+        section_score = found_sections / len(sections) if sections else 0.5
+        return {
+            'completeness_score': completeness_score,
+            'professionalism_score': 0.7,  # Default
+            'clarity_score': section_score,
+            'missing_sections': [s for s in sections if s not in content.lower()],
+            'strengths': ['Document structure present'],
+            'weaknesses': ['Needs AI analysis for detailed feedback'],
+            'overall_score': (completeness_score + section_score) / 2
+        }
+    def _check_compliance(self, content: str, document_type: str) -> Dict[str, Any]:
+        """Check document compliance with regulatory requirements."""
+        rules = self.compliance_rules.get(document_type, [])
+        compliance_results = {
+            'total_rules': len(rules),
+            'compliant_count': 0,
+            'violations': [],
+            'warnings': [],
+            'compliance_score': 0.0
+        }
+        content_lower = content.lower()
+        # Check each compliance rule
+        for rule in rules:
+            is_compliant = False
+            if 'disclaimer' in rule.lower():
+                is_compliant = any(term in content_lower for term in
+                                 ['disclaimer', 'forward-looking', 'risk', 'projection'])
+            elif 'source' in rule.lower():
+                is_compliant = any(term in content_lower for term in
+                                 ['source', 'reference', 'data from', 'according to'])
+            elif 'assumption' in rule.lower():
+                is_compliant = any(term in content_lower for term in
+                                 ['assumption', 'estimate', 'projection', 'forecast'])
+            elif 'risk' in rule.lower():
+                is_compliant = any(term in content_lower for term in
+                                 ['risk', 'uncertainty', 'challenge', 'limitation'])
+            else:
+                # Default check for key terms
+                is_compliant = True
+            if is_compliant:
+                compliance_results['compliant_count'] += 1
+            else:
+                compliance_results['violations'].append(rule)
+        compliance_results['compliance_score'] = (
+            compliance_results['compliant_count'] / compliance_results['total_rules']
+            if compliance_results['total_rules'] > 0 else 1.0
+        )
+        return compliance_results
+    def _assess_risks(self, content: str, industry: str = None) -> Dict[str, Any]:
+        """Assess risks mentioned in document and suggest additional ones."""
+        content_lower = content.lower()
+        # Detect mentioned risks
+        mentioned_risks = []
+        risk_keywords = {
+            'market risk': ['market', 'demand', 'customer', 'competition'],
+            'technology risk': ['technology', 'technical', 'development', 'infrastructure'],
+            'financial risk': ['financial', 'funding', 'cash', 'revenue'],
+            'regulatory risk': ['regulatory', 'compliance', 'legal', 'policy'],
+            'execution risk': ['execution', 'operational', 'management', 'team'],
+            'competitive risk': ['competitive', 'competition', 'competitor', 'market share']
+        }
+        for risk_type, keywords in risk_keywords.items():
+            if any(keyword in content_lower for keyword in keywords):
+                mentioned_risks.append(risk_type)
+        # Suggest additional risks based on stage/industry
+        stage = 'early_stage' if 'startup' in content_lower or 'early' in content_lower else 'growth_stage'
+        suggested_risks = self.risk_factors.get(stage, [])
+        # Calculate overall risk score
+        total_possible_risks = len(risk_keywords)
+        risk_coverage = len(mentioned_risks) / total_possible_risks
+        return {
+            'mentioned_risks': mentioned_risks,
+            'suggested_additional_risks': suggested_risks[:3],  # Top 3 suggestions
+            'risk_coverage_score': risk_coverage,
+            'risk_level': 'high' if risk_coverage < 0.4 else 'medium' if risk_coverage < 0.7 else 'low',
+            'recommendations': self._generate_risk_recommendations(mentioned_risks, suggested_risks)
+        }
+    def _generate_risk_recommendations(self, mentioned_risks: List[str],
+                                     suggested_risks: List[Dict]) -> List[str]:
+        """Generate risk-related recommendations."""
+        recommendations = []
+        if len(mentioned_risks) < 3:
+            recommendations.append("Consider adding more comprehensive risk analysis")
+        if 'financial risk' not in mentioned_risks:
+            recommendations.append("Include financial and funding risks in your analysis")
+        if 'regulatory risk' not in mentioned_risks:
+            recommendations.append("Assess potential regulatory and compliance risks")
+        # Add suggestions based on highest probability risks
+        high_prob_risks = [r for r in suggested_risks if r['probability'] > 0.6]
+        if high_prob_risks:
+            recommendations.append(f"Pay special attention to {high_prob_risks[0]['risk'].lower()}")
+        return recommendations
+    async def _generate_completion_suggestions(self, content: str, document_type: str,
+                                             industry: str = None) -> Dict[str, Any]:
+        """Generate smart completion suggestions based on industry benchmarks."""
+        suggestions = {
+            'financial_metrics': [],
+            'market_sizing': [],
+            'competitive_analysis': [],
+            'growth_projections': []
+        }
+        # Get industry benchmarks if available
+        if industry and industry.lower() in self.industry_benchmarks:
+            benchmarks = self.industry_benchmarks[industry.lower()]
+            # Generate financial metric suggestions
+            if 'gross margin' not in content.lower():
+                margin_data = benchmarks.get('gross_margin', {})
+                if margin_data:
+                    suggestions['financial_metrics'].append({
+                        'metric': 'Gross Margin',
+                        'suggested_range': f"{margin_data['min']}-{margin_data['max']}%",
+                        'industry_median': f"{margin_data['median']}%",
+                        'context': f"Typical for {industry} companies"
+                    })
+            # Generate growth projection suggestions
+            growth_data = benchmarks.get('growth_rate', {})
+            if growth_data and 'growth' in content.lower():
+                suggestions['growth_projections'].append({
+                    'metric': 'Annual Growth Rate',
+                    'suggested_range': f"{growth_data['min']}-{growth_data['max']}%",
+                    'industry_median': f"{growth_data['median']}%",
+                    'context': f"Based on {industry} industry benchmarks"
+                })
+        # Add market sizing suggestions
+        if 'market' in content.lower() and 'tam' not in content.lower():
+            suggestions['market_sizing'].append({
+                'suggestion': 'Include TAM/SAM/SOM analysis',
+                'template': 'Total Addressable Market (TAM): $X billion\nServiceable Addressable Market (SAM): $Y billion\nServiceable Obtainable Market (SOM): $Z million',
+                'priority': 'high'
+            })
+        return suggestions
+    async def _generate_improvement_suggestions(self, content: str,
+                                              document_type: str) -> List[Dict[str, Any]]:
+        """Generate AI-powered improvement suggestions."""
+        try:
+            prompt = f"""
+            Review this {document_type} content and suggest 3-5 specific improvements.
+            Focus on structure, clarity, persuasiveness, and completeness.
+            Content: {content[:2000]}...
+            Provide suggestions in this JSON format:
+            {{
+                "suggestions": [
+                    {{
+                        "category": "Structure",
+                        "suggestion": "Add executive summary at the beginning",
+                        "priority": "high",
+                        "rationale": "Investors typically read executive summary first"
+                    }}
+                ]
+            }}
+            """
+            response = await asyncio.to_thread(
+                self.openai_client.chat.completions.create,
+                model="gpt-4",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3
+            )
+            try:
+                result = json.loads(response.choices[0].message.content)
+                return result.get('suggestions', [])
+            except json.JSONDecodeError:
+                return self._basic_improvement_suggestions(content, document_type)
+        except Exception as e:
+            logging.error(f"AI improvement suggestions error: {e}")
+            return self._basic_improvement_suggestions(content, document_type)
+    def _basic_improvement_suggestions(self, content: str, document_type: str) -> List[Dict[str, Any]]:
+        """Generate basic improvement suggestions without AI."""
+        suggestions = []
+        word_count = len(content.split())
+        if word_count < 500:
+            suggestions.append({
+                'category': 'Content',
+                'suggestion': 'Expand content with more detailed analysis',
+                'priority': 'high',
+                'rationale': 'Document appears too brief for comprehensive evaluation'
+            })
+        if 'financial' not in content.lower() and document_type != 'term_sheet':
+            suggestions.append({
+                'category': 'Financial Analysis',
+                'suggestion': 'Include financial projections and metrics',
+                'priority': 'high',
+                'rationale': 'Financial data is critical for investor evaluation'
+            })
+        return suggestions
+    def _calculate_readability_score(self, content: str) -> Dict[str, Any]:
+        """Calculate readability metrics for the document."""
+        words = content.split()
+        sentences = content.count('.') + content.count('!') + content.count('?')
+        if not words or not sentences:
+            return {'score': 0, 'level': 'unclear'}
+        avg_words_per_sentence = len(words) / sentences
+        # Simple readability score (simplified Flesch formula)
+        if avg_words_per_sentence < 15:
+            score = 85
+            level = 'easy'
+        elif avg_words_per_sentence < 20:
+            score = 70
+            level = 'moderate'
+        else:
+            score = 50
+            level = 'difficult'
+        return {
+            'score': score,
+            'level': level,
+            'avg_words_per_sentence': avg_words_per_sentence,
+            'total_words': len(words),
+            'total_sentences': sentences
+        }
+    def track_document_version(self, document_id: str, content: str,
+                              author: str = 'user') -> Dict[str, Any]:
+        """Track document versions and changes."""
+        content_hash = hashlib.md5(content.encode()).hexdigest()
+        timestamp = datetime.now().isoformat()
+        if document_id not in self.document_versions:
+            self.document_versions[document_id] = []
+        # Check if this is actually a new version
+        if (self.document_versions[document_id] and
+            self.document_versions[document_id][-1]['content_hash'] == content_hash):
+            return {'message': 'No changes detected'}
+        version_number = len(self.document_versions[document_id]) + 1
+        version_info = {
+            'version': version_number,
+            'content_hash': content_hash,
+            'author': author,
+            'timestamp': timestamp,
+            'content_length': len(content),
+            'word_count': len(content.split())
+        }
+        # Calculate diff if there's a previous version
+        if self.document_versions[document_id]:
+            prev_version = self.document_versions[document_id][-1]
+            if 'content' in prev_version:  # If we stored content
+                diff_ratio = SequenceMatcher(None, prev_version['content'], content).ratio()
+                version_info['change_ratio'] = 1 - diff_ratio
+                version_info['changes'] = self._calculate_changes(prev_version['content'], content)
+        # Store version (optionally store full content for diff)
+        version_info['content'] = content[:1000]  # Store snippet for diff
+        self.document_versions[document_id].append(version_info)
+        return {
+            'version_created': version_number,
+            'timestamp': timestamp,
+            'changes_detected': version_info.get('change_ratio', 0) > 0.1
+        }
+    def _calculate_changes(self, old_content: str, new_content: str) -> Dict[str, Any]:
+        """Calculate specific changes between document versions."""
+        old_words = set(old_content.split())
+        new_words = set(new_content.split())
+        added_words = new_words - old_words
+        removed_words = old_words - new_words
+        return {
+            'words_added': len(added_words),
+            'words_removed': len(removed_words),
+            'new_words': list(added_words)[:10],  # First 10 new words
+            'removed_words': list(removed_words)[:10]  # First 10 removed words
+        }
+    def get_version_history(self, document_id: str) -> List[Dict[str, Any]]:
+        """Get version history for a document."""
+        return self.document_versions.get(document_id, [])
+    async def suggest_next_content(self, current_content: str, cursor_position: int,
+                                 document_type: str, industry: str = None) -> List[str]:
+        """Suggest next content based on current context."""
+        try:
+            # Get context around cursor
+            context_start = max(0, cursor_position - 200)
+            context_end = min(len(current_content), cursor_position + 50)
+            context = current_content[context_start:context_end]
+            prompt = f"""
+            Given this document context for a {document_type}, suggest 3 possible next sentences or phrases:
+            Context: ...{context}...
+            Provide 3 suggestions that would logically continue this content:
+            1. [suggestion 1]
+            2. [suggestion 2]
+            3. [suggestion 3]
+            """
+            response = await asyncio.to_thread(
+                self.openai_client.chat.completions.create,
+                model="gpt-4",
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.7,
+                max_tokens=200
+            )
+            content = response.choices[0].message.content
+            suggestions = []
+            # Parse numbered suggestions
+            for line in content.split('\n'):
+                if re.match(r'^\d+\.', line.strip()):
+                    suggestion = re.sub(r'^\d+\.\s*', '', line.strip())
+                    if suggestion:
+                        suggestions.append(suggestion)
+            return suggestions[:3]
+        except Exception as e:
+            logging.error(f"Content suggestion error: {e}")
+            return self._basic_content_suggestions(current_content, document_type)
+    def _basic_content_suggestions(self, current_content: str, document_type: str) -> List[str]:
+        """Generate basic content suggestions without AI."""
+        suggestions = []
+        if 'market' in current_content.lower():
+            suggestions.append("Our target market consists of...")
+            suggestions.append("Market research indicates that...")
+            suggestions.append("The competitive landscape shows...")
+        elif 'financial' in current_content.lower():
+            suggestions.append("Revenue projections for the next 3 years...")
+            suggestions.append("Our unit economics demonstrate...")
+            suggestions.append("Key financial metrics include...")
+        else:
+            suggestions.append("Additionally, it's important to note that...")
+            suggestions.append("This approach provides several benefits...")
+            suggestions.append("The strategic implications include...")
+        return suggestions
+# Public API of this module: a star-import exposes only the engine class.
+__all__ = ['DocumentIntelligenceEngine']