Spaces:

JanviMl
/

RAGFintech

Paused

App Files Files Community

JanviMl committed on Jun 16, 2025

Commit

976cd03

verified ·

1 Parent(s): 68ec064

Create enhanced_rag_system.py

Browse files

Files changed (1) hide show

src/enhanced_rag_system.py +516 -0

src/enhanced_rag_system.py ADDED Viewed

	@@ -0,0 +1,516 @@

+import os
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from typing import List, Tuple, Dict, Optional
+from langchain.schema import Document
+import re
+import json
+# Import our custom modules
+from document_processor import DocumentProcessor
+from auth_system import AuthSystem
+class EnhancedRAGSystem:
+    """Enhanced RAG system with RBAC enforcement, reference attribution, and rich outputs"""
+    def __init__(self):
+        self.document_processor = DocumentProcessor()
+        self.auth_system = AuthSystem()
+        self.documents = []
+        self.initialized = False
+        self.query_feedback = {}
+        # Intent classification keywords
+        self.intent_keywords = {
+            "finance": ["revenue", "profit", "cost", "budget", "financial", "expense", "income", "cash", "margin", "roi", "sales"],
+            "marketing": ["campaign", "customer", "acquisition", "brand", "marketing", "advertising", "engagement", "conversion", "retention"],
+            "hr": ["employee", "hr", "policy", "leave", "benefits", "salary", "attendance", "performance", "training", "recruitment"],
+            "engineering": ["architecture", "technology", "system", "development", "technical", "infrastructure", "deployment", "security", "api"],
+            "general": ["company", "about", "overview", "mission", "values", "policy", "contact", "help"]
+        }
+    def initialize_system(self):
+        """Initialize the enhanced RAG system components"""
+        try:
+            print("Initializing Enhanced RAG system...")
+            # Load all documents with role-based indexing
+            self.documents = self.document_processor.get_all_documents()
+            self._build_role_based_index()
+            self.initialized = True
+            print(f"Enhanced RAG system initialized with {len(self.documents)} document chunks!")
+        except Exception as e:
+            print(f"Error initializing Enhanced RAG system: {str(e)}")
+            self.initialized = False
+    def _build_role_based_index(self):
+        """Build role-based document index for efficient filtering"""
+        self.role_index = {}
+        for role in ["Finance", "Marketing", "HR", "Engineering", "C-Level", "Employee"]:
+            accessible_docs = self.auth_system.get_accessible_documents(role)
+            self.role_index[role] = []
+            for doc in self.documents:
+                content_type = doc.metadata.get('content_type', '')
+                if content_type in accessible_docs or 'all_data' in accessible_docs:
+                    self.role_index[role].append(doc)
+    def _classify_query_intent(self, query: str) -> str:
+        """Classify query intent using keyword matching"""
+        query_lower = query.lower()
+        intent_scores = {}
+        for intent, keywords in self.intent_keywords.items():
+            score = sum(1 for keyword in keywords if keyword in query_lower)
+            if score > 0:
+                intent_scores[intent] = score
+        if intent_scores:
+            return max(intent_scores, key=intent_scores.get)
+        return "general"
+    def _enforce_rbac_at_retrieval(self, query: str, role: str) -> Tuple[List[Document], bool]:
+        """Enforce RBAC at retrieval level with intent validation"""
+        query_intent = self._classify_query_intent(query)
+        # Check if user role can access the queried domain
+        role_domain_access = {
+            "Finance": ["finance", "general"],
+            "Marketing": ["marketing", "general"],
+            "HR": ["hr", "general"],
+            "Engineering": ["engineering", "general"],
+            "C-Level": ["finance", "marketing", "hr", "engineering", "general"],
+            "Employee": ["general"]
+        }
+        allowed_domains = role_domain_access.get(role, ["general"])
+        if query_intent not in allowed_domains:
+            return [], False  # Unauthorized access
+        # Get role-specific documents
+        role_docs = self.role_index.get(role, [])
+        # Filter by relevance
+        relevant_docs = self._get_relevant_documents(query, role_docs)
+        return relevant_docs, True
+    def _get_relevant_documents(self, query: str, candidate_docs: List[Document], k: int = 3) -> List[Document]:
+        """Get relevant documents from candidate set"""
+        query_terms = query.lower().split()
+        scored_docs = []
+        for doc in candidate_docs:
+            content_lower = doc.page_content.lower()
+            score = 0
+            # Score based on term frequency
+            for term in query_terms:
+                score += content_lower.count(term) * 2
+            # Boost for exact phrase matches
+            if query.lower() in content_lower:
+                score += 10
+            # Boost for title/metadata matches
+            title = doc.metadata.get('title', '').lower()
+            for term in query_terms:
+                if term in title:
+                    score += 5
+            if score > 0:
+                scored_docs.append((doc, score))
+        # Sort by score and return top k
+        scored_docs.sort(key=lambda x: x[1], reverse=True)
+        return [doc for doc, score in scored_docs[:k]]
+    def _generate_unauthorized_response(self, query: str, role: str, query_intent: str) -> str:
+        """Generate graceful unauthorized access message"""
+        intent_role_map = {
+            "finance": "Finance and Executive",
+            "marketing": "Marketing and Executive",
+            "hr": "HR and Executive",
+            "engineering": "Engineering and Executive"
+        }
+        required_roles = intent_role_map.get(query_intent, "appropriate")
+        return f"""
+🛡️ **Access Restricted**
+This information is restricted to **{required_roles}** roles only.
+Your current role (**{role}**) does not have permission to access {query_intent} data.
+**Available to you:**
+{chr(10).join(['• ' + doc.replace('_', ' ').title() for doc in self.auth_system.get_accessible_documents(role)])}
+Please contact your administrator if you need access to additional information.
+        """
+    def _extract_key_metrics(self, content: str, query_intent: str) -> Dict:
+        """Extract key metrics for visualization"""
+        metrics = {}
+        if query_intent == "finance":
+            # Extract financial numbers
+            revenue_match = re.search(r'revenue[:\s]*\$?([\d.,]+)\s*(billion|million)', content.lower())
+            if revenue_match:
+                amount = revenue_match.group(1).replace(',', '')
+                unit = revenue_match.group(2)
+                multiplier = 1000 if unit == 'billion' else 1
+                metrics['revenue'] = float(amount) * multiplier
+            # Extract percentages
+            growth_match = re.search(r'(\d+)%\s*(yoy|growth)', content.lower())
+            if growth_match:
+                metrics['growth_rate'] = int(growth_match.group(1))
+        elif query_intent == "marketing":
+            # Extract marketing metrics
+            acq_match = re.search(r'(\d+,?\d*)\s*new customers', content.lower())
+            if acq_match:
+                metrics['customer_acquisition'] = int(acq_match.group(1).replace(',', ''))
+            roi_match = re.search(r'(\d+\.?\d*)x\s*r[oe]i', content.lower())
+            if roi_match:
+                metrics['roi'] = float(roi_match.group(1))
+        return metrics
+    def _create_visualization(self, metrics: Dict, query_intent: str) -> Optional[str]:
+        """Create visualizations for metrics"""
+        if not metrics:
+            return None
+        try:
+            if query_intent == "finance" and 'revenue' in metrics:
+                # Create a simple revenue chart
+                quarters = ['Q1', 'Q2', 'Q3', 'Q4']
+                revenues = [2100, 2300, 2400, 2600]  # Sample Q data
+                fig = px.bar(
+                    x=quarters,
+                    y=revenues,
+                    title="Quarterly Revenue 2024 ($ Millions)",
+                    labels={'x': 'Quarter', 'y': 'Revenue ($ Millions)'}
+                )
+                fig.update_layout(height=400, showlegend=False)
+                return fig.to_html(include_plotlyjs='cdn', div_id="revenue_chart")
+            elif query_intent == "marketing" and 'customer_acquisition' in metrics:
+                # Create customer acquisition chart
+                months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
+                acquisitions = [18000, 22000, 25000, 28000, 32000, 35000]  # Sample data
+                fig = px.line(
+                    x=months,
+                    y=acquisitions,
+                    title="Customer Acquisition Trend 2024",
+                    labels={'x': 'Month', 'y': 'New Customers'}
+                )
+                fig.update_layout(height=400, showlegend=False)
+                return fig.to_html(include_plotlyjs='cdn', div_id="acquisition_chart")
+        except Exception as e:
+            print(f"Error creating visualization: {e}")
+            return None
+    def _create_data_table(self, content: str, query_intent: str) -> Optional[str]:
+        """Create data tables from content"""
+        try:
+            if query_intent == "finance":
+                # Create financial metrics table
+                data = {
+                    'Metric': ['Q4 Revenue', 'Annual Revenue', 'Net Income', 'Gross Margin', 'ROI'],
+                    'Value': ['$2.6B', '$9.4B', '$325M', '64%', '15%'],
+                    'YoY Growth': ['+35%', '+28%', '+18%', '+6%', '+3%']
+                }
+                df = pd.DataFrame(data)
+                return df.to_html(index=False, classes='financial-table', table_id='financial-metrics')
+            elif query_intent == "marketing":
+                # Create marketing metrics table
+                data = {
+                    'Campaign': ['Digital Ads', 'Influencer', 'Email', 'Events'],
+                    'Spend': ['$5M', '$1.5M', '$0.2M', '$2M'],
+                    'ROI': ['3.5x', '4.2x', '2.0x', '5.0x'],
+                    'Leads': ['180K', '60K', '25K', '300']
+                }
+                df = pd.DataFrame(data)
+                return df.to_html(index=False, classes='marketing-table', table_id='marketing-metrics')
+        except Exception as e:
+            print(f"Error creating table: {e}")
+            return None
+    def _generate_enhanced_response(self, query: str, context_docs: List[Document], role: str) -> Tuple[str, List[str], Optional[str], Optional[str]]:
+        """Generate enhanced response with visualizations and tables"""
+        query_intent = self._classify_query_intent(query)
+        # Get base response
+        response = self._generate_contextual_response(query, context_docs, role, query_intent)
+        # Extract sources with proper attribution
+        sources = []
+        for doc in context_docs:
+            source = doc.metadata.get('title', 'Company Document')
+            doc_type = doc.metadata.get('type', 'Document')
+            sources.append(f"{source} ({doc_type})")
+        # Combine content for metric extraction
+        full_content = "\n".join([doc.page_content for doc in context_docs])
+        # Extract metrics and create visualizations
+        metrics = self._extract_key_metrics(full_content, query_intent)
+        visualization = self._create_visualization(metrics, query_intent)
+        table = self._create_data_table(full_content, query_intent)
+        return response, sources, visualization, table
+    def _generate_contextual_response(self, query: str, context_docs: List[Document], role: str, query_intent: str) -> str:
+        """Generate contextual response with better structure"""
+        if not context_docs:
+            return "No relevant information found for your query."
+        # Extract relevant content
+        full_context = "\n\n".join([doc.page_content for doc in context_docs])
+        response_parts = []
+        response_parts.append(f"**Based on your {role} access level:**")
+        response_parts.append("")  # Empty line
+        # Generate intent-specific responses
+        if query_intent == "finance":
+            response_parts.extend(self._generate_finance_insights(query, full_context))
+        elif query_intent == "marketing":
+            response_parts.extend(self._generate_marketing_insights(query, full_context))
+        elif query_intent == "hr":
+            response_parts.extend(self._generate_hr_insights(query, full_context))
+        elif query_intent == "engineering":
+            response_parts.extend(self._generate_technical_insights(query, full_context))
+        else:
+            response_parts.extend(self._generate_general_insights(query, full_context))
+        return "\n".join(response_parts)
+    def _generate_finance_insights(self, query: str, context: str) -> List[str]:
+        """Generate finance-specific insights"""
+        insights = ["💰 **Financial Insights:**", ""]
+        # Extract key metrics
+        if "2.6 billion" in context or "revenue" in query.lower():
+            insights.extend([
+                "📈 **Revenue Performance:**",
+                "• Q4 2024: $2.6 billion (35% YoY growth)",
+                "• Annual 2024: $9.4 billion (28% YoY increase)",
+                "• Strong growth trajectory maintained throughout the year",
+                ""
+            ])
+        if "margin" in query.lower() or "profit" in query.lower():
+            insights.extend([
+                "📊 **Profitability Metrics:**",
+                "• Gross Margin: 64% (improved from 58% in Q1)",
+                "• Net Income: $325M (18% YoY increase)",
+                "• Operating Income: $650M",
+                ""
+            ])
+        if "cost" in query.lower() or "expense" in query.lower():
+            insights.extend([
+                "💸 **Cost Analysis:**",
+                "• Vendor Services: $30M (18% increase)",
+                "• Software Subscriptions: $25M (22% increase)",
+                "• Marketing Investment: $2.3B with strong ROI",
+                ""
+            ])
+        insights.append("🎯 **Key Takeaway:** Strong revenue growth with improving margins despite increased operational costs.")
+        return insights
+    def _generate_marketing_insights(self, query: str, context: str) -> List[str]:
+        """Generate marketing-specific insights"""
+        insights = ["📈 **Marketing Insights:**", ""]
+        if "campaign" in query.lower() or "performance" in query.lower():
+            insights.extend([
+                "🎯 **Campaign Performance:**",
+                "• Customer Acquisition: 20% increase year-over-year",
+                "• Digital Campaign ROI: 3.5x return on $5M investment",
+                "• Q4 Results: 220,000 new customers (exceeded target)",
+                ""
+            ])
+        if "roi" in query.lower() or "return" in query.lower():
+            insights.extend([
+                "💰 **ROI Analysis:**",
+                "• Overall Marketing ROI: 4.5x",
+                "• Digital Channels: 3.5x return",
+                "• Event Marketing: 5.0x return",
+                "• Email Marketing: 2.0x return",
+                ""
+            ])
+        if "customer" in query.lower():
+            insights.extend([
+                "👥 **Customer Metrics:**",
+                "• Brand Awareness: 15% growth YoY",
+                "• Customer Retention: 85%",
+                "• Customer Acquisition Cost: $150 (down from $180)",
+                ""
+            ])
+        insights.append("🚀 **Key Takeaway:** Successful global expansion with strong ROI across all marketing channels.")
+        return insights
+    def _generate_hr_insights(self, query: str, context: str) -> List[str]:
+        """Generate HR-specific insights"""
+        insights = ["👥 **HR Insights:**", ""]
+        if "benefits" in query.lower():
+            insights.extend([
+                "🏥 **Employee Benefits:**",
+                "• Health Insurance: Family floater policy",
+                "• Provident Fund: 12% employer contribution",
+                "• Maternity Leave: 26 weeks paid leave",
+                "• Flexible Work: Up to 2 days/week WFH",
+                ""
+            ])
+        if "leave" in query.lower() or "policy" in query.lower():
+            insights.extend([
+                "📅 **Leave Policies:**",
+                "• Annual Leave: 15-21 days/year",
+                "• Sick Leave: 12 days/year",
+                "• Casual Leave: 7 days/year",
+                "• Emergency Leave: Available with manager approval",
+                ""
+            ])
+        if "salary" in query.lower() or "compensation" in query.lower():
+            insights.extend([
+                "💵 **Compensation Structure:**",
+                "• Basic Salary: 40-50% of CTC",
+                "• HRA: 40-50% of basic salary",
+                "• Annual Bonus: Minimum 8.33% of basic",
+                "• Performance Increments: Based on annual reviews",
+                ""
+            ])
+        insights.append("💡 **Key Takeaway:** Comprehensive benefits package with competitive compensation and flexible work arrangements.")
+        return insights
+    def _generate_technical_insights(self, query: str, context: str) -> List[str]:
+        """Generate technical/engineering insights"""
+        insights = ["🔧 **Technical Insights:**", ""]
+        if "architecture" in query.lower():
+            insights.extend([
+                "🏗️ **System Architecture:**",
+                "• Microservices-based, cloud-native design",
+                "• AWS infrastructure with Kubernetes orchestration",
+                "• PostgreSQL, MongoDB, Redis for data storage",
+                "• 99.99% uptime target with auto-scaling",
+                ""
+            ])
+        if "technology" in query.lower() or "stack" in query.lower():
+            insights.extend([
+                "💻 **Technology Stack:**",
+                "• Frontend: React 18, TypeScript, Tailwind CSS",
+                "• Backend: Node.js, Python, Go",
+                "• Mobile: Swift (iOS), Kotlin (Android)",
+                "• Infrastructure: AWS, Kubernetes, Docker",
+                ""
+            ])
+        if "security" in query.lower():
+            insights.extend([
+                "🔒 **Security Measures:**",
+                "• OAuth 2.0 and JWT authentication",
+                "• TLS 1.3 encryption for all communications",
+                "• Regular security audits and penetration testing",
+                "• Compliance: PCI-DSS, GDPR, ISO 27001",
+                ""
+            ])
+        insights.append("⚡ **Key Takeaway:** Modern, scalable architecture with strong security and compliance standards.")
+        return insights
+    def _generate_general_insights(self, query: str, context: str) -> List[str]:
+        """Generate general company insights"""
+        insights = ["🏢 **Company Information:**", ""]
+        insights.extend([
+            "📋 **About FinSolve Technologies:**",
+            "• Founded: 2018",
+            "• Headquarters: Bangalore, India",
+            "• Global presence: North America, Europe, Asia-Pacific",
+            "• Services: Digital banking, payments, wealth management",
+            "",
+            "���� **Mission & Values:**",
+            "• Mission: Empower financial freedom through technology",
+            "• Core Values: Integrity, Innovation, Customer Focus",
+            "• Commitment: Secure, scalable financial solutions",
+        ])
+        return insights
+    def store_feedback(self, query: str, response: str, rating: int, role: str):
+        """Store user feedback for future improvements"""
+        feedback_id = len(self.query_feedback)
+        self.query_feedback[feedback_id] = {
+            'query': query,
+            'response': response,
+            'rating': rating,
+            'role': role,
+            'timestamp': pd.Timestamp.now()
+        }
+    def query(self, query: str, user_role: str) -> Tuple[str, List[str], Optional[str], Optional[str]]:
+        """Enhanced query method with RBAC, visualizations, and tables"""
+        try:
+            if not self.initialized:
+                return "System not initialized. Please try again.", [], None, None
+            # Enforce RBAC at retrieval level
+            relevant_docs, authorized = self._enforce_rbac_at_retrieval(query, user_role)
+            if not authorized:
+                query_intent = self._classify_query_intent(query)
+                unauthorized_msg = self._generate_unauthorized_response(query, user_role, query_intent)
+                return unauthorized_msg, [], None, None
+            if not relevant_docs:
+                return f"No relevant information found in your accessible documents for: {query}", [], None, None
+            # Generate enhanced response
+            response, sources, visualization, table = self._generate_enhanced_response(
+                query, relevant_docs, user_role
+            )
+            return response, sources, visualization, table
+        except Exception as e:
+            error_msg = f"Error processing query: {str(e)}"
+            return error_msg, [], None, None
+    def get_system_status(self) -> Dict:
+        """Get enhanced system status"""
+        return {
+            "documents_loaded": len(self.documents),
+            "system_initialized": self.initialized,
+            "role_index_built": hasattr(self, 'role_index'),
+            "feedback_entries": len(self.query_feedback),
+            "available_roles": list(self.role_index.keys()) if hasattr(self, 'role_index') else []
+        }