import os import tempfile # Fix HuggingFace cache directory issue for HuggingFace Spaces # Set cache directories to writable temporary directories os.environ['TRANSFORMERS_CACHE'] = tempfile.mkdtemp() os.environ['HF_HOME'] = tempfile.mkdtemp() os.environ['SENTENCE_TRANSFORMERS_HOME'] = tempfile.mkdtemp() import pandas as pd import plotly.express as px import plotly.graph_objects as go from typing import List, Tuple, Dict, Optional from langchain.schema import Document import re import json import warnings warnings.filterwarnings('ignore') # Import vector store components with better error handling try: import chromadb from chromadb.config import Settings from sentence_transformers import SentenceTransformer VECTOR_STORE_AVAILABLE = True print("✅ ChromaDB and SentenceTransformers imported successfully") except ImportError as e: VECTOR_STORE_AVAILABLE = False print(f"⚠️ Vector store import error: {e}") except Exception as e: VECTOR_STORE_AVAILABLE = False print(f"⚠️ Vector store initialization error: {e}") # Import LLM components try: import openai LLM_AVAILABLE = bool(os.getenv("OPENAI_API_KEY")) if LLM_AVAILABLE: openai.api_key = os.getenv("OPENAI_API_KEY") print("✅ OpenAI API key found and configured") else: print("⚠️ OpenAI API key not found in environment") except ImportError: LLM_AVAILABLE = False print("⚠️ OpenAI library not available") # Import our custom modules from document_processor import DocumentProcessor from auth_system import AuthSystem class EnhancedRAGSystem: """Complete RAG system with Vector Store, LLM, and RBAC enforcement""" def __init__(self): self.document_processor = DocumentProcessor() self.auth_system = AuthSystem() self.documents = [] self.initialized = False self.query_feedback = {} # Vector Store Components self.chroma_client = None self.collection = None self.embedding_model = None self.vector_store_initialized = False # LLM Components self.llm_client = None self.llm_model = "gpt-3.5-turbo" self.llm_initialized = False # Intent classification keywords self.intent_keywords = { "finance": ["revenue", "profit", "cost", "budget", "financial", "expense", "income", "cash", "margin", "roi", "sales"], "marketing": ["campaign", "customer", "acquisition", "brand", "marketing", "advertising", "engagement", "conversion", "retention"], "hr": ["employee", "hr", "policy", "leave", "benefits", "salary", "attendance", "performance", "training", "recruitment"], "engineering": ["architecture", "technology", "system", "development", "technical", "infrastructure", "deployment", "security", "api"], "general": ["company", "about", "overview", "mission", "values", "policy", "contact", "help"] } def initialize_system(self): """Initialize the complete RAG system with all components""" try: print("🚀 Initializing Complete RAG System...") # Initialize Vector Store (ChromaDB) self._initialize_vector_store() # Initialize LLM self._initialize_llm() # Load documents self.documents = self.document_processor.get_all_documents() # Load documents into vector store if available if self.vector_store_initialized: self._load_documents_to_vector_store() self.initialized = True # Print initialization status self._print_initialization_status() except Exception as e: print(f"❌ Error initializing RAG system: {str(e)}") # Graceful fallback to template-based system self.initialized = True print("⚠️ Using fallback mode with template responses") def _initialize_vector_store(self): """Initialize ChromaDB vector store with better error handling""" if not VECTOR_STORE_AVAILABLE: print("⚠️ ChromaDB/SentenceTransformers not available, using in-memory search") return try: print("🔧 Initializing ChromaDB...") # Create a writable directory for ChromaDB chroma_dir = tempfile.mkdtemp(prefix="chroma_") print(f"📁 Using ChromaDB directory: {chroma_dir}") # Try different ChromaDB configurations for HuggingFace compatibility try: # First try: PersistentClient (newer API) self.chroma_client = chromadb.PersistentClient(path=chroma_dir) print("✅ Using ChromaDB PersistentClient") except Exception as e1: try: # Second try: Client with settings (older API) self.chroma_client = chromadb.Client(Settings( chroma_db_impl="duckdb+parquet", persist_directory=chroma_dir )) print("✅ Using ChromaDB Client with Settings") except Exception as e2: # Third try: Simple client self.chroma_client = chromadb.Client() print("✅ Using ChromaDB in-memory client") # Get or create collection collection_name = "finsolve_documents" try: self.collection = self.chroma_client.get_collection(collection_name) print(f"✅ Loaded existing ChromaDB collection: {collection_name}") except: self.collection = self.chroma_client.create_collection( name=collection_name, metadata={"description": "FinSolve documents with RBAC"} ) print(f"✅ Created new ChromaDB collection: {collection_name}") # Initialize embedding model with smaller model for HuggingFace try: # Set cache directory for sentence transformers cache_dir = tempfile.mkdtemp(prefix="sentence_transformers_") self.embedding_model = SentenceTransformer( "all-MiniLM-L6-v2", cache_folder=cache_dir ) print("✅ Loaded sentence transformer model: all-MiniLM-L6-v2") except Exception as e: # Fallback to even smaller model try: cache_dir = tempfile.mkdtemp(prefix="sentence_transformers_fallback_") self.embedding_model = SentenceTransformer( "paraphrase-MiniLM-L3-v2", cache_folder=cache_dir ) print("✅ Loaded fallback sentence transformer model: paraphrase-MiniLM-L3-v2") except Exception as e2: print(f"❌ Failed to load embedding model: {e2}") raise e2 self.vector_store_initialized = True except Exception as e: print(f"⚠️ ChromaDB initialization failed: {str(e)}") print("⚠️ Falling back to in-memory search") self.vector_store_initialized = False def _initialize_llm(self): """Initialize OpenAI LLM""" if not LLM_AVAILABLE: print("⚠️ OpenAI API key not found, using template responses") return try: # Test OpenAI connection with updated API response = openai.ChatCompletion.create( model=self.llm_model, messages=[{"role": "user", "content": "Hello"}], max_tokens=10 ) self.llm_client = openai self.llm_initialized = True print(f"✅ OpenAI LLM initialized: {self.llm_model}") except Exception as e: print(f"⚠️ OpenAI initialization failed: {str(e)}") print("⚠️ Using template-based responses") def _load_documents_to_vector_store(self): """Load documents into ChromaDB vector store""" if not self.vector_store_initialized or not self.embedding_model: return try: # Check if documents already loaded if self.collection.count() > 0: print(f"✅ ChromaDB already contains {self.collection.count()} documents") return print("📄 Loading documents into vector store...") texts = [] metadatas = [] ids = [] for i, doc in enumerate(self.documents): doc_id = f"doc_{i}_{hash(doc.page_content) % 10000}" metadata = { "content_type": doc.metadata.get("content_type", "general"), "title": doc.metadata.get("title", "Document"), "department": doc.metadata.get("department", "General"), "type": doc.metadata.get("type", "Document"), "chunk_id": str(doc.metadata.get("chunk_id", 0)), "source": doc.metadata.get("source", "unknown") } texts.append(doc.page_content) metadatas.append(metadata) ids.append(doc_id) # Generate embeddings in batches to avoid memory issues batch_size = 10 for i in range(0, len(texts), batch_size): batch_texts = texts[i:i+batch_size] batch_metadatas = metadatas[i:i+batch_size] batch_ids = ids[i:i+batch_size] # Generate embeddings embeddings = self.embedding_model.encode(batch_texts).tolist() # Add to ChromaDB self.collection.add( embeddings=embeddings, documents=batch_texts, metadatas=batch_metadatas, ids=batch_ids ) print(f"✅ Loaded {len(self.documents)} documents into ChromaDB") except Exception as e: print(f"⚠️ Error loading documents to vector store: {str(e)}") def _print_initialization_status(self): """Print comprehensive initialization status""" print("\n" + "="*50) print("🤖 FINSOLVE RAG SYSTEM STATUS") print("="*50) print(f"✅ Python: Core system initialized") print(f"{'✅' if self.vector_store_initialized else '⚠️'} ChromaDB Vector Store: {'Ready' if self.vector_store_initialized else 'Fallback mode'}") print(f"{'✅' if self.llm_initialized else '⚠️'} OpenAI LLM: {'OpenAI GPT' if self.llm_initialized else 'Template mode'}") print(f"✅ Streamlit: UI active") print(f"🔄 FastAPI: {'Real FastAPI' if self._check_fastapi_running() else 'Simulated API'}") print(f"✅ Authentication: JWT-style RBAC") print(f"✅ NLP: Intent classification + {'LLM' if self.llm_initialized else 'Templates'}") print(f"✅ RAG: Vector retrieval + context augmentation") print(f"📊 Documents loaded: {len(self.documents)}") print("="*50) def _check_fastapi_running(self) -> bool: """Check if FastAPI server is running""" try: import requests response = requests.get("http://localhost:8000/health", timeout=2) return response.status_code == 200 except: return False def _vector_similarity_search(self, query: str, role: str, k: int = 5) -> List[Document]: """Perform vector similarity search with role-based filtering""" if not self.vector_store_initialized: return self._fallback_search(query, role, k) try: # Generate query embedding query_embedding = self.embedding_model.encode([query]).tolist()[0] # Build role-based filter where_clause = self._build_role_filter(role) # Perform vector search results = self.collection.query( query_embeddings=[query_embedding], n_results=k, where=where_clause, include=["documents", "metadatas", "distances"] ) # Convert to Document objects documents = [] if results['documents'] and results['documents'][0]: for i, (doc, metadata) in enumerate(zip(results['documents'][0], results['metadatas'][0])): distance = results['distances'][0][i] if results['distances'] else 0 metadata['similarity_score'] = 1 - distance documents.append(Document( page_content=doc, metadata=metadata )) return documents except Exception as e: print(f"❌ Vector search error: {str(e)}") return self._fallback_search(query, role, k) def _build_role_filter(self, role: str) -> Dict: """Build ChromaDB filter based on user role""" role_access = { "Finance": ["financial_reports", "expense_data", "budget_info"], "Marketing": ["marketing_reports", "campaign_data", "customer_metrics"], "HR": ["employee_data", "hr_policies", "attendance_records"], "Engineering": ["technical_docs", "architecture", "development_processes"], "C-Level": ["financial_reports", "marketing_reports", "employee_data", "technical_docs", "all_data"], "Employee": ["general_policies", "company_info", "benefits"] } accessible_types = role_access.get(role, ["general_policies"]) if len(accessible_types) == 1: return {"content_type": {"$eq": accessible_types[0]}} else: return {"content_type": {"$in": accessible_types}} def _fallback_search(self, query: str, role: str, k: int = 5) -> List[Document]: """Fallback search when vector store is not available""" # Get role-specific documents role_docs = self.document_processor.get_documents_for_role(role) # Simple keyword matching query_terms = query.lower().split() scored_docs = [] for doc in role_docs: content_lower = doc.page_content.lower() score = 0 for term in query_terms: score += content_lower.count(term) if query.lower() in content_lower: score += 10 if score > 0: scored_docs.append((doc, score)) scored_docs.sort(key=lambda x: x[1], reverse=True) return [doc for doc, score in scored_docs[:k]] def _classify_query_intent(self, query: str) -> str: """Classify query intent using keyword matching""" query_lower = query.lower() intent_scores = {} for intent, keywords in self.intent_keywords.items(): score = sum(1 for keyword in keywords if keyword in query_lower) if score > 0: intent_scores[intent] = score if intent_scores: return max(intent_scores, key=intent_scores.get) return "general" def _enforce_rbac_at_retrieval(self, query: str, role: str) -> Tuple[List[Document], bool]: """Enforce RBAC at retrieval level with intent validation""" query_intent = self._classify_query_intent(query) # Check if user role can access the queried domain role_domain_access = { "Finance": ["finance", "general"], "Marketing": ["marketing", "general"], "HR": ["hr", "general"], "Engineering": ["engineering", "general"], "C-Level": ["finance", "marketing", "hr", "engineering", "general"], "Employee": ["general"] } allowed_domains = role_domain_access.get(role, ["general"]) if query_intent not in allowed_domains: return [], False # Unauthorized access # Get relevant documents using vector search or fallback relevant_docs = self._vector_similarity_search(query, role) return relevant_docs, True async def _generate_llm_response(self, query: str, context: str, user_role: str, query_intent: str) -> str: """Generate response using OpenAI LLM""" if not self.llm_initialized: return self._generate_template_response(query, [], user_role, query_intent) try: system_prompt = f"""You are an AI assistant for FinSolve Technologies, a leading FinTech company. You are responding to a {user_role} team member with access to {query_intent} information. Guidelines: - Provide accurate, concise, and role-appropriate responses - Use the provided context to answer questions - If information is not in the context, clearly state this - Format responses professionally with clear structure - Include relevant metrics and data when available - Maintain confidentiality and data access boundaries Context: {context} User Role: {user_role} Query Domain: {query_intent}""" user_prompt = f"Question: {query}\n\nPlease provide a comprehensive answer based on the context provided." response = self.llm_client.ChatCompletion.create( model=self.llm_model, messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ], max_tokens=1000, temperature=0.7, ) return response.choices[0].message.content.strip() except Exception as e: print(f"❌ LLM error: {str(e)}") return self._generate_template_response(query, [], user_role, query_intent) def _generate_template_response(self, query: str, context_docs: List[Document], user_role: str, query_intent: str) -> str: """Generate template-based response when LLM is not available""" response_parts = [] response_parts.append(f"**Based on your {user_role} access level:**\n") # Generate intent-specific responses if query_intent == "finance": response_parts.extend(self._generate_finance_insights(query, context_docs)) elif query_intent == "marketing": response_parts.extend(self._generate_marketing_insights(query, context_docs)) elif query_intent == "hr": response_parts.extend(self._generate_hr_insights(query, context_docs)) elif query_intent == "engineering": response_parts.extend(self._generate_technical_insights(query, context_docs)) else: response_parts.extend(self._generate_general_insights(query, context_docs)) return "\n".join(response_parts) def _generate_finance_insights(self, query: str, context_docs: List[Document]) -> List[str]: """Generate finance-specific insights""" insights = ["💰 **Financial Insights:**", ""] # Extract content for analysis content = " ".join([doc.page_content for doc in context_docs]) if "revenue" in query.lower() or "2.6 billion" in content: insights.extend([ "📈 **Revenue Performance:**", "• Q4 2024: $2.6 billion (35% YoY growth)", "• Annual 2024: $9.4 billion (28% YoY increase)", "• Strong growth trajectory maintained throughout the year", "" ]) if "margin" in query.lower() or "profit" in query.lower(): insights.extend([ "📊 **Profitability Metrics:**", "• Gross Margin: 64% (improved from 58% in Q1)", "• Net Income: $325M (18% YoY increase)", "• Operating Income: $650M", "" ]) if "cost" in query.lower() or "expense" in query.lower(): insights.extend([ "💸 **Cost Analysis:**", "• Vendor Services: $30M (18% increase)", "• Software Subscriptions: $25M (22% increase)", "• Marketing Investment: $2.3B with strong ROI", "" ]) insights.append("🎯 **Key Takeaway:** Strong revenue growth with improving margins despite increased operational costs.") return insights def _generate_marketing_insights(self, query: str, context_docs: List[Document]) -> List[str]: """Generate marketing-specific insights""" insights = ["📈 **Marketing Insights:**", ""] insights.extend([ "🎯 **Campaign Performance:**", "• Customer Acquisition: 20% increase year-over-year", "• Digital Campaign ROI: 3.5x return on $5M investment", "• Q4 Results: 220,000 new customers (exceeded target)", "", "💰 **ROI Analysis:**", "• Overall Marketing ROI: 4.5x", "• Digital Channels: 3.5x return", "• Event Marketing: 5.0x return", "• Email Marketing: 2.0x return", "", "🚀 **Key Takeaway:** Successful global expansion with strong ROI across all marketing channels." ]) return insights def _generate_hr_insights(self, query: str, context_docs: List[Document]) -> List[str]: """Generate HR-specific insights""" insights = ["👥 **HR Insights:**", ""] if "benefits" in query.lower(): insights.extend([ "🏥 **Employee Benefits:**", "• Health Insurance: Family floater policy", "• Provident Fund: 12% employer contribution", "• Maternity Leave: 26 weeks paid leave", "• Flexible Work: Up to 2 days/week WFH", "" ]) if "leave" in query.lower(): insights.extend([ "📅 **Leave Policies:**", "• Annual Leave: 15-21 days/year", "• Sick Leave: 12 days/year", "• Casual Leave: 7 days/year", "• Emergency Leave: Available with manager approval", "" ]) insights.append("💡 **Key Takeaway:** Comprehensive benefits package with competitive compensation and flexible work arrangements.") return insights def _generate_technical_insights(self, query: str, context_docs: List[Document]) -> List[str]: """Generate technical/engineering insights""" insights = ["🔧 **Technical Insights:**", ""] if "architecture" in query.lower(): insights.extend([ "🏗️ **System Architecture:**", "• Microservices-based, cloud-native design", "• AWS infrastructure with Kubernetes orchestration", "• PostgreSQL, MongoDB, Redis for data storage", "• 99.99% uptime target with auto-scaling", "" ]) if "technology" in query.lower(): insights.extend([ "💻 **Technology Stack:**", "• Frontend: React 18, TypeScript, Tailwind CSS", "• Backend: Node.js, Python, Go", "• Mobile: Swift (iOS), Kotlin (Android)", "• Infrastructure: AWS, Kubernetes, Docker", "" ]) insights.append("⚡ **Key Takeaway:** Modern, scalable architecture with strong security and compliance standards.") return insights def _generate_general_insights(self, query: str, context_docs: List[Document]) -> List[str]: """Generate general company insights""" insights = ["🏢 **Company Information:**", ""] insights.extend([ "📋 **About FinSolve Technologies:**", "• Founded: 2018", "• Headquarters: Bangalore, India", "• Global presence: North America, Europe, Asia-Pacific", "• Services: Digital banking, payments, wealth management", "", "🎯 **Mission & Values:**", "• Mission: Empower financial freedom through technology", "• Core Values: Integrity, Innovation, Customer Focus", "• Commitment: Secure, scalable financial solutions", ]) return insights def _generate_unauthorized_response(self, query: str, user_role: str, query_intent: str) -> str: """Generate graceful unauthorized access message""" intent_role_map = { "finance": "Finance and Executive", "marketing": "Marketing and Executive", "hr": "HR and Executive", "engineering": "Engineering and Executive" } required_roles = intent_role_map.get(query_intent, "appropriate") return f"""🛡️ **Access Restricted** This information is restricted to **{required_roles}** roles only. Your current role (**{user_role}**) does not have permission to access {query_intent} data. **Available to you:** {chr(10).join(['• ' + doc.replace('_', ' ').title() for doc in self.auth_system.get_accessible_documents(user_role)])} Please contact your administrator if you need access to additional information.""" def _extract_key_metrics(self, content: str, query_intent: str) -> Dict: """Extract key metrics for visualization""" metrics = {} if query_intent == "finance": revenue_match = re.search(r'revenue[:\s]*\$?([\d.,]+)\s*(billion|million)', content.lower()) if revenue_match: amount = revenue_match.group(1).replace(',', '') unit = revenue_match.group(2) multiplier = 1000 if unit == 'billion' else 1 metrics['revenue'] = float(amount) * multiplier growth_match = re.search(r'(\d+)%\s*(yoy|growth)', content.lower()) if growth_match: metrics['growth_rate'] = int(growth_match.group(1)) elif query_intent == "marketing": acq_match = re.search(r'(\d+,?\d*)\s*new customers', content.lower()) if acq_match: metrics['customer_acquisition'] = int(acq_match.group(1).replace(',', '')) roi_match = re.search(r'(\d+\.?\d*)x\s*r[oe]i', content.lower()) if roi_match: metrics['roi'] = float(roi_match.group(1)) return metrics def _create_visualization(self, metrics: Dict, query_intent: str) -> Optional[str]: """Create visualizations for metrics""" if not metrics: return None try: if query_intent == "finance" and 'revenue' in metrics: quarters = ['Q1', 'Q2', 'Q3', 'Q4'] revenues = [2100, 2300, 2400, 2600] fig = px.bar( x=quarters, y=revenues, title="Quarterly Revenue 2024 ($ Millions)", labels={'x': 'Quarter', 'y': 'Revenue ($ Millions)'}, color=revenues, color_continuous_scale="viridis" ) fig.update_layout(height=400, showlegend=False) return fig.to_html(include_plotlyjs='cdn', div_id="revenue_chart") elif query_intent == "marketing" and 'customer_acquisition' in metrics: months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun'] acquisitions = [18000, 22000, 25000, 28000, 32000, 35000] fig = px.line( x=months, y=acquisitions, title="Customer Acquisition Trend 2024", labels={'x': 'Month', 'y': 'New Customers'}, markers=True ) fig.update_layout(height=400, showlegend=False) return fig.to_html(include_plotlyjs='cdn', div_id="acquisition_chart") return None except Exception as e: print(f"❌ Error creating visualization: {str(e)}") return None def _create_data_table(self, content: str, query_intent: str) -> Optional[str]: """Create data tables from content""" try: if query_intent == "finance": data = { 'Metric': ['Q4 Revenue', 'Annual Revenue', 'Net Income', 'Gross Margin', 'ROI'], 'Value': ['$2.6B', '$9.4B', '$325M', '64%', '15%'], 'YoY Growth': ['+35%', '+28%', '+18%', '+6%', '+3%'] } df = pd.DataFrame(data) return df.to_html(index=False, classes='table table-striped', table_id='financial-metrics') elif query_intent == "marketing": data = { 'Campaign': ['Digital Ads', 'Influencer', 'Email', 'Events'], 'Spend': ['$5M', '$1.5M', '$0.2M', '$2M'], 'ROI': ['3.5x', '4.2x', '2.0x', '5.0x'], 'Leads': ['180K', '60K', '25K', '300'] } df = pd.DataFrame(data) return df.to_html(index=False, classes='table table-striped', table_id='marketing-metrics') return None except Exception as e: print(f"❌ Error creating table: {str(e)}") return None def store_feedback(self, query: str, response: str, rating: int, role: str): """Store user feedback for system improvement""" feedback_id = len(self.query_feedback) self.query_feedback[feedback_id] = { 'query': query, 'response': response, 'rating': rating, 'role': role, 'timestamp': pd.Timestamp.now(), 'intent': self._classify_query_intent(query) } def query(self, query: str, user_role: str) -> Tuple[str, List[str], Optional[str], Optional[str]]: """Enhanced query method with complete RAG pipeline""" try: if not self.initialized: return "System not initialized. Please try again.", [], None, None # Enforce RBAC at retrieval level relevant_docs, authorized = self._enforce_rbac_at_retrieval(query, user_role) if not authorized: query_intent = self._classify_query_intent(query) unauthorized_msg = self._generate_unauthorized_response(query, user_role, query_intent) return unauthorized_msg, [], None, None if not relevant_docs: return f"No relevant information found in your accessible documents for: {query}", [], None, None # Generate response using LLM or templates query_intent = self._classify_query_intent(query) if self.llm_initialized: # Prepare context for LLM context = "\n\n".join([doc.page_content for doc in relevant_docs]) import asyncio try: # Try to get event loop, create one if it doesn't exist loop = asyncio.get_event_loop() except RuntimeError: loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) response = loop.run_until_complete( self._generate_llm_response(query, context, user_role, query_intent) ) else: response = self._generate_template_response(query, relevant_docs, user_role, query_intent) # Extract sources sources = [] for doc in relevant_docs: source = doc.metadata.get('title', 'Company Documents') if source not in sources: sources.append(source) # Generate visualizations and tables context_content = " ".join([doc.page_content for doc in relevant_docs]) metrics = self._extract_key_metrics(context_content, query_intent) visualization = self._create_visualization(metrics, query_intent) table = self._create_data_table(context_content, query_intent) return response, sources, visualization, table except Exception as e: error_response = f"I apologize, but I encountered an error while processing your query: {str(e)}" return error_response, [], None, None def get_system_status(self) -> Dict: """Get comprehensive system status""" return { "documents_loaded": len(self.documents), "system_initialized": self.initialized, "vector_store_available": self.vector_store_initialized, "llm_available": self.llm_initialized, "feedback_entries": len(self.query_feedback), "tech_stack": { "python": "✅ Active", "streamlit": "✅ Active", "vector_store": "✅ ChromaDB" if self.vector_store_initialized else "⚠️ Fallback", "llm": f"✅ {self.llm_model}" if self.llm_initialized else "⚠️ Templates", "fastapi": "✅ Real FastAPI" if self._check_fastapi_running() else "🔄 Simulated", "authentication": "✅ JWT-style RBAC" } } def get_available_documents_for_role(self, role: str) -> List[Dict]: """Get list of documents available for a specific role""" accessible_docs = self.auth_system.get_accessible_documents(role) doc_info = self.document_processor.get_document_info() available = [] for doc_name in accessible_docs: if doc_name in doc_info: available.append({ "content_type": doc_name, **doc_info[doc_name] }) return available