import os
from typing import Dict, List, Optional, Tuple

import pandas as pd
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter


class DocumentProcessor:
    """Process and prepare documents for the RAG system.

    The processor holds a small, hard-coded corpus (``embedded_content``)
    keyed by content type. Each entry has a ``"content"`` string and a
    ``"metadata"`` dict. Documents are split into overlapping chunks with a
    ``RecursiveCharacterTextSplitter`` and every chunk is tagged with its
    position (``chunk_id``) and the number of chunks of its parent document
    (``total_chunks``).
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter and load the embedded demo corpus.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by adjacent chunks.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        # Embedded document content for demonstration
        self.embedded_content = {
            "financial_reports": {
                "content": """
                FinSolve Technologies Financial Performance 2024:

                Q4 2024 Financial Highlights:
                - Revenue: $2.6 billion (35% YoY growth)
                - Gross Margin: 64% (improvement from 58% in Q1)
                - Net Income: $325 million (18% YoY increase)
                - Operating Income: $650 million
                - Marketing Spend: $650 million for Q4

                Annual 2024 Summary:
                - Total Revenue: $9.4 billion (28% YoY increase)
                - Marketing Investment: $2.3 billion
                - Vendor Costs: $510 million
                - Cash Flow from Operations: $1.5 billion (14% YoY increase)

                Key Cost Drivers:
                - Vendor Services: $30M (18% increase)
                - Software Subscriptions: $25M (22% increase)
                - Employee Benefits: Increased 10% YoY

                Growth Metrics:
                - Customer Acquisition: 20% increase
                - Market Expansion: Successful entry into Europe and Southeast Asia
                - ROI: Marketing campaigns achieved 4.5x return on investment
                """,
                "metadata": {
                    "title": "Financial Reports 2024",
                    "department": "Finance",
                    "type": "Financial Report",
                    "content_type": "financial_reports",
                },
            },
            "marketing_reports": {
                "content": """
                FinSolve Technologies Marketing Performance 2024:

                Campaign Highlights:
                - New Customer Acquisition: 220,000 (Q4) - exceeded target
                - Digital Campaign ROI: 3.5x return on $5M investment
                - Brand Awareness: 15% growth YoY
                - Customer Engagement: 5% increase

                Geographic Expansion:
                - Europe: Successful market entry in UK, Germany, France
                - Southeast Asia: Strong performance in Indonesia, Thailand, Vietnam
                - Latin America: Expanded into Brazil, Mexico, Colombia

                Q4 2024 Specific Results:
                - Revenue Target: $11 million (achieved)
                - Marketing Spend: $2.5 million
                - Conversion Rate: 15.0% (target met)
                - Customer Retention: 85%

                Key Marketing Channels:
                - Digital Advertising: 40% of budget
                - Influencer Partnerships: Generated 600,000 impressions
                - Email Marketing: 25% open rate, 15% click-through rate
                - Event Marketing: 300 new enterprise leads from events

                Marketing Technology:
                - InstantPay feature launch: 52,000 sign-ups
                - Loyalty program: 50,000 enrolled customers
                - Social media: 25% engagement rate achieved
                """,
                "metadata": {
                    "title": "Marketing Reports 2024",
                    "department": "Marketing",
                    "type": "Marketing Report",
                    "content_type": "marketing_reports",
                },
            },
            "employee_data": {
                "content": """
                FinSolve Technologies Employee Information:

                Company Overview:
                - Founded: 2018
                - Headquarters: Bangalore, India
                - Global Operations: North America, Europe, Asia-Pacific
                - Employees: 2 million+ individual users, 10,000+ business clients served

                Employee Benefits:
                - Health Insurance: Family floater policy
                - Provident Fund: 12% employer & employee contribution
                - Maternity Benefit: 26 weeks paid leave
                - Flexible Work: Remote work and flexible hours available
                - Professional Development: Access to online learning platforms

                Leave Policies:
                - Annual Leave: 15-21 days/year
                - Sick Leave: 12 days/year
                - Casual Leave: 7 days/year
                - Maternity Leave: 26 weeks
                - Paternity Leave: 7-15 days

                Work Hours:
                - Standard: 9 hours/day (including 1 hour break)
                - Flexible timings available for eligible roles
                - Attendance tracking via biometric/HRMS app

                Performance Management:
                - Annual and mid-year reviews
                - Based on KPIs, goals, competencies
                - Regular 1:1 meetings with managers
                - Recognition and rewards program

                Compensation Structure:
                - Basic Salary: 40-50% of CTC
                - HRA: 40-50% of basic salary
                - Annual bonus: Minimum 8.33% of basic salary
                - Performance-based increments
                """,
                "metadata": {
                    "title": "Employee Handbook & HR Data",
                    "department": "HR",
                    "type": "HR Policy",
                    "content_type": "employee_data",
                },
            },
            "technical_docs": {
                "content": """
                FinSolve Technologies Engineering Architecture:

                System Architecture:
                - Microservices-based, cloud-native system
                - Designed for scalability, resilience, and security
                - Modular design supporting rapid feature development

                Technology Stack:

                Frontend:
                - React 18, Redux Toolkit, Tailwind CSS
                - TypeScript, React Query, D3.js
                - Mobile: Swift 5.5 (iOS), Kotlin 1.6 (Android)

                Backend:
                - Node.js 18 LTS, Python 3.11 (FastAPI), Go 1.19
                - Express.js, Pydantic, Gin
                - APIs: REST, GraphQL, gRPC

                Database:
                - PostgreSQL 15 (primary relational database)
                - MongoDB 6.0 (user profiles, metadata)
                - Redis 7.0 (caching, session management)
                - Amazon S3 (documents, backups)

                Infrastructure:
                - AWS (primary cloud provider)
                - Kubernetes 1.25+ (container orchestration)
                - Terraform (Infrastructure as Code)
                - Docker containers with security scanning

                Development Process:
                - Agile methodology with 2-week sprints
                - Git workflow with feature branches
                - CI/CD pipeline using Jenkins/GitHub Actions
                - Code review requirements: 2 approvals minimum

                Security:
                - OAuth 2.0, JWT tokens
                - TLS 1.3 for all communications
                - AES-256 encryption for data at rest
                - Regular security audits and penetration testing

                Performance Targets:
                - API response time: P95 < 200ms
                - Uptime: 99.99%
                - Page load time: < 2 seconds
                """,
                "metadata": {
                    "title": "Engineering Master Document",
                    "department": "Engineering",
                    "type": "Technical Documentation",
                    "content_type": "technical_docs",
                },
            },
            "general_policies": {
                "content": """
                FinSolve Technologies General Company Information:

                Company Mission:
                "To empower financial freedom through secure, scalable, and innovative technology solutions."

                Core Values:
                - Integrity: Act with honesty and transparency
                - Respect: Value diversity and treat everyone with dignity
                - Innovation: Encourage creativity and continuous improvement
                - Customer Focus: Customers at the heart of everything we do
                - Accountability: Take responsibility for actions and results

                General Policies:
                - Code of Conduct: Professional behavior and respect for all
                - Anti-Discrimination: Equal opportunity regardless of background
                - Work from Home: Up to 2 days/week for eligible roles
                - Dress Code: Business casual Monday-Thursday, smart casual Friday

                Employee Services:
                - Employee Assistance Program (EAP)
                - Mental health support and counseling
                - Wellness programs and health check-ups
                - Team outings and social activities

                Communication:
                - Internal communications through official channels
                - Quarterly all-hands meetings
                - Regular newsletter updates
                - Open door policy for feedback

                Training & Development:
                - Mandatory induction training for new hires
                - Technical and soft skills workshops
                - Certification reimbursement up to ₹50,000/year
                - Internal job postings for career growth
                """,
                "metadata": {
                    "title": "General Company Policies",
                    "department": "General",
                    "type": "Policy Document",
                    "content_type": "general_policies",
                },
            },
        }

    def _split_into_chunks(
        self,
        content_data: Dict,
        extra_metadata: Optional[Dict] = None,
    ) -> List[Document]:
        """Split one corpus entry into chunks tagged with their position.

        Args:
            content_data: An ``embedded_content`` entry with ``"content"``
                and ``"metadata"`` keys.
            extra_metadata: Optional additional metadata merged over the
                entry's own metadata (later keys win).

        Returns:
            The chunks produced by ``self.text_splitter``, each carrying
            ``chunk_id`` (0-based index) and ``total_chunks`` in its metadata.
        """
        # Build a fresh dict so the Document never aliases the stored
        # metadata; tagging chunks must not leak back into embedded_content.
        metadata = {**content_data["metadata"], **(extra_metadata or {})}
        doc = Document(page_content=content_data["content"], metadata=metadata)

        chunks = self.text_splitter.split_documents([doc])
        total_chunks = len(chunks)
        for chunk_id, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = chunk_id
            chunk.metadata["total_chunks"] = total_chunks
        return chunks

    def get_documents_for_role(self, role: str) -> List[Document]:
        """Get the chunked documents accessible to a specific role.

        Args:
            role: Role name passed to ``AuthSystem.get_accessible_documents``.

        Returns:
            Chunks of every accessible content type that exists in
            ``embedded_content``, in the order the auth system lists them.
            Each chunk's metadata additionally records ``accessible_to``.
            Content types unknown to this processor are skipped.
        """
        # Imported at call time, as in the original code.
        # NOTE(review): presumably this avoids a circular import with
        # auth_system - confirm before hoisting to module level.
        from auth_system import AuthSystem

        accessible_docs = AuthSystem().get_accessible_documents(role)

        documents: List[Document] = []
        for content_type in accessible_docs:
            content_data = self.embedded_content.get(content_type)
            if content_data is None:
                continue
            documents.extend(
                self._split_into_chunks(content_data, {"accessible_to": role})
            )
        return documents

    def get_all_documents(self) -> List[Document]:
        """Get every embedded document, split into position-tagged chunks."""
        all_documents: List[Document] = []
        for content_data in self.embedded_content.values():
            all_documents.extend(self._split_into_chunks(content_data))
        return all_documents

    def get_document_info(self) -> Dict:
        """Get the metadata of all available documents.

        Returns:
            A mapping of content type to a copy of that document's metadata,
            so callers can modify the result without affecting the corpus.
        """
        return {
            content_type: dict(content_data["metadata"])
            for content_type, content_data in self.embedded_content.items()
        }

    def search_content(
        self, query: str, role: str, max_results: int = 5
    ) -> List[Document]:
        """Search the chunks visible to ``role`` by simple keyword matching.

        A chunk matches when at least one whitespace-separated query term
        occurs (case-insensitively, as a substring) in its text. Results are
        not ranked: the first ``max_results`` matching chunks are returned in
        document order. A query with no terms (empty or whitespace-only)
        matches nothing.

        Args:
            query: Free-text query.
            role: Role used to restrict the searched documents.
            max_results: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``max_results`` matching chunks.
        """
        # Split once; the terms are the same for every chunk.
        terms = query.lower().split()

        relevant_docs = []
        for doc in self.get_documents_for_role(role):
            content_lower = doc.page_content.lower()
            if any(term in content_lower for term in terms):
                relevant_docs.append(doc)
        return relevant_docs[:max_results]