Spaces:
Paused
Paused
| import os | |
| import pandas as pd | |
| from typing import List, Dict, Tuple | |
| from langchain.text_splitter import RecursiveCharacterTextSplitter | |
| from langchain.schema import Document | |
class DocumentProcessor:
    """Process and prepare documents for the RAG system.

    The processor keeps a small in-memory demo corpus in ``embedded_content``
    (keyed by content type; each entry has ``"content"`` and ``"metadata"``)
    and turns entries into chunked ``Document`` objects, optionally limited
    to the content types a role may access.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter and load the embedded demo corpus.

        Args:
            chunk_size: Maximum chunk size handed to the text splitter,
                measured with ``len``.
            chunk_overlap: Overlap between adjacent chunks handed to the
                text splitter.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        # Embedded document content for demonstration
        self.embedded_content = {
            "financial_reports": {
                "content": """
                FinSolve Technologies Financial Performance 2024:
                Q4 2024 Financial Highlights:
                - Revenue: $2.6 billion (35% YoY growth)
                - Gross Margin: 64% (improvement from 58% in Q1)
                - Net Income: $325 million (18% YoY increase)
                - Operating Income: $650 million
                - Marketing Spend: $650 million for Q4
                Annual 2024 Summary:
                - Total Revenue: $9.4 billion (28% YoY increase)
                - Marketing Investment: $2.3 billion
                - Vendor Costs: $510 million
                - Cash Flow from Operations: $1.5 billion (14% YoY increase)
                Key Cost Drivers:
                - Vendor Services: $30M (18% increase)
                - Software Subscriptions: $25M (22% increase)
                - Employee Benefits: Increased 10% YoY
                Growth Metrics:
                - Customer Acquisition: 20% increase
                - Market Expansion: Successful entry into Europe and Southeast Asia
                - ROI: Marketing campaigns achieved 4.5x return on investment
                """,
                "metadata": {
                    "title": "Financial Reports 2024",
                    "department": "Finance",
                    "type": "Financial Report",
                    "content_type": "financial_reports",
                },
            },
            "marketing_reports": {
                "content": """
                FinSolve Technologies Marketing Performance 2024:
                Campaign Highlights:
                - New Customer Acquisition: 220,000 (Q4) - exceeded target
                - Digital Campaign ROI: 3.5x return on $5M investment
                - Brand Awareness: 15% growth YoY
                - Customer Engagement: 5% increase
                Geographic Expansion:
                - Europe: Successful market entry in UK, Germany, France
                - Southeast Asia: Strong performance in Indonesia, Thailand, Vietnam
                - Latin America: Expanded into Brazil, Mexico, Colombia
                Q4 2024 Specific Results:
                - Revenue Target: $11 million (achieved)
                - Marketing Spend: $2.5 million
                - Conversion Rate: 15.0% (target met)
                - Customer Retention: 85%
                Key Marketing Channels:
                - Digital Advertising: 40% of budget
                - Influencer Partnerships: Generated 600,000 impressions
                - Email Marketing: 25% open rate, 15% click-through rate
                - Event Marketing: 300 new enterprise leads from events
                Marketing Technology:
                - InstantPay feature launch: 52,000 sign-ups
                - Loyalty program: 50,000 enrolled customers
                - Social media: 25% engagement rate achieved
                """,
                "metadata": {
                    "title": "Marketing Reports 2024",
                    "department": "Marketing",
                    "type": "Marketing Report",
                    "content_type": "marketing_reports",
                },
            },
            # NOTE(review): the "Employees:" line below lists user and client
            # counts, which looks mislabelled - confirm against the handbook.
            "employee_data": {
                "content": """
                FinSolve Technologies Employee Information:
                Company Overview:
                - Founded: 2018
                - Headquarters: Bangalore, India
                - Global Operations: North America, Europe, Asia-Pacific
                - Employees: 2 million+ individual users, 10,000+ business clients served
                Employee Benefits:
                - Health Insurance: Family floater policy
                - Provident Fund: 12% employer & employee contribution
                - Maternity Benefit: 26 weeks paid leave
                - Flexible Work: Remote work and flexible hours available
                - Professional Development: Access to online learning platforms
                Leave Policies:
                - Annual Leave: 15-21 days/year
                - Sick Leave: 12 days/year
                - Casual Leave: 7 days/year
                - Maternity Leave: 26 weeks
                - Paternity Leave: 7-15 days
                Work Hours:
                - Standard: 9 hours/day (including 1 hour break)
                - Flexible timings available for eligible roles
                - Attendance tracking via biometric/HRMS app
                Performance Management:
                - Annual and mid-year reviews
                - Based on KPIs, goals, competencies
                - Regular 1:1 meetings with managers
                - Recognition and rewards program
                Compensation Structure:
                - Basic Salary: 40-50% of CTC
                - HRA: 40-50% of basic salary
                - Annual bonus: Minimum 8.33% of basic salary
                - Performance-based increments
                """,
                "metadata": {
                    "title": "Employee Handbook & HR Data",
                    "department": "HR",
                    "type": "HR Policy",
                    "content_type": "employee_data",
                },
            },
            "technical_docs": {
                "content": """
                FinSolve Technologies Engineering Architecture:
                System Architecture:
                - Microservices-based, cloud-native system
                - Designed for scalability, resilience, and security
                - Modular design supporting rapid feature development
                Technology Stack:
                Frontend:
                - React 18, Redux Toolkit, Tailwind CSS
                - TypeScript, React Query, D3.js
                - Mobile: Swift 5.5 (iOS), Kotlin 1.6 (Android)
                Backend:
                - Node.js 18 LTS, Python 3.11 (FastAPI), Go 1.19
                - Express.js, Pydantic, Gin
                - APIs: REST, GraphQL, gRPC
                Database:
                - PostgreSQL 15 (primary relational database)
                - MongoDB 6.0 (user profiles, metadata)
                - Redis 7.0 (caching, session management)
                - Amazon S3 (documents, backups)
                Infrastructure:
                - AWS (primary cloud provider)
                - Kubernetes 1.25+ (container orchestration)
                - Terraform (Infrastructure as Code)
                - Docker containers with security scanning
                Development Process:
                - Agile methodology with 2-week sprints
                - Git workflow with feature branches
                - CI/CD pipeline using Jenkins/GitHub Actions
                - Code review requirements: 2 approvals minimum
                Security:
                - OAuth 2.0, JWT tokens
                - TLS 1.3 for all communications
                - AES-256 encryption for data at rest
                - Regular security audits and penetration testing
                Performance Targets:
                - API response time: P95 < 200ms
                - Uptime: 99.99%
                - Page load time: < 2 seconds
                """,
                "metadata": {
                    "title": "Engineering Master Document",
                    "department": "Engineering",
                    "type": "Technical Documentation",
                    "content_type": "technical_docs",
                },
            },
            "general_policies": {
                "content": """
                FinSolve Technologies General Company Information:
                Company Mission:
                "To empower financial freedom through secure, scalable, and innovative technology solutions."
                Core Values:
                - Integrity: Act with honesty and transparency
                - Respect: Value diversity and treat everyone with dignity
                - Innovation: Encourage creativity and continuous improvement
                - Customer Focus: Customers at the heart of everything we do
                - Accountability: Take responsibility for actions and results
                General Policies:
                - Code of Conduct: Professional behavior and respect for all
                - Anti-Discrimination: Equal opportunity regardless of background
                - Work from Home: Up to 2 days/week for eligible roles
                - Dress Code: Business casual Monday-Thursday, smart casual Friday
                Employee Services:
                - Employee Assistance Program (EAP)
                - Mental health support and counseling
                - Wellness programs and health check-ups
                - Team outings and social activities
                Communication:
                - Internal communications through official channels
                - Quarterly all-hands meetings
                - Regular newsletter updates
                - Open door policy for feedback
                Training & Development:
                - Mandatory induction training for new hires
                - Technical and soft skills workshops
                - Certification reimbursement up to ₹50,000/year
                - Internal job postings for career growth
                """,
                "metadata": {
                    "title": "General Company Policies",
                    "department": "General",
                    "type": "Policy Document",
                    "content_type": "general_policies",
                },
            },
        }

    def _split_entry(self, content_data: Dict, **extra_metadata) -> List["Document"]:
        """Chunk one corpus entry and tag every chunk with its position.

        Args:
            content_data: One value of ``embedded_content`` (needs the
                ``"content"`` and ``"metadata"`` keys).
            **extra_metadata: Extra metadata fields merged over the stored
                metadata before splitting.

        Returns:
            The chunks produced by the text splitter, each carrying
            ``chunk_id`` and ``total_chunks`` in its metadata.
        """
        doc = Document(
            page_content=content_data["content"],
            # Always build a fresh dict so the stored metadata is never
            # handed out by reference.
            metadata={**content_data["metadata"], **extra_metadata},
        )
        chunks = self.text_splitter.split_documents([doc])
        total = len(chunks)
        for index, chunk in enumerate(chunks):
            chunk.metadata["chunk_id"] = index
            chunk.metadata["total_chunks"] = total
        return chunks

    def get_documents_for_role(self, role: str) -> List["Document"]:
        """Get the chunked documents accessible to a specific role.

        Args:
            role: Role name passed to ``AuthSystem.get_accessible_documents``.

        Returns:
            Chunks of every accessible content type present in
            ``embedded_content``; each chunk's metadata also records
            ``accessible_to`` with the requested role.
        """
        # Function-scope import kept from the original (presumably to avoid
        # an import cycle with auth_system - confirm).
        from auth_system import AuthSystem

        accessible_docs = AuthSystem().get_accessible_documents(role)
        documents = []
        for content_type in accessible_docs:
            content_data = self.embedded_content.get(content_type)
            if content_data is None:
                # The role may list content types this corpus does not hold.
                continue
            documents.extend(self._split_entry(content_data, accessible_to=role))
        return documents

    def get_all_documents(self) -> List["Document"]:
        """Get the chunked documents for every entry in the corpus."""
        all_documents = []
        for content_data in self.embedded_content.values():
            all_documents.extend(self._split_entry(content_data))
        return all_documents

    def get_document_info(self) -> Dict:
        """Get the metadata of all available documents.

        Returns:
            A mapping of content type to a copy of that entry's metadata, so
            callers can modify the result without altering the corpus.
        """
        return {
            content_type: dict(content_data["metadata"])
            for content_type, content_data in self.embedded_content.items()
        }

    def search_content(self, query: str, role: str, top_k: int = 5) -> List["Document"]:
        """Search the role's documents with simple keyword matching.

        A chunk matches when at least one whitespace-separated query term
        occurs in it as a case-insensitive substring. Matches are ranked by
        the number of distinct query terms they contain; ties keep document
        order.

        Args:
            query: Free-text query; an empty or blank query matches nothing.
            role: Role whose accessible documents are searched.
            top_k: Maximum number of chunks to return (default 5); values
                below 1 return an empty list.

        Returns:
            Up to ``top_k`` matching chunks, best match first.
        """
        documents = self.get_documents_for_role(role)
        # De-duplicate terms so a repeated word does not inflate the score.
        terms = list(dict.fromkeys(query.lower().split()))
        scored = []
        for doc in documents:
            content_lower = doc.page_content.lower()
            hits = sum(term in content_lower for term in terms)
            if hits:
                scored.append((hits, doc))
        # list.sort is stable, also with reverse=True, so equally scored
        # chunks stay in their original order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        # Clamp so a negative top_k cannot turn into a "drop the tail" slice.
        return [doc for _, doc in scored[:max(top_k, 0)]]