# RAGFintech / src/document_processor.py
# Last commit: "Create document_processor.py" (c8f8b24, verified), author: JanviMl
import os
import pandas as pd
from typing import List, Dict, Tuple
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
class DocumentProcessor:
    """Process and prepare documents for the RAG system.

    The corpus is a small set of demonstration documents stored on the
    instance in ``self.embedded_content``. It maps a content-type key (for
    example ``"financial_reports"``) to a dict with two entries:

    * ``"content"``  - the raw document text.
    * ``"metadata"`` - descriptive fields (``title``, ``department``,
      ``type``, ``content_type``).

    Documents are split into chunks with ``self.text_splitter`` before they
    are returned; every chunk carries ``chunk_id`` and ``total_chunks`` in
    its metadata.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter and load the embedded demo corpus.

        Args:
            chunk_size: Maximum chunk length passed to the splitter, measured
                with ``len``. Defaults to 1000.
            chunk_overlap: Overlap between consecutive chunks passed to the
                splitter. Defaults to 200.
        """
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )
        # Embedded document content for demonstration.
        # The text lines and the closing quotes of every "content" value sit
        # at column 0 on purpose: everything between the triple quotes,
        # leading whitespace included, is part of the string that gets
        # chunked, so re-indenting those lines would change the data.
        self.embedded_content = {
            "financial_reports": {
                "content": """
FinSolve Technologies Financial Performance 2024:
Q4 2024 Financial Highlights:
- Revenue: $2.6 billion (35% YoY growth)
- Gross Margin: 64% (improvement from 58% in Q1)
- Net Income: $325 million (18% YoY increase)
- Operating Income: $650 million
- Marketing Spend: $650 million for Q4
Annual 2024 Summary:
- Total Revenue: $9.4 billion (28% YoY increase)
- Marketing Investment: $2.3 billion
- Vendor Costs: $510 million
- Cash Flow from Operations: $1.5 billion (14% YoY increase)
Key Cost Drivers:
- Vendor Services: $30M (18% increase)
- Software Subscriptions: $25M (22% increase)
- Employee Benefits: Increased 10% YoY
Growth Metrics:
- Customer Acquisition: 20% increase
- Market Expansion: Successful entry into Europe and Southeast Asia
- ROI: Marketing campaigns achieved 4.5x return on investment
""",
                "metadata": {
                    "title": "Financial Reports 2024",
                    "department": "Finance",
                    "type": "Financial Report",
                    "content_type": "financial_reports",
                },
            },
            "marketing_reports": {
                "content": """
FinSolve Technologies Marketing Performance 2024:
Campaign Highlights:
- New Customer Acquisition: 220,000 (Q4) - exceeded target
- Digital Campaign ROI: 3.5x return on $5M investment
- Brand Awareness: 15% growth YoY
- Customer Engagement: 5% increase
Geographic Expansion:
- Europe: Successful market entry in UK, Germany, France
- Southeast Asia: Strong performance in Indonesia, Thailand, Vietnam
- Latin America: Expanded into Brazil, Mexico, Colombia
Q4 2024 Specific Results:
- Revenue Target: $11 million (achieved)
- Marketing Spend: $2.5 million
- Conversion Rate: 15.0% (target met)
- Customer Retention: 85%
Key Marketing Channels:
- Digital Advertising: 40% of budget
- Influencer Partnerships: Generated 600,000 impressions
- Email Marketing: 25% open rate, 15% click-through rate
- Event Marketing: 300 new enterprise leads from events
Marketing Technology:
- InstantPay feature launch: 52,000 sign-ups
- Loyalty program: 50,000 enrolled customers
- Social media: 25% engagement rate achieved
""",
                "metadata": {
                    "title": "Marketing Reports 2024",
                    "department": "Marketing",
                    "type": "Marketing Report",
                    "content_type": "marketing_reports",
                },
            },
            # NOTE(review): the "Employees: 2 million+ individual users, ..."
            # line below appears to describe customers rather than staff.
            # The text is kept verbatim; confirm against the source handbook
            # before relying on answers generated from it.
            "employee_data": {
                "content": """
FinSolve Technologies Employee Information:
Company Overview:
- Founded: 2018
- Headquarters: Bangalore, India
- Global Operations: North America, Europe, Asia-Pacific
- Employees: 2 million+ individual users, 10,000+ business clients served
Employee Benefits:
- Health Insurance: Family floater policy
- Provident Fund: 12% employer & employee contribution
- Maternity Benefit: 26 weeks paid leave
- Flexible Work: Remote work and flexible hours available
- Professional Development: Access to online learning platforms
Leave Policies:
- Annual Leave: 15-21 days/year
- Sick Leave: 12 days/year
- Casual Leave: 7 days/year
- Maternity Leave: 26 weeks
- Paternity Leave: 7-15 days
Work Hours:
- Standard: 9 hours/day (including 1 hour break)
- Flexible timings available for eligible roles
- Attendance tracking via biometric/HRMS app
Performance Management:
- Annual and mid-year reviews
- Based on KPIs, goals, competencies
- Regular 1:1 meetings with managers
- Recognition and rewards program
Compensation Structure:
- Basic Salary: 40-50% of CTC
- HRA: 40-50% of basic salary
- Annual bonus: Minimum 8.33% of basic salary
- Performance-based increments
""",
                "metadata": {
                    "title": "Employee Handbook & HR Data",
                    "department": "HR",
                    "type": "HR Policy",
                    "content_type": "employee_data",
                },
            },
            "technical_docs": {
                "content": """
FinSolve Technologies Engineering Architecture:
System Architecture:
- Microservices-based, cloud-native system
- Designed for scalability, resilience, and security
- Modular design supporting rapid feature development
Technology Stack:
Frontend:
- React 18, Redux Toolkit, Tailwind CSS
- TypeScript, React Query, D3.js
- Mobile: Swift 5.5 (iOS), Kotlin 1.6 (Android)
Backend:
- Node.js 18 LTS, Python 3.11 (FastAPI), Go 1.19
- Express.js, Pydantic, Gin
- APIs: REST, GraphQL, gRPC
Database:
- PostgreSQL 15 (primary relational database)
- MongoDB 6.0 (user profiles, metadata)
- Redis 7.0 (caching, session management)
- Amazon S3 (documents, backups)
Infrastructure:
- AWS (primary cloud provider)
- Kubernetes 1.25+ (container orchestration)
- Terraform (Infrastructure as Code)
- Docker containers with security scanning
Development Process:
- Agile methodology with 2-week sprints
- Git workflow with feature branches
- CI/CD pipeline using Jenkins/GitHub Actions
- Code review requirements: 2 approvals minimum
Security:
- OAuth 2.0, JWT tokens
- TLS 1.3 for all communications
- AES-256 encryption for data at rest
- Regular security audits and penetration testing
Performance Targets:
- API response time: P95 < 200ms
- Uptime: 99.99%
- Page load time: < 2 seconds
""",
                "metadata": {
                    "title": "Engineering Master Document",
                    "department": "Engineering",
                    "type": "Technical Documentation",
                    "content_type": "technical_docs",
                },
            },
            "general_policies": {
                "content": """
FinSolve Technologies General Company Information:
Company Mission:
"To empower financial freedom through secure, scalable, and innovative technology solutions."
Core Values:
- Integrity: Act with honesty and transparency
- Respect: Value diversity and treat everyone with dignity
- Innovation: Encourage creativity and continuous improvement
- Customer Focus: Customers at the heart of everything we do
- Accountability: Take responsibility for actions and results
General Policies:
- Code of Conduct: Professional behavior and respect for all
- Anti-Discrimination: Equal opportunity regardless of background
- Work from Home: Up to 2 days/week for eligible roles
- Dress Code: Business casual Monday-Thursday, smart casual Friday
Employee Services:
- Employee Assistance Program (EAP)
- Mental health support and counseling
- Wellness programs and health check-ups
- Team outings and social activities
Communication:
- Internal communications through official channels
- Quarterly all-hands meetings
- Regular newsletter updates
- Open door policy for feedback
Training & Development:
- Mandatory induction training for new hires
- Technical and soft skills workshops
- Certification reimbursement up to ₹50,000/year
- Internal job postings for career growth
""",
                "metadata": {
                    "title": "General Company Policies",
                    "department": "General",
                    "type": "Policy Document",
                    "content_type": "general_policies",
                },
            },
        }

    def _chunk_document(self, content: str, metadata: Dict) -> List[Document]:
        """Split one document into chunks and number them.

        Args:
            content: Full text of the document.
            metadata: Metadata attached to the document before splitting.
                Callers pass a fresh dict so the embedded corpus is never
                shared with the returned chunks.

        Returns:
            The chunks produced by ``self.text_splitter``, each with
            ``chunk_id`` (0-based position) and ``total_chunks`` added to
            its metadata.
        """
        source_doc = Document(page_content=content, metadata=metadata)
        chunks = self.text_splitter.split_documents([source_doc])
        chunk_count = len(chunks)
        for position, chunk in enumerate(chunks):
            chunk.metadata['chunk_id'] = position
            chunk.metadata['total_chunks'] = chunk_count
        return chunks

    def get_documents_for_role(self, role: str) -> List[Document]:
        """Get the chunked documents accessible to a specific role.

        The list of content types a role may see comes from
        ``AuthSystem.get_accessible_documents(role)``. Content types that are
        not present in ``self.embedded_content`` are skipped.

        Args:
            role: Role name forwarded to the auth system.

        Returns:
            Chunks of every accessible document, in the order the auth system
            lists the content types. Each chunk's metadata holds the
            document metadata plus ``accessible_to`` (the requested role),
            ``chunk_id`` and ``total_chunks``.
        """
        # Imported at call time, as in the original code; presumably this
        # avoids a circular import with auth_system - confirm before moving
        # it to module level.
        from auth_system import AuthSystem

        accessible_types = AuthSystem().get_accessible_documents(role)
        documents = []
        for content_type in accessible_types:
            content_data = self.embedded_content.get(content_type)
            if content_data is None:
                continue
            documents.extend(
                self._chunk_document(
                    content_data["content"],
                    {**content_data["metadata"], "accessible_to": role},
                )
            )
        return documents

    def get_all_documents(self) -> List[Document]:
        """Get the chunks of every embedded document, ignoring roles.

        Returns:
            Chunks of all documents in ``self.embedded_content`` insertion
            order, each annotated with ``chunk_id`` and ``total_chunks``.
        """
        all_documents = []
        for content_data in self.embedded_content.values():
            # dict(...) copies the metadata so the Document never aliases
            # the dict stored in self.embedded_content.
            all_documents.extend(
                self._chunk_document(
                    content_data["content"],
                    dict(content_data["metadata"]),
                )
            )
        return all_documents

    def get_document_info(self) -> Dict:
        """Get the metadata of all available documents.

        Returns:
            A dict mapping each content-type key to a copy of that
            document's metadata, so callers can modify the result without
            altering the embedded corpus.
        """
        return {
            content_type: dict(content_data["metadata"])
            for content_type, content_data in self.embedded_content.items()
        }

    @staticmethod
    def _rank_by_keyword_hits(documents: List[Document], query: str,
                              max_results: int) -> List[Document]:
        """Rank documents by how many distinct query terms they contain.

        Matching is a case-insensitive substring test of each
        whitespace-separated query term against ``page_content``.

        Args:
            documents: Candidate objects exposing ``page_content``.
            query: Free-text query.
            max_results: Maximum number of documents to return; values of
                zero or less yield an empty list.

        Returns:
            Documents matching at least one term, best match first. Documents
            with the same number of matched terms keep their input order.
        """
        terms = set(query.lower().split())
        if not terms or max_results <= 0:
            return []

        scored = []
        for doc in documents:
            text = doc.page_content.lower()
            hits = sum(1 for term in terms if term in text)
            if hits:
                scored.append((hits, doc))

        # list.sort is stable, also with reverse=True, so ties stay in
        # corpus order and single-term queries behave exactly like a plain
        # "first N matches" filter.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in scored[:max_results]]

    def search_content(self, query: str, role: str,
                       max_results: int = 5) -> List[Document]:
        """Search the documents a role may access with keyword matching.

        This is a simple demonstration search: a chunk matches when any
        whitespace-separated query term occurs in it (case-insensitive
        substring test). Matches are ordered by the number of distinct terms
        they contain.

        Args:
            query: Free-text query. An empty or whitespace-only query
                matches nothing.
            role: Role whose accessible documents are searched.
            max_results: Maximum number of chunks to return. Defaults to 5.

        Returns:
            Up to ``max_results`` matching chunks, best match first.
        """
        documents = self.get_documents_for_role(role)
        return self._rank_by_keyword_hits(documents, query, max_results)