JanviMl committed on
Commit
c8f8b24
·
verified ·
1 Parent(s): ab4032a

Create document_processor.py

Browse files
Files changed (1) hide show
  1. src/document_processor.py +323 -0
src/document_processor.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from typing import List, Dict, Tuple
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.schema import Document
6
+
7
class DocumentProcessor:
    """Process and prepare documents for the RAG system.

    The processor holds a small, hard-coded corpus (``embedded_content``)
    keyed by content type.  Each entry is a dict with two keys:

    * ``"content"``  -- the document text
    * ``"metadata"`` -- a dict with ``title``, ``department``, ``type`` and
      ``content_type``

    Documents are split into chunks with a ``RecursiveCharacterTextSplitter``
    and each chunk is tagged with ``chunk_id`` / ``total_chunks`` metadata.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Create the text splitter and load the embedded demo corpus.

        Args:
            chunk_size: Maximum chunk length, measured with ``len``.
            chunk_overlap: Number of characters shared by adjacent chunks.
        """
        import textwrap

        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

        # Embedded document content for demonstration
        self.embedded_content = {
            "financial_reports": {
                "content": """
                FinSolve Technologies Financial Performance 2024:

                Q4 2024 Financial Highlights:
                - Revenue: $2.6 billion (35% YoY growth)
                - Gross Margin: 64% (improvement from 58% in Q1)
                - Net Income: $325 million (18% YoY increase)
                - Operating Income: $650 million
                - Marketing Spend: $650 million for Q4

                Annual 2024 Summary:
                - Total Revenue: $9.4 billion (28% YoY increase)
                - Marketing Investment: $2.3 billion
                - Vendor Costs: $510 million
                - Cash Flow from Operations: $1.5 billion (14% YoY increase)

                Key Cost Drivers:
                - Vendor Services: $30M (18% increase)
                - Software Subscriptions: $25M (22% increase)
                - Employee Benefits: Increased 10% YoY

                Growth Metrics:
                - Customer Acquisition: 20% increase
                - Market Expansion: Successful entry into Europe and Southeast Asia
                - ROI: Marketing campaigns achieved 4.5x return on investment
                """,
                "metadata": {
                    "title": "Financial Reports 2024",
                    "department": "Finance",
                    "type": "Financial Report",
                    "content_type": "financial_reports"
                }
            },

            "marketing_reports": {
                "content": """
                FinSolve Technologies Marketing Performance 2024:

                Campaign Highlights:
                - New Customer Acquisition: 220,000 (Q4) - exceeded target
                - Digital Campaign ROI: 3.5x return on $5M investment
                - Brand Awareness: 15% growth YoY
                - Customer Engagement: 5% increase

                Geographic Expansion:
                - Europe: Successful market entry in UK, Germany, France
                - Southeast Asia: Strong performance in Indonesia, Thailand, Vietnam
                - Latin America: Expanded into Brazil, Mexico, Colombia

                Q4 2024 Specific Results:
                - Revenue Target: $11 million (achieved)
                - Marketing Spend: $2.5 million
                - Conversion Rate: 15.0% (target met)
                - Customer Retention: 85%

                Key Marketing Channels:
                - Digital Advertising: 40% of budget
                - Influencer Partnerships: Generated 600,000 impressions
                - Email Marketing: 25% open rate, 15% click-through rate
                - Event Marketing: 300 new enterprise leads from events

                Marketing Technology:
                - InstantPay feature launch: 52,000 sign-ups
                - Loyalty program: 50,000 enrolled customers
                - Social media: 25% engagement rate achieved
                """,
                "metadata": {
                    "title": "Marketing Reports 2024",
                    "department": "Marketing",
                    "type": "Marketing Report",
                    "content_type": "marketing_reports"
                }
            },

            "employee_data": {
                "content": """
                FinSolve Technologies Employee Information:

                Company Overview:
                - Founded: 2018
                - Headquarters: Bangalore, India
                - Global Operations: North America, Europe, Asia-Pacific
                - Employees: 2 million+ individual users, 10,000+ business clients served

                Employee Benefits:
                - Health Insurance: Family floater policy
                - Provident Fund: 12% employer & employee contribution
                - Maternity Benefit: 26 weeks paid leave
                - Flexible Work: Remote work and flexible hours available
                - Professional Development: Access to online learning platforms

                Leave Policies:
                - Annual Leave: 15-21 days/year
                - Sick Leave: 12 days/year
                - Casual Leave: 7 days/year
                - Maternity Leave: 26 weeks
                - Paternity Leave: 7-15 days

                Work Hours:
                - Standard: 9 hours/day (including 1 hour break)
                - Flexible timings available for eligible roles
                - Attendance tracking via biometric/HRMS app

                Performance Management:
                - Annual and mid-year reviews
                - Based on KPIs, goals, competencies
                - Regular 1:1 meetings with managers
                - Recognition and rewards program

                Compensation Structure:
                - Basic Salary: 40-50% of CTC
                - HRA: 40-50% of basic salary
                - Annual bonus: Minimum 8.33% of basic salary
                - Performance-based increments
                """,
                "metadata": {
                    "title": "Employee Handbook & HR Data",
                    "department": "HR",
                    "type": "HR Policy",
                    "content_type": "employee_data"
                }
            },

            "technical_docs": {
                "content": """
                FinSolve Technologies Engineering Architecture:

                System Architecture:
                - Microservices-based, cloud-native system
                - Designed for scalability, resilience, and security
                - Modular design supporting rapid feature development

                Technology Stack:
                Frontend:
                - React 18, Redux Toolkit, Tailwind CSS
                - TypeScript, React Query, D3.js
                - Mobile: Swift 5.5 (iOS), Kotlin 1.6 (Android)

                Backend:
                - Node.js 18 LTS, Python 3.11 (FastAPI), Go 1.19
                - Express.js, Pydantic, Gin
                - APIs: REST, GraphQL, gRPC

                Database:
                - PostgreSQL 15 (primary relational database)
                - MongoDB 6.0 (user profiles, metadata)
                - Redis 7.0 (caching, session management)
                - Amazon S3 (documents, backups)

                Infrastructure:
                - AWS (primary cloud provider)
                - Kubernetes 1.25+ (container orchestration)
                - Terraform (Infrastructure as Code)
                - Docker containers with security scanning

                Development Process:
                - Agile methodology with 2-week sprints
                - Git workflow with feature branches
                - CI/CD pipeline using Jenkins/GitHub Actions
                - Code review requirements: 2 approvals minimum

                Security:
                - OAuth 2.0, JWT tokens
                - TLS 1.3 for all communications
                - AES-256 encryption for data at rest
                - Regular security audits and penetration testing

                Performance Targets:
                - API response time: P95 < 200ms
                - Uptime: 99.99%
                - Page load time: < 2 seconds
                """,
                "metadata": {
                    "title": "Engineering Master Document",
                    "department": "Engineering",
                    "type": "Technical Documentation",
                    "content_type": "technical_docs"
                }
            },

            "general_policies": {
                "content": """
                FinSolve Technologies General Company Information:

                Company Mission:
                "To empower financial freedom through secure, scalable, and innovative technology solutions."

                Core Values:
                - Integrity: Act with honesty and transparency
                - Respect: Value diversity and treat everyone with dignity
                - Innovation: Encourage creativity and continuous improvement
                - Customer Focus: Customers at the heart of everything we do
                - Accountability: Take responsibility for actions and results

                General Policies:
                - Code of Conduct: Professional behavior and respect for all
                - Anti-Discrimination: Equal opportunity regardless of background
                - Work from Home: Up to 2 days/week for eligible roles
                - Dress Code: Business casual Monday-Thursday, smart casual Friday

                Employee Services:
                - Employee Assistance Program (EAP)
                - Mental health support and counseling
                - Wellness programs and health check-ups
                - Team outings and social activities

                Communication:
                - Internal communications through official channels
                - Quarterly all-hands meetings
                - Regular newsletter updates
                - Open door policy for feedback

                Training & Development:
                - Mandatory induction training for new hires
                - Technical and soft skills workshops
                - Certification reimbursement up to ₹50,000/year
                - Internal job postings for career growth
                """,
                "metadata": {
                    "title": "General Company Policies",
                    "department": "General",
                    "type": "Policy Document",
                    "content_type": "general_policies"
                }
            }
        }

        # Normalize the triple-quoted literals once: drop any common leading
        # indentation and the surrounding blank lines so source-code layout
        # never ends up inside a chunk (it would eat into chunk_size).
        for entry in self.embedded_content.values():
            entry["content"] = textwrap.dedent(entry["content"]).strip()

    def _chunk_document(self, entry: Dict, extra_metadata: Dict) -> List[Document]:
        """Split one corpus entry into chunks tagged with their position.

        Args:
            entry: A value of ``self.embedded_content`` (``content`` +
                ``metadata`` keys).
            extra_metadata: Additional metadata merged over the entry's own
                metadata; pass ``{}`` for none.

        Returns:
            The chunks produced by ``self.text_splitter``, each carrying
            ``chunk_id`` (0-based index) and ``total_chunks`` in its metadata.
        """
        # Build a fresh dict so nothing done to the document or its chunks
        # can write back into self.embedded_content.
        metadata = {**entry["metadata"], **extra_metadata}
        doc = Document(page_content=entry["content"], metadata=metadata)

        chunks = self.text_splitter.split_documents([doc])
        total = len(chunks)
        for index, chunk in enumerate(chunks):
            chunk.metadata['chunk_id'] = index
            chunk.metadata['total_chunks'] = total
        return chunks

    @staticmethod
    def _rank_matches(documents: List[Document], query: str,
                      max_results: int) -> List[Document]:
        """Return the best keyword matches for *query* among *documents*.

        A document matches when at least one whitespace-separated query term
        occurs (case-insensitively, as a substring) in its ``page_content``.
        Matches are ordered by the number of distinct terms found, highest
        first; ties keep their original relative order.
        """
        # dict.fromkeys de-duplicates while preserving term order.
        terms = list(dict.fromkeys(query.lower().split()))
        if not terms:
            return []

        scored = []
        for doc in documents:
            text = doc.page_content.lower()
            hits = sum(1 for term in terms if term in text)
            if hits:
                scored.append((hits, doc))

        # sorted() is stable, also with reverse=True, so equally scored
        # documents stay in document order.
        scored.sort(key=lambda pair: pair[0], reverse=True)
        return [doc for _, doc in scored[:max_results]]

    def get_documents_for_role(self, role: str) -> List[Document]:
        """Get the chunked documents accessible to a specific role.

        The content types a role may see are obtained from
        ``AuthSystem.get_accessible_documents(role)``; types that are not
        present in ``self.embedded_content`` are skipped.  Every returned
        chunk carries ``accessible_to=role`` in its metadata.
        """
        # Kept function-local as in the original code; presumably this avoids
        # an import cycle with auth_system -- TODO confirm.
        from auth_system import AuthSystem
        auth_system = AuthSystem()

        documents = []
        for content_type in auth_system.get_accessible_documents(role):
            if content_type not in self.embedded_content:
                continue
            entry = self.embedded_content[content_type]
            documents.extend(
                self._chunk_document(entry, {"accessible_to": role})
            )
        return documents

    def get_all_documents(self) -> List[Document]:
        """Get the chunks of every embedded document, regardless of role."""
        all_documents = []
        for entry in self.embedded_content.values():
            all_documents.extend(self._chunk_document(entry, {}))
        return all_documents

    def get_document_info(self) -> Dict:
        """Get the metadata of all available documents, keyed by content type.

        The metadata dicts are shallow copies, so callers may modify the
        result without affecting the processor's corpus.
        """
        return {
            content_type: dict(entry["metadata"])
            for content_type, entry in self.embedded_content.items()
        }

    def search_content(self, query: str, role: str,
                       max_results: int = 5) -> List[Document]:
        """Search the role's accessible chunks with simple keyword matching.

        Args:
            query: Free-text query; split on whitespace into terms.
            role: Role whose accessible documents are searched.
            max_results: Maximum number of chunks to return (default 5).

        Returns:
            Up to ``max_results`` chunks containing at least one query term,
            best match (most distinct terms found) first.  An empty or
            whitespace-only query yields an empty list.
        """
        documents = self.get_documents_for_role(role)
        return self._rank_matches(documents, query, max_results)