cryogenic22 committed on
Commit
46945cc
·
verified ·
1 Parent(s): e74efa9

Update src/ai/extractors/entity_extractor.py

Browse files
Files changed (1) hide show
  1. src/ai/extractors/entity_extractor.py +627 -11
src/ai/extractors/entity_extractor.py CHANGED
@@ -1,25 +1,641 @@
1
- # src/ai/extractors/entity_extractor.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from langchain.chains import create_extraction_chain
3
- from typing import Dict, Any
 
 
 
 
4
 
5
  class EntityExtractor:
6
- """Extracts entities and relationships from text"""
 
 
 
7
 
8
  def __init__(self, llm_service):
 
 
 
 
 
 
9
  self.llm_service = llm_service
10
  self.setup_extraction_chain()
11
 
12
  def setup_extraction_chain(self):
13
- """Setup the extraction chain with schema"""
 
 
 
14
  self.entity_schema = {
15
- # Schema definition here
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  }
17
 
18
- self.extraction_chain = create_extraction_chain(
19
- self.entity_schema,
20
- self.llm_service.extraction_model
21
- )
 
 
 
 
 
22
 
23
  async def extract_entities(self, text: str) -> Dict[str, Any]:
24
- """Extract entities from text"""
25
- return await self.extraction_chain.arun(text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Advanced Entity Extractor Module
3
+ Handles hyper-contextual entity and relationship extraction from meeting transcripts
4
+ with multi-dimensional relationship mapping, temporal context tracking, and
5
+ deep domain-specific pattern recognition.
6
+
7
+ Key Features:
8
+ - Multi-level relationship mapping
9
+ - Temporal context preservation
10
+ - Cross-meeting context linking
11
+ - Emotional intelligence integration
12
+ - Strategic intent analysis
13
+ - Influence network mapping
14
+ - Business pattern recognition
15
+ """
16
  from langchain.chains import create_extraction_chain
17
+ from typing import Dict, Any, List, Optional
18
+ from datetime import datetime
19
+ import logging
20
+
21
+ logger = logging.getLogger(__name__)
22
 
23
  class EntityExtractor:
24
+ """
25
+ Extracts entities, relationships, and business context from meeting transcripts.
26
+ Provides detailed analysis of participants, actions, decisions, and opportunities.
27
+ """
28
 
29
  def __init__(self, llm_service):
30
+ """
31
+ Initialize entity extractor with LLM service.
32
+
33
+ Args:
34
+ llm_service: Language model service for extraction
35
+ """
36
  self.llm_service = llm_service
37
  self.setup_extraction_chain()
38
 
39
  def setup_extraction_chain(self):
40
+ """
41
+ Setup the extraction chain with comprehensive schema for business context.
42
+ Defines extractable entities, relationships, and metadata.
43
+ """
44
  self.entity_schema = {
45
+ "properties": {
46
+ "extraction_metadata": {
47
+ "type": "object",
48
+ "properties": {
49
+ "timestamp": {"type": "string"},
50
+ "version": {"type": "string"},
51
+ "confidence_scores": {
52
+ "type": "object",
53
+ "properties": {
54
+ "entity_extraction": {"type": "number"},
55
+ "relationship_mapping": {"type": "number"},
56
+ "intent_detection": {"type": "number"},
57
+ "sentiment_analysis": {"type": "number"}
58
+ }
59
+ },
60
+ "context_sources": {
61
+ "type": "array",
62
+ "items": {
63
+ "type": "object",
64
+ "properties": {
65
+ "source_type": {"type": "string"},
66
+ "reference_id": {"type": "string"},
67
+ "relevance_score": {"type": "number"}
68
+ }
69
+ }
70
+ }
71
+ }
72
+ },
73
+
74
+ "historical_context": {
75
+ "type": "object",
76
+ "properties": {
77
+ "related_meetings": {
78
+ "type": "array",
79
+ "items": {
80
+ "type": "object",
81
+ "properties": {
82
+ "meeting_id": {"type": "string"},
83
+ "date": {"type": "string"},
84
+ "relevance_score": {"type": "number"},
85
+ "key_outcomes": {"type": "array", "items": {"type": "string"}},
86
+ "context_carryover": {
87
+ "type": "array",
88
+ "items": {
89
+ "type": "object",
90
+ "properties": {
91
+ "topic": {"type": "string"},
92
+ "status": {"type": "string"},
93
+ "evolution": {"type": "string"}
94
+ }
95
+ }
96
+ }
97
+ }
98
+ }
99
+ },
100
+ "relationship_history": {
101
+ "type": "array",
102
+ "items": {
103
+ "type": "object",
104
+ "properties": {
105
+ "entities": {
106
+ "type": "array",
107
+ "items": {"type": "string"}
108
+ },
109
+ "interaction_pattern": {"type": "string"},
110
+ "temporal_changes": {
111
+ "type": "array",
112
+ "items": {
113
+ "type": "object",
114
+ "properties": {
115
+ "timestamp": {"type": "string"},
116
+ "change_type": {"type": "string"},
117
+ "description": {"type": "string"}
118
+ }
119
+ }
120
+ }
121
+ }
122
+ }
123
+ }
124
+ }
125
+ },
126
+
127
+ "meeting_context": {
128
+ "type": "object",
129
+ "properties": {
130
+ "meeting_id": {"type": "string"},
131
+ "date_time": {"type": "string"},
132
+ "duration": {"type": "string"},
133
+ "type": {
134
+ "type": "string",
135
+ "enum": [
136
+ "sales_opportunity",
137
+ "project_review",
138
+ "status_update",
139
+ "contract_negotiation",
140
+ "strategic_planning",
141
+ "technical_discussion"
142
+ ]
143
+ },
144
+ "format": {
145
+ "type": "string",
146
+ "enum": ["in_person", "virtual", "hybrid"]
147
+ },
148
+ "location": {"type": "string"},
149
+ "objectives": {"type": "array", "items": {"type": "string"}},
150
+ "previous_meeting_reference": {"type": "string"}
151
+ }
152
+ },
153
+
154
+ "participants": {
155
+ "type": "array",
156
+ "items": {
157
+ "type": "object",
158
+ "properties": {
159
+ "name": {"type": "string"},
160
+ "role": {"type": "string"},
161
+ "organization": {"type": "string"},
162
+ "department": {"type": "string"},
163
+ "title": {"type": "string"},
164
+ "attendance": {
165
+ "type": "string",
166
+ "enum": ["full", "partial", "absent"]
167
+ },
168
+ "engagement_level": {
169
+ "type": "string",
170
+ "enum": ["high", "medium", "low"]
171
+ },
172
+ "key_contributions": {
173
+ "type": "array",
174
+ "items": {"type": "string"}
175
+ },
176
+ "follow_up_required": {"type": "boolean"}
177
+ },
178
+ "required": ["name"]
179
+ }
180
+ },
181
+
182
+ "business_context": {
183
+ "type": "object",
184
+ "properties": {
185
+ "market_dynamics": {
186
+ "type": "object",
187
+ "properties": {
188
+ "industry_trends": {
189
+ "type": "array",
190
+ "items": {
191
+ "type": "object",
192
+ "properties": {
193
+ "trend": {"type": "string"},
194
+ "impact": {"type": "string"},
195
+ "relevance": {"type": "string"}
196
+ }
197
+ }
198
+ },
199
+ "competitive_landscape": {
200
+ "type": "object",
201
+ "properties": {
202
+ "direct_competitors": {
203
+ "type": "array",
204
+ "items": {
205
+ "type": "object",
206
+ "properties": {
207
+ "name": {"type": "string"},
208
+ "strengths": {"type": "array", "items": {"type": "string"}},
209
+ "weaknesses": {"type": "array", "items": {"type": "string"}},
210
+ "threat_level": {"type": "string"}
211
+ }
212
+ }
213
+ },
214
+ "market_position": {
215
+ "type": "object",
216
+ "properties": {
217
+ "current_position": {"type": "string"},
218
+ "desired_position": {"type": "string"},
219
+ "gaps": {"type": "array", "items": {"type": "string"}}
220
+ }
221
+ }
222
+ }
223
+ }
224
+ }
225
+ },
226
+ "strategic_alignment": {
227
+ "type": "object",
228
+ "properties": {
229
+ "company_objectives": {
230
+ "type": "array",
231
+ "items": {
232
+ "type": "object",
233
+ "properties": {
234
+ "objective": {"type": "string"},
235
+ "alignment_level": {"type": "string"},
236
+ "contribution": {"type": "string"}
237
+ }
238
+ }
239
+ },
240
+ "value_proposition": {
241
+ "type": "object",
242
+ "properties": {
243
+ "key_value_drivers": {"type": "array", "items": {"type": "string"}},
244
+ "differentiators": {"type": "array", "items": {"type": "string"}},
245
+ "client_benefits": {"type": "array", "items": {"type": "string"}}
246
+ }
247
+ }
248
+ }
249
+ },
250
+ "account_status": {
251
+ "type": "string",
252
+ "enum": [
253
+ "active_client",
254
+ "prospect",
255
+ "partner",
256
+ "competitor",
257
+ "other"
258
+ ]
259
+ },
260
+ "project_phase": {"type": "string"},
261
+ "business_units_involved": {
262
+ "type": "array",
263
+ "items": {"type": "string"}
264
+ },
265
+ "current_contracts": {
266
+ "type": "array",
267
+ "items": {
268
+ "type": "object",
269
+ "properties": {
270
+ "name": {"type": "string"},
271
+ "status": {"type": "string"},
272
+ "value": {"type": "string"}
273
+ }
274
+ }
275
+ },
276
+ "strategic_importance": {
277
+ "type": "string",
278
+ "enum": ["high", "medium", "low"]
279
+ }
280
+ }
281
+ },
282
+
283
+ "discussion_topics": {
284
+ "type": "array",
285
+ "items": {
286
+ "type": "object",
287
+ "properties": {
288
+ "topic": {"type": "string"},
289
+ "duration": {"type": "string"},
290
+ "priority": {
291
+ "type": "string",
292
+ "enum": ["high", "medium", "low"]
293
+ },
294
+ "key_points": {
295
+ "type": "array",
296
+ "items": {"type": "string"}
297
+ },
298
+ "stakeholders_involved": {
299
+ "type": "array",
300
+ "items": {"type": "string"}
301
+ },
302
+ "outcomes": {
303
+ "type": "array",
304
+ "items": {"type": "string"}
305
+ },
306
+ "sentiment": {
307
+ "type": "string",
308
+ "enum": ["positive", "neutral", "negative", "mixed"]
309
+ }
310
+ }
311
+ }
312
+ },
313
+
314
+ "action_items": {
315
+ "type": "array",
316
+ "items": {
317
+ "type": "object",
318
+ "properties": {
319
+ "description": {"type": "string"},
320
+ "type": {
321
+ "type": "string",
322
+ "enum": [
323
+ "task",
324
+ "decision_needed",
325
+ "information_request",
326
+ "approval_required",
327
+ "follow_up"
328
+ ]
329
+ },
330
+ "owner": {"type": "string"},
331
+ "assignees": {
332
+ "type": "array",
333
+ "items": {"type": "string"}
334
+ },
335
+ "due_date": {"type": "string"},
336
+ "priority": {
337
+ "type": "string",
338
+ "enum": ["high", "medium", "low"]
339
+ },
340
+ "status": {
341
+ "type": "string",
342
+ "enum": [
343
+ "not_started",
344
+ "in_progress",
345
+ "blocked",
346
+ "completed"
347
+ ]
348
+ },
349
+ "dependencies": {
350
+ "type": "array",
351
+ "items": {"type": "string"}
352
+ },
353
+ "related_topic": {"type": "string"},
354
+ "notes": {"type": "string"}
355
+ }
356
+ }
357
+ },
358
+
359
+ "decisions": {
360
+ "type": "array",
361
+ "items": {
362
+ "type": "object",
363
+ "properties": {
364
+ "topic": {"type": "string"},
365
+ "decision": {"type": "string"},
366
+ "rationale": {"type": "string"},
367
+ "impact_level": {
368
+ "type": "string",
369
+ "enum": ["high", "medium", "low"]
370
+ },
371
+ "decision_makers": {
372
+ "type": "array",
373
+ "items": {"type": "string"}
374
+ },
375
+ "stakeholders_affected": {
376
+ "type": "array",
377
+ "items": {"type": "string"}
378
+ },
379
+ "implementation_timeline": {"type": "string"},
380
+ "dependencies": {
381
+ "type": "array",
382
+ "items": {"type": "string"}
383
+ },
384
+ "risks_identified": {
385
+ "type": "array",
386
+ "items": {"type": "string"}
387
+ }
388
+ }
389
+ }
390
+ },
391
+
392
+ "opportunities": {
393
+ "type": "array",
394
+ "items": {
395
+ "type": "object",
396
+ "properties": {
397
+ "name": {"type": "string"},
398
+ "type": {
399
+ "type": "string",
400
+ "enum": [
401
+ "new_business",
402
+ "expansion",
403
+ "renewal",
404
+ "upsell",
405
+ "cross_sell"
406
+ ]
407
+ },
408
+ "estimated_value": {"type": "string"},
409
+ "probability": {"type": "string"},
410
+ "timeline": {"type": "string"},
411
+ "key_stakeholders": {
412
+ "type": "array",
413
+ "items": {"type": "string"}
414
+ },
415
+ "decision_makers": {
416
+ "type": "array",
417
+ "items": {"type": "string"}
418
+ },
419
+ "next_steps": {
420
+ "type": "array",
421
+ "items": {"type": "string"}
422
+ },
423
+ "competitive_situation": {
424
+ "type": "object",
425
+ "properties": {
426
+ "competitors": {
427
+ "type": "array",
428
+ "items": {"type": "string"}
429
+ },
430
+ "our_strengths": {
431
+ "type": "array",
432
+ "items": {"type": "string"}
433
+ },
434
+ "our_weaknesses": {
435
+ "type": "array",
436
+ "items": {"type": "string"}
437
+ }
438
+ }
439
+ }
440
+ }
441
+ }
442
+ },
443
+
444
+ "risks_and_issues": {
445
+ "type": "array",
446
+ "items": {
447
+ "type": "object",
448
+ "properties": {
449
+ "description": {"type": "string"},
450
+ "type": {
451
+ "type": "string",
452
+ "enum": [
453
+ "technical",
454
+ "commercial",
455
+ "operational",
456
+ "strategic",
457
+ "compliance"
458
+ ]
459
+ },
460
+ "severity": {
461
+ "type": "string",
462
+ "enum": ["high", "medium", "low"]
463
+ },
464
+ "probability": {
465
+ "type": "string",
466
+ "enum": ["high", "medium", "low"]
467
+ },
468
+ "raised_by": {"type": "string"},
469
+ "impact_areas": {
470
+ "type": "array",
471
+ "items": {"type": "string"}
472
+ },
473
+ "mitigation_plan": {"type": "string"},
474
+ "owner": {"type": "string"},
475
+ "status": {
476
+ "type": "string",
477
+ "enum": [
478
+ "identified",
479
+ "being_mitigated",
480
+ "mitigated",
481
+ "accepted"
482
+ ]
483
+ }
484
+ }
485
+ }
486
+ },
487
+
488
+ "interaction_dynamics": {
489
+ "type": "object",
490
+ "properties": {
491
+ "communication_patterns": {
492
+ "type": "array",
493
+ "items": {
494
+ "type": "object",
495
+ "properties": {
496
+ "participants": {"type": "array", "items": {"type": "string"}},
497
+ "pattern_type": {"type": "string"},
498
+ "frequency": {"type": "string"},
499
+ "effectiveness": {"type": "string"},
500
+ "power_dynamics": {
501
+ "type": "array",
502
+ "items": {
503
+ "type": "object",
504
+ "properties": {
505
+ "actor": {"type": "string"},
506
+ "influence_type": {"type": "string"},
507
+ "impact": {"type": "string"}
508
+ }
509
+ }
510
+ }
511
+ }
512
+ }
513
+ },
514
+ "emotional_intelligence": {
515
+ "type": "object",
516
+ "properties": {
517
+ "group_dynamics": {
518
+ "type": "array",
519
+ "items": {
520
+ "type": "object",
521
+ "properties": {
522
+ "dynamic_type": {"type": "string"},
523
+ "intensity": {"type": "string"},
524
+ "impact": {"type": "string"}
525
+ }
526
+ }
527
+ },
528
+ "emotional_triggers": {
529
+ "type": "array",
530
+ "items": {
531
+ "type": "object",
532
+ "properties": {
533
+ "trigger": {"type": "string"},
534
+ "response": {"type": "string"},
535
+ "participants_affected": {"type": "array", "items": {"type": "string"}}
536
+ }
537
+ }
538
+ }
539
+ }
540
+ }
541
+ }
542
+ },
543
+
544
+ "relationship_insights": {
545
+ "type": "array",
546
+ "items": {
547
+ "type": "object",
548
+ "properties": {
549
+ "stakeholder": {"type": "string"},
550
+ "influence_level": {
551
+ "type": "string",
552
+ "enum": ["high", "medium", "low"]
553
+ },
554
+ "sentiment": {
555
+ "type": "string",
556
+ "enum": ["positive", "neutral", "negative", "mixed"]
557
+ },
558
+ "key_interests": {
559
+ "type": "array",
560
+ "items": {"type": "string"}
561
+ },
562
+ "concerns": {
563
+ "type": "array",
564
+ "items": {"type": "string"}
565
+ },
566
+ "relationships": {
567
+ "type": "array",
568
+ "items": {
569
+ "type": "object",
570
+ "properties": {
571
+ "with": {"type": "string"},
572
+ "nature": {"type": "string"},
573
+ "strength": {
574
+ "type": "string",
575
+ "enum": ["strong", "moderate", "weak"]
576
+ }
577
+ }
578
+ }
579
+ }
580
+ }
581
+ }
582
+ }
583
+ }
584
  }
585
 
586
+ try:
587
+ self.extraction_chain = create_extraction_chain(
588
+ self.entity_schema,
589
+ self.llm_service.extraction_model
590
+ )
591
+ logger.info("Entity extraction chain initialized successfully")
592
+ except Exception as e:
593
+ logger.error(f"Failed to initialize extraction chain: {str(e)}")
594
+ raise
595
 
596
  async def extract_entities(self, text: str) -> Dict[str, Any]:
597
+ """
598
+ Extract comprehensive entities and insights from text.
599
+
600
+ Args:
601
+ text: Meeting transcript or text to analyze
602
+
603
+ Returns:
604
+ Dictionary containing extracted entities, relationships, and business context
605
+
606
+ Raises:
607
+ Exception: If extraction fails
608
+ """
609
+ try:
610
+ extracted_data = await self.extraction_chain.arun(text)
611
+
612
+ # Add extraction timestamp
613
+ extracted_data['metadata'] = {
614
+ 'extraction_timestamp': datetime.now().isoformat(),
615
+ 'schema_version': '2.0'
616
+ }
617
+
618
+ logger.info("Entity extraction completed successfully")
619
+ return extracted_data
620
+
621
+ except Exception as e:
622
+ logger.error(f"Entity extraction failed: {str(e)}")
623
+ raise Exception(f"Failed to extract entities: {str(e)}")
624
+
625
+ def validate_extraction(self, extracted_data: Dict[str, Any]) -> bool:
626
+ """
627
+ Validate extracted data against schema requirements.
628
+
629
+ Args:
630
+ extracted_data: Dictionary of extracted information
631
+
632
+ Returns:
633
+ Boolean indicating validation status
634
+ """
635
+ try:
636
+ # Implement validation logic here
637
+ # This could check for required fields, data consistency, etc.
638
+ return True
639
+ except Exception as e:
640
+ logger.error(f"Validation failed: {str(e)}")
641
+ return False