File size: 3,308 Bytes
8124364
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
"""
Entity IR Models
Data structures for entity retrieval system
"""

from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional


@dataclass
class Entity:
    """
    Represents an entity from the knowledge base.

    Designed to work with both wiki_summary.json and dhow_summary.json
    formats; any field that is not supplied keeps its empty default.
    """
    name: str                           # Primary name; always present in variants when non-empty
    id: str = ""                        # Entity ID from knowledge base
    variants: List[str] = field(default_factory=list)  # All name variations
    source: str = ""                    # "wiki" or "dhow"

    # Core facts
    title: str = ""
    url: str = ""
    raw_text: str = ""
    summary: str = ""

    # Dhow-specific fields
    primary_position: str = ""
    primary_organization: str = ""
    family_name: str = ""
    city: str = ""
    country: str = ""

    # Full original data for RAG
    facts: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """
        Normalize variants and ensure name is always one of them.

        The supplied variants are copied into a fresh list, so inserting the
        name never mutates a list the caller still holds (or shares between
        several entities). None is treated as "no variants". A non-empty name
        that is missing from the variants is inserted at the front; a name
        that is already present keeps its existing position.
        """
        # Defensive copy: the insert below must not leak into caller-owned data.
        self.variants = list(self.variants or ())
        if self.name and self.name not in self.variants:
            self.variants.insert(0, self.name)

    def get_searchable_text(self) -> str:
        """
        Returns combined text for BM25 indexing.

        The result is every name variant followed by the position,
        organization and family name (each only when non-empty), joined
        with single spaces. Returns an empty string when nothing is set.
        """
        # Extra fields are appended after the variants for disambiguation.
        extras = (
            self.primary_position,
            self.primary_organization,
            self.family_name,
        )
        return " ".join([*self.variants, *filter(None, extras)])


@dataclass
class RetrievalResult:
    """
    A single retrieved entity together with its scoring information.

    Attributes:
        entity: The entity that was retrieved.
        score: Score assigned to this result.
        match_type: How the entity was matched; one of "alias", "exact",
            "bm25" or "hybrid".
        matched_variant: Debug info - which variant was matched, if recorded.
        normalized_query: Debug info - the query after normalization,
            if recorded.
    """
    entity: Entity
    score: float
    match_type: str

    # Optional debugging details; both default to None.
    matched_variant: Optional[str] = None
    normalized_query: Optional[str] = None


@dataclass
class RetrievalConfig:
    """
    Tunable settings for the retrieval system.

    Every field has a default, so RetrievalConfig() yields a usable
    configuration.

    Attributes:
        bm25_k1: BM25 term frequency saturation.
        bm25_b: BM25 document length normalization.
        top_k: Retrieval setting, defaults to 5.
        alias_boost: Score multiplier for alias matches.
        exact_match_boost: Score multiplier for exact matches.
        min_score_threshold: Minimum score to return.
    """
    # --- BM25 parameters (optimized for short Arabic names) ---
    bm25_k1: float = 1.8
    bm25_b: float = 0.4

    # --- Retrieval settings ---
    top_k: int = 5
    alias_boost: float = 10.0
    exact_match_boost: float = 5.0

    # --- Thresholds ---
    min_score_threshold: float = 0.1


@dataclass
class BenchmarkResult:
    """
    Aggregated outcome of a benchmark evaluation run.

    Attributes:
        total_queries: Number of queries in the run.
        precision_at_1: Precision@1, rendered as a percentage by __str__.
        recall_at_5: Recall@5, rendered as a percentage by __str__.
        avg_latency_ms: Average latency in milliseconds.
        p95_latency_ms: P95 latency in milliseconds.
        alias_hits: Breakdown counter for alias hits.
        bm25_hits: Breakdown counter for BM25 hits.
        misses: Breakdown counter for misses.
    """
    total_queries: int
    precision_at_1: float
    recall_at_5: float
    avg_latency_ms: float
    p95_latency_ms: float

    # Detailed breakdown
    alias_hits: int = 0
    bm25_hits: int = 0
    misses: int = 0

    def __str__(self) -> str:
        """
        Render the results as a multi-line, human-readable report.

        Ratios are formatted as percentages with two decimals and latencies
        as milliseconds with two decimals.
        """
        report_lines = [
            "Benchmark Results:",
            f"  Total Queries: {self.total_queries}",
            f"  Precision@1:   {self.precision_at_1:.2%}",
            f"  Recall@5:      {self.recall_at_5:.2%}",
            f"  Avg Latency:   {self.avg_latency_ms:.2f}ms",
            f"  P95 Latency:   {self.p95_latency_ms:.2f}ms",
            "  ",  # spacer line keeps its two-space indent
            "  Breakdown:",
            f"    Alias Hits:  {self.alias_hits}",
            f"    BM25 Hits:   {self.bm25_hits}",
            f"    Misses:      {self.misses}",
        ]
        # Surrounding whitespace is trimmed from the assembled report.
        return "\n".join(report_lines).strip()