"""
Entity IR Models
Data structures for entity retrieval system
"""
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
@dataclass
class Entity:
    """
    Represents an entity from the knowledge base.

    Designed to work with both wiki_summary.json and dhow_summary.json formats.

    After construction, a non-empty ``name`` is guaranteed to appear in
    ``variants``: if it was missing, it is placed at the front of a *new*
    list, so the list object supplied by the caller is never modified.
    """

    name: str  # Primary name (placed first in variants when it was missing)
    id: str = ""  # Entity ID from knowledge base
    variants: List[str] = field(default_factory=list)  # All name variations
    source: str = ""  # "wiki" or "dhow"

    # Core facts
    title: str = ""
    url: str = ""
    raw_text: str = ""
    summary: str = ""

    # Dhow-specific fields
    primary_position: str = ""
    primary_organization: str = ""
    family_name: str = ""
    city: str = ""
    country: str = ""

    # Full original data for RAG
    facts: Dict[str, Any] = field(default_factory=dict)

    def __post_init__(self):
        """Ensure name is always in variants without touching the caller's list."""
        if self.name and self.name not in self.variants:
            # Rebind to a fresh list rather than insert(0, ...) in place: the
            # variants list may be shared with the caller or another Entity,
            # and mutating it here would leak this entity's name into them.
            self.variants = [self.name, *self.variants]

    def get_searchable_text(self) -> str:
        """
        Returns combined text for BM25 indexing.

        The result is every entry of ``variants`` followed by the non-empty
        values of ``primary_position``, ``primary_organization`` and
        ``family_name`` (in that order), joined by single spaces.
        """
        parts = list(self.variants)
        optional_fields = (
            self.primary_position,
            self.primary_organization,
            self.family_name,
        )
        # Empty optional fields are skipped so they add no stray separators.
        parts.extend(value for value in optional_fields if value)
        return " ".join(parts)
@dataclass
class RetrievalResult:
    """
    One retrieved entity together with its scoring information.

    ``entity``, ``score`` and ``match_type`` are required; the remaining
    fields are optional debugging aids that default to ``None``.
    """

    # The entity this result refers to.
    entity: Entity
    # Score assigned to this result.
    score: float
    # How the match was produced: "alias", "exact", "bm25", "hybrid".
    match_type: str

    # Debug info
    # Which variant was matched.
    matched_variant: Optional[str] = None
    # Query after normalization.
    normalized_query: Optional[str] = None
@dataclass
class RetrievalConfig:
    """
    Tunable settings for the retrieval system.

    Every field has a default, so ``RetrievalConfig()`` yields a usable
    configuration.
    """

    # --- BM25 parameters (optimized for short Arabic names) ---
    # Term frequency saturation.
    bm25_k1: float = 1.8
    # Document length normalization.
    bm25_b: float = 0.4

    # --- Retrieval settings ---
    top_k: int = 5
    # Score multiplier for alias matches.
    alias_boost: float = 10.0
    # Score multiplier for exact matches.
    exact_match_boost: float = 5.0

    # --- Thresholds ---
    # Minimum score to return.
    min_score_threshold: float = 0.1
@dataclass
class BenchmarkResult:
    """
    Aggregate numbers produced by a benchmark evaluation run.

    The five headline metrics are required; the hit/miss counters default
    to zero. ``str()`` on an instance yields a multi-line text report.
    """

    total_queries: int
    precision_at_1: float
    recall_at_5: float
    avg_latency_ms: float
    p95_latency_ms: float

    # Detailed breakdown
    alias_hits: int = 0
    bm25_hits: int = 0
    misses: int = 0

    def __str__(self) -> str:
        """
        Render the results as a plain-text report.

        Precision and recall are formatted as percentages with two
        decimals, latencies as milliseconds with two decimals. Leading and
        trailing whitespace of the rendered template is stripped.
        """
        # NOTE(review): the literal below is reproduced exactly as it appears
        # in the available source; confirm its inner whitespace against the
        # repository copy if the report layout matters.
        report = f"""
Benchmark Results:
Total Queries: {self.total_queries}
Precision@1: {self.precision_at_1:.2%}
Recall@5: {self.recall_at_5:.2%}
Avg Latency: {self.avg_latency_ms:.2f}ms
P95 Latency: {self.p95_latency_ms:.2f}ms
Breakdown:
Alias Hits: {self.alias_hits}
BM25 Hits: {self.bm25_hits}
Misses: {self.misses}
"""
        return report.strip()
|