vn6295337 Claude Opus 4.5 committed on
Commit
c6a48e0
·
1 Parent(s): 3a9dfa1

Add evaluation framework for Docling + RAG pipeline

Browse files

- eval_spot_check.py: Manual parsing inspection
- eval_parsing.py: Automated structure metrics
- eval_retrieval.py: Precision/recall/MRR
- eval_embeddings.py: Semantic similarity tests
- tests/eval_data/: Test document structure

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

scripts/eval_embeddings.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Embedding quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_embeddings.py tests/eval_data/queries.json
7
+
8
+ Measures:
9
+ - Cosine similarity for similar text pairs (should be high)
10
+ - Cosine similarity for dissimilar text pairs (should be low)
11
+ """
12
+
13
+ import sys
14
+ import json
15
+ import numpy as np
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from typing import List, Tuple
19
+
20
+ sys.path.insert(0, str(Path(__file__).parent.parent))
21
+
22
+
23
@dataclass
class EmbeddingMetrics:
    """Summary statistics for one embedding-quality run.

    Holds aggregate cosine-similarity scores for pairs that should be
    close ("similar") and pairs that should be far apart ("dissimilar"),
    plus the raw per-pair results as (text1, text2, score) tuples.
    """
    similar_pairs_avg: float      # mean score over similar pairs
    similar_pairs_min: float      # worst (lowest) score among similar pairs
    dissimilar_pairs_avg: float   # mean score over dissimilar pairs
    dissimilar_pairs_max: float   # worst (highest) score among dissimilar pairs
    separation: float             # similar_pairs_avg - dissimilar_pairs_avg
    similar_results: List[Tuple[str, str, float]]
    dissimilar_results: List[Tuple[str, str, float]]
33
+
34
+
35
+ def cosine_similarity(a: List[float], b: List[float]) -> float:
36
+ """Calculate cosine similarity between two vectors."""
37
+ a = np.array(a)
38
+ b = np.array(b)
39
+ return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
40
+
41
+
42
def get_embedding(text: str, model=None) -> List[float]:
    """Embed *text* with sentence-transformers and return a plain list.

    If *model* is None, a fresh all-MiniLM-L6-v2 model is loaded on every
    call — callers embedding many texts should pass a shared model.
    """
    encoder = model
    if encoder is None:
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(text, convert_to_numpy=True).tolist()
50
+
51
+
52
def evaluate_embeddings(queries_file: str) -> "EmbeddingMetrics | None":
    """Evaluate embedding quality using similarity pairs.

    Loads ``similarity_pairs`` from *queries_file*, scores every pair with
    the all-MiniLM-L6-v2 sentence-transformer, prints a report, and
    returns the aggregate metrics.

    Args:
        queries_file: Path to a JSON file with a ``similarity_pairs`` key
            holding ``similar`` and ``dissimilar`` lists of text pairs.

    Returns:
        EmbeddingMetrics, or None when the file defines no pairs
        (the original annotation hid this None path).
    """
    with open(queries_file, 'r') as f:
        data = json.load(f)

    similarity_pairs = data.get("similarity_pairs", {})
    similar = similarity_pairs.get("similar", [])
    dissimilar = similarity_pairs.get("dissimilar", [])

    if not similar and not dissimilar:
        print("No similarity pairs found in queries file")
        print("Expected format:")
        print('''  "similarity_pairs": {
    "similar": [["text1", "text2"], ...],
    "dissimilar": [["text1", "text2"], ...]
  }''')
        return None

    print("\n" + "=" * 60)
    print(" EMBEDDING QUALITY EVALUATION")
    print("=" * 60)

    # Load the model once and reuse it for every pair.
    print("\nLoading embedding model...")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Model: all-MiniLM-L6-v2 (384 dimensions)")

    def _score_pairs(pairs, status_for):
        """Score each (text1, text2) pair; return (scores, results).

        The original duplicated this loop for similar and dissimilar pairs
        and re-derived the cosine formula inline; reuse cosine_similarity.
        """
        scores, results = [], []
        for pair in pairs:
            if len(pair) != 2:
                continue  # skip malformed entries
            text1, text2 = pair
            emb1 = model.encode(text1, convert_to_numpy=True)
            emb2 = model.encode(text2, convert_to_numpy=True)
            score = cosine_similarity(emb1, emb2)
            scores.append(score)
            results.append((text1, text2, score))
            print(f"  {status_for(score)} {score:.3f}: \"{text1[:30]}...\" vs \"{text2[:30]}...\"")
        return scores, results

    print(f"\n📊 Similar Pairs ({len(similar)} pairs)")
    print("   Expected: cosine similarity > 0.6")
    print()
    similar_scores, similar_results = _score_pairs(
        similar, lambda s: "✅" if s > 0.6 else "⚠️" if s > 0.4 else "❌")

    print(f"\n📊 Dissimilar Pairs ({len(dissimilar)} pairs)")
    print("   Expected: cosine similarity < 0.4")
    print()
    dissimilar_scores, dissimilar_results = _score_pairs(
        dissimilar, lambda s: "✅" if s < 0.4 else "⚠️" if s < 0.6 else "❌")

    # Aggregate; cast numpy scalars to plain floats to match the
    # dataclass's declared field types.
    metrics = EmbeddingMetrics(
        similar_pairs_avg=float(np.mean(similar_scores)) if similar_scores else 0.0,
        similar_pairs_min=float(np.min(similar_scores)) if similar_scores else 0.0,
        dissimilar_pairs_avg=float(np.mean(dissimilar_scores)) if dissimilar_scores else 0.0,
        dissimilar_pairs_max=float(np.max(dissimilar_scores)) if dissimilar_scores else 0.0,
        separation=(float(np.mean(similar_scores)) - float(np.mean(dissimilar_scores))
                    if similar_scores and dissimilar_scores else 0.0),
        similar_results=similar_results,
        dissimilar_results=dissimilar_results
    )

    print("\n" + "-" * 60)
    print(" SUMMARY")
    print("-" * 60)

    if similar_scores:
        print(f"  Similar pairs avg: {metrics.similar_pairs_avg:.3f}")
        print(f"  Similar pairs min: {metrics.similar_pairs_min:.3f}")

    if dissimilar_scores:
        print(f"  Dissimilar pairs avg: {metrics.dissimilar_pairs_avg:.3f}")
        print(f"  Dissimilar pairs max: {metrics.dissimilar_pairs_max:.3f}")

    print(f"  Separation (similar - dissimilar): {metrics.separation:.3f}")

    # Threshold-based quality verdicts.
    print("\n📈 Quality Assessment")

    if metrics.similar_pairs_avg >= 0.6:
        print("  ✅ Similar pairs: GOOD (avg ≥ 0.6)")
    elif metrics.similar_pairs_avg >= 0.4:
        print("  ⚠️ Similar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Similar pairs: POOR (avg < 0.4)")

    if metrics.dissimilar_pairs_avg <= 0.4:
        print("  ✅ Dissimilar pairs: GOOD (avg ≤ 0.4)")
    elif metrics.dissimilar_pairs_avg <= 0.6:
        print("  ⚠️ Dissimilar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Dissimilar pairs: POOR (avg > 0.6)")

    if metrics.separation >= 0.3:
        print("  ✅ Separation: GOOD (≥ 0.3)")
    elif metrics.separation >= 0.15:
        print("  ⚠️ Separation: FAIR (0.15-0.3)")
    else:
        print("  ❌ Separation: POOR (< 0.15)")

    return metrics
175
+
176
+
177
if __name__ == "__main__":
    args = sys.argv[1:]
    if not args:
        print("Usage: python scripts/eval_embeddings.py queries.json")
        print("\nExample:")
        print("  python scripts/eval_embeddings.py tests/eval_data/queries.json")
        sys.exit(1)

    queries_file = args[0]
    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = evaluate_embeddings(queries_file)

    # Non-zero exit when separation is poor, so CI can gate on it.
    if metrics is not None and metrics.separation < 0.15:
        sys.exit(1)
scripts/eval_parsing.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated parsing quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_parsing.py tests/eval_data/documents
7
+
8
+ Measures:
9
+ - Element extraction counts
10
+ - Structure preservation (tables, headings)
11
+ - Format coverage
12
+ """
13
+
14
import sys
import json
from pathlib import Path
from collections import Counter
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Any

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.ingestion.docling_loader import (
    load_documents_with_docling,
    SUPPORTED_EXTENSIONS
)
27
+
28
+
29
@dataclass
class ParsingMetrics:
    """Aggregated metrics for a parsing-quality evaluation run.

    All counters default to zero/empty so an instance can be built up
    incrementally by evaluate_parsing().
    """
    total_documents: int = 0
    successful_documents: int = 0
    failed_documents: int = 0
    total_elements: int = 0
    total_chars: int = 0
    # field(default_factory=...) replaces the original None-defaults plus
    # __post_init__ dance, and guarantees each instance gets its own
    # fresh container (the idiomatic dataclass pattern for mutable defaults).
    elements_by_type: Dict[str, int] = field(default_factory=dict)
    formats_processed: Dict[str, int] = field(default_factory=dict)
    avg_elements_per_doc: float = 0.0
    avg_chars_per_doc: float = 0.0
    documents_with_tables: int = 0
    documents_with_headings: int = 0
    issues: List[str] = field(default_factory=list)
52
+
53
+
54
def evaluate_parsing(docs_dir: str) -> ParsingMetrics:
    """Evaluate parsing quality across all documents in a directory.

    Loads every supported document under *docs_dir* (recursively) and
    folds per-document element counts, structure hits, and parsing issues
    into a single ParsingMetrics.
    """
    parsed_docs = load_documents_with_docling(docs_dir, recursive=True)

    metrics = ParsingMetrics()
    metrics.total_documents = len(parsed_docs)

    type_totals = Counter()
    fmt_totals = Counter()

    for parsed in parsed_docs:
        fmt_totals[parsed.format] += 1

        if parsed.status != "OK":
            metrics.failed_documents += 1
            metrics.issues.append(f"{parsed.filename}: {parsed.status} - {parsed.error}")
            continue

        metrics.successful_documents += 1
        metrics.total_elements += len(parsed.elements)
        metrics.total_chars += parsed.chars

        # Per-document element histogram feeds the global one.
        per_doc_types = Counter(el.element_type for el in parsed.elements)
        type_totals.update(per_doc_types)

        if per_doc_types.get("table", 0) > 0:
            metrics.documents_with_tables += 1
        if per_doc_types.get("heading", 0) > 0:
            metrics.documents_with_headings += 1

        # Flag suspiciously sparse extractions.
        element_count = len(parsed.elements)
        if element_count == 0:
            metrics.issues.append(f"{parsed.filename}: No elements extracted")
        elif element_count < 3:
            metrics.issues.append(f"{parsed.filename}: Very few elements ({element_count})")

    if metrics.successful_documents > 0:
        metrics.avg_elements_per_doc = metrics.total_elements / metrics.successful_documents
        metrics.avg_chars_per_doc = metrics.total_chars / metrics.successful_documents

    metrics.elements_by_type = dict(type_totals)
    metrics.formats_processed = dict(fmt_totals)

    return metrics
102
+
103
+
104
def print_report(metrics: ParsingMetrics):
    """Print a human-readable evaluation report; return the quality score."""
    print("\n" + "=" * 60)
    print(" PARSING QUALITY EVALUATION REPORT")
    print("=" * 60)

    # Document-level stats.
    print("\n📄 Document Statistics")
    print(f"  Total documents: {metrics.total_documents}")
    print(f"  Successful: {metrics.successful_documents}")
    print(f"  Failed: {metrics.failed_documents}")

    if metrics.total_documents > 0:
        success_rate = metrics.successful_documents / metrics.total_documents * 100
    else:
        success_rate = 0
    print(f"  Success rate: {success_rate:.1f}%")

    print("\n📁 Formats Processed")
    for fmt, count in sorted(metrics.formats_processed.items()):
        print(f"  {fmt}: {count}")

    print("\n🔢 Element Statistics")
    print(f"  Total elements: {metrics.total_elements}")
    print(f"  Total characters: {metrics.total_chars:,}")
    print(f"  Avg elements/doc: {metrics.avg_elements_per_doc:.1f}")
    print(f"  Avg chars/doc: {metrics.avg_chars_per_doc:,.0f}")

    # Element types, most frequent first.
    print("\n📊 Element Types")
    for el_type, count in sorted(metrics.elements_by_type.items(), key=lambda kv: -kv[1]):
        print(f"  {el_type}: {count}")

    print("\n🏗️ Structure Detection")
    print(f"  Documents with tables: {metrics.documents_with_tables}")
    print(f"  Documents with headings: {metrics.documents_with_headings}")

    # Issues are truncated to the first ten to keep the report short.
    if metrics.issues:
        print("\n⚠️ Issues Found")
        for issue in metrics.issues[:10]:
            print(f"  - {issue}")
        overflow = len(metrics.issues) - 10
        if overflow > 0:
            print(f"  ... and {overflow} more")
    else:
        print("\n✅ No issues detected")

    print("\n📈 Quality Score")
    score = calculate_quality_score(metrics)
    print(f"  Overall: {score:.0f}/100")

    return score
159
+
160
+
161
def calculate_quality_score(metrics: ParsingMetrics) -> float:
    """Combine parsing metrics into a single 0-100 quality score.

    Weights: success rate up to 40, element richness up to 30, structure
    detection up to 20, zero-issue bonus 10; capped at 100.
    """
    if metrics.total_documents == 0:
        return 0.0

    # Success rate contributes up to 40 points.
    score = (metrics.successful_documents / metrics.total_documents) * 40

    # Element extraction richness: tiered, up to 30 points.
    avg = metrics.avg_elements_per_doc
    if avg > 10:
        score += 30
    elif avg > 5:
        score += 20
    elif avg > 1:
        score += 10

    # Structure detection: table + heading rates, up to 20 points.
    if metrics.successful_documents > 0:
        table_rate = metrics.documents_with_tables / metrics.successful_documents
        heading_rate = metrics.documents_with_headings / metrics.successful_documents
        score += (table_rate + heading_rate) * 10

    # Bonus for a run with no flagged issues.
    if not metrics.issues:
        score += 10

    return min(score, 100)
192
+
193
+
194
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_parsing.py /path/to/documents")
        sys.exit(1)

    target_dir = argv[1]
    if not Path(target_dir).is_dir():
        print(f"Error: Directory not found: {target_dir}")
        sys.exit(1)

    parsing_metrics = evaluate_parsing(target_dir)
    overall_score = print_report(parsing_metrics)

    # Optional machine-readable dump of the raw metrics.
    if "--json" in argv:
        print("\n" + json.dumps(asdict(parsing_metrics), indent=2))

    # Non-zero exit lets CI gate on parsing quality.
    if overall_score < 50:
        sys.exit(1)
scripts/eval_retrieval.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Retrieval quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_retrieval.py tests/eval_data/queries.json
7
+
8
+ Measures:
9
+ - Precision@k
10
+ - Recall@k
11
+ - Mean Reciprocal Rank (MRR)
12
+ """
13
+
14
+ import sys
15
+ import json
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from typing import List, Dict, Set, Optional
19
+
20
+ sys.path.insert(0, str(Path(__file__).parent.parent))
21
+
22
+
23
@dataclass
class RetrievalMetrics:
    """Retrieval scores for a single query."""
    query_id: str
    query: str
    precision_at_k: float    # |relevant ∩ top-k| / k
    recall_at_k: float       # |relevant ∩ top-k| / |relevant|
    reciprocal_rank: float   # 1 / rank of first relevant hit (0 if none)
    retrieved_ids: List[str]
    relevant_found: List[str]
    relevant_missed: List[str]


@dataclass
class AggregateMetrics:
    """Scores averaged over every evaluated query."""
    total_queries: int
    mean_precision: float
    mean_recall: float
    mrr: float  # Mean Reciprocal Rank
    queries_with_hits: int


def evaluate_single_query(
    query_id: str,
    query: str,
    relevant_chunks: Set[str],
    retrieved_chunks: List[str],
    k: int = 5
) -> RetrievalMetrics:
    """Score one query's retrieval against its gold relevant chunk ids."""
    top_k = retrieved_chunks[:k]
    top_k_set = set(top_k)
    hits = top_k_set & relevant_chunks

    # Precision@k uses k as the denominator even if fewer were retrieved.
    precision = len(hits) / k if k > 0 else 0.0
    recall = len(hits) / len(relevant_chunks) if relevant_chunks else 0.0

    # Reciprocal rank of the first relevant result within the top-k.
    reciprocal_rank = next(
        (1.0 / (rank + 1) for rank, cid in enumerate(top_k) if cid in relevant_chunks),
        0.0,
    )

    return RetrievalMetrics(
        query_id=query_id,
        query=query,
        precision_at_k=precision,
        recall_at_k=recall,
        reciprocal_rank=reciprocal_rank,
        retrieved_ids=top_k,
        relevant_found=list(hits),
        relevant_missed=list(relevant_chunks - top_k_set),
    )
82
+
83
+
84
def run_retrieval_eval(
    queries_file: str,
    k: int = 5,
    use_mock: bool = False
) -> Optional[AggregateMetrics]:
    """Run retrieval evaluation from a queries JSON file.

    Args:
        queries_file: JSON file with a ``queries`` list; each query needs
            ``id``, ``query`` and ``relevant_chunks``.
        k: Cutoff for precision/recall/reciprocal-rank.
        use_mock: Fabricate retrieval results instead of calling the real
            retriever (useful without a vector store).

    Returns:
        AggregateMetrics, or None when no query could be evaluated.
        (Fixed annotation: the original promised a bare AggregateMetrics
        although both early-exit paths return None.)
    """
    with open(queries_file, 'r') as f:
        data = json.load(f)

    queries = data.get("queries", [])
    if not queries:
        print("No queries found in file")
        return None

    # Fall back to mock results when the real retriever isn't importable.
    if not use_mock:
        try:
            from src.retrieval.hybrid import hybrid_search
        except ImportError:
            print("Warning: Could not import hybrid_search, using mock")
            use_mock = True

    all_metrics = []

    print("\n" + "=" * 60)
    print(" RETRIEVAL QUALITY EVALUATION")
    print("=" * 60)

    for q in queries:
        query_id = q.get("id", "unknown")
        query_text = q.get("query", "")
        relevant = set(q.get("relevant_chunks", []))

        # A query without gold labels cannot be scored.
        if not relevant:
            print(f"\n⚠️ Query {query_id}: No relevant chunks defined, skipping")
            continue

        print(f"\n📝 Query {query_id}: {query_text[:50]}...")

        if use_mock:
            # Mock: all relevant ids plus two fillers, so metrics are non-trivial.
            retrieved = list(relevant)[:k] + ["mock::0", "mock::1"]
        else:
            try:
                results = hybrid_search(query_text, top_k=k)
                retrieved = [r.get("id", "") for r in results]
            except Exception as e:
                # A failing query scores zero rather than aborting the run.
                print(f"  Error: {e}")
                retrieved = []

        metrics = evaluate_single_query(
            query_id=query_id,
            query=query_text,
            relevant_chunks=relevant,
            retrieved_chunks=retrieved,
            k=k
        )
        all_metrics.append(metrics)

        print(f"  Precision@{k}: {metrics.precision_at_k:.2f}")
        print(f"  Recall@{k}: {metrics.recall_at_k:.2f}")
        print(f"  Reciprocal Rank: {metrics.reciprocal_rank:.2f}")
        if metrics.relevant_found:
            print(f"  ✅ Found: {metrics.relevant_found}")
        if metrics.relevant_missed:
            print(f"  ❌ Missed: {metrics.relevant_missed}")

    if not all_metrics:
        print("\nNo queries evaluated")
        return None

    n = len(all_metrics)
    aggregate = AggregateMetrics(
        total_queries=n,
        mean_precision=sum(m.precision_at_k for m in all_metrics) / n,
        mean_recall=sum(m.recall_at_k for m in all_metrics) / n,
        mrr=sum(m.reciprocal_rank for m in all_metrics) / n,
        queries_with_hits=sum(1 for m in all_metrics if m.reciprocal_rank > 0)
    )

    print("\n" + "-" * 60)
    print(" SUMMARY")
    print("-" * 60)
    print(f"  Total queries: {aggregate.total_queries}")
    print(f"  Mean Precision@{k}: {aggregate.mean_precision:.2f}")
    print(f"  Mean Recall@{k}: {aggregate.mean_recall:.2f}")
    print(f"  MRR: {aggregate.mrr:.2f}")
    print(f"  Queries with hits: {aggregate.queries_with_hits}/{aggregate.total_queries}")

    # Threshold-based quality verdicts.
    print("\n📊 Quality Assessment")
    if aggregate.mean_precision >= 0.6:
        print("  ✅ Precision: GOOD (≥60%)")
    elif aggregate.mean_precision >= 0.4:
        print("  ⚠️ Precision: FAIR (40-60%)")
    else:
        print("  ❌ Precision: POOR (<40%)")

    if aggregate.mrr >= 0.5:
        print("  ✅ MRR: GOOD (≥0.5)")
    elif aggregate.mrr >= 0.3:
        print("  ⚠️ MRR: FAIR (0.3-0.5)")
    else:
        print("  ❌ MRR: POOR (<0.3)")

    return aggregate
196
+
197
+
198
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_retrieval.py queries.json [--mock]")
        print("\nExample:")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json --mock")
        sys.exit(1)

    queries_file = argv[1]
    use_mock = "--mock" in argv

    # Default cutoff; overridable via a --k=N flag anywhere on the CLI.
    k = 5
    for arg in argv:
        if arg.startswith("--k="):
            k = int(arg.split("=")[1])

    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = run_retrieval_eval(queries_file, k=k, use_mock=use_mock)

    # Non-zero exit when precision is poor, so CI can gate on it.
    if metrics and metrics.mean_precision < 0.4:
        sys.exit(1)
scripts/eval_spot_check.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick spot check for Docling parsing quality.
4
+
5
+ Usage:
6
+ python scripts/eval_spot_check.py /path/to/documents
7
+ python scripts/eval_spot_check.py /path/to/single/file.pdf
8
+
9
+ Outputs a visual summary of how Docling parsed each document.
10
+ """
11
+
12
+ import sys
13
+ import os
14
+ from pathlib import Path
15
+ from collections import Counter
16
+
17
+ # Add project root to path
18
+ sys.path.insert(0, str(Path(__file__).parent.parent))
19
+
20
+ from src.ingestion.docling_loader import (
21
+ load_document_with_docling,
22
+ load_documents_with_docling,
23
+ SUPPORTED_EXTENSIONS,
24
+ ParsedDocument
25
+ )
26
+
27
+
28
def print_header(text: str, char: str = "="):
    """Print *text* framed above and below by a 60-character rule of *char*."""
    rule = char * 60
    print(f"\n{rule}")
    print(f" {text}")
    print(rule)
33
+
34
+
35
def analyze_document(doc: ParsedDocument, verbose: bool = True) -> dict:
    """Analyze one parsed document and return a metrics dict.

    When *verbose*, also pretty-print the element breakdown, detected
    issues, a sample of elements, and a preview of the first table.
    """
    type_counts = Counter(el.element_type for el in doc.elements)

    # Heuristic issue detection.
    issues = []
    if doc.status != "OK":
        issues.append(f"Status: {doc.status} - {doc.error}")
    if len(doc.elements) == 0:
        issues.append("No elements extracted!")
    if doc.chars == 0:
        issues.append("Zero characters extracted!")
    if type_counts.get("table", 0) == 0 and doc.format == ".pdf":
        # PDFs frequently contain tables, so a zero count is worth flagging.
        issues.append("No tables detected (may be expected)")

    metrics = {
        "filename": doc.filename,
        "format": doc.format,
        "status": doc.status,
        "total_elements": len(doc.elements),
        "total_chars": doc.chars,
        "total_words": doc.words,
        "page_count": doc.page_count,
        "element_types": dict(type_counts),
        "issues": issues
    }

    if verbose:
        print_header(f"{doc.filename} ({doc.format})", "-")
        print(f"  Status: {doc.status}")
        print(f"  Elements: {len(doc.elements)}")
        print(f"  Characters: {doc.chars:,}")
        print(f"  Words: {doc.words:,}")
        if doc.page_count:
            print(f"  Pages: {doc.page_count}")

        print("\n  Element breakdown:")
        for el_type, count in sorted(type_counts.items()):
            print(f"    {el_type}: {count}")

        if issues:
            print("\n  ⚠️ Potential issues:")
            for issue in issues:
                print(f"    - {issue}")

        # Short previews keep the report readable.
        print("\n  Sample elements (first 5):")
        for el in doc.elements[:5]:
            preview = el.text[:80].replace('\n', ' ')
            if len(el.text) > 80:
                preview += "..."
            print(f"    [{el.element_type}] {preview}")

        tables = [el for el in doc.elements if el.element_type == "table"]
        if tables:
            print("\n  Table preview (first table):")
            snippet = tables[0].text[:300].replace('\n', '\n    ')
            print(f"    {snippet}")
            if len(tables[0].text) > 300:
                print("    ...")

    return metrics
102
+
103
+
104
def run_spot_check(path: str, verbose: bool = True):
    """Spot-check Docling parsing for a single file or a directory tree.

    Returns the list of per-document metrics dicts; an empty list when
    *path* does not exist.
    """
    target = Path(path)

    print_header("DOCLING PARSING SPOT CHECK")
    print(f"  Path: {target}")
    print(f"  Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")

    all_metrics = []

    if target.is_file():
        # Single-document mode.
        parsed = load_document_with_docling(str(target))
        all_metrics.append(analyze_document(parsed, verbose=verbose))
    elif target.is_dir():
        # Directory mode: load everything supported, recursively.
        parsed_docs = load_documents_with_docling(str(target), recursive=True)
        print(f"  Found {len(parsed_docs)} documents")
        for parsed in parsed_docs:
            all_metrics.append(analyze_document(parsed, verbose=verbose))
    else:
        print(f"  ERROR: Path not found: {target}")
        return []

    print_header("SUMMARY")

    ok_count = sum(1 for m in all_metrics if m["status"] == "OK")
    total_elements = sum(m["total_elements"] for m in all_metrics)
    total_chars = sum(m["total_chars"] for m in all_metrics)

    print(f"  Documents processed: {len(all_metrics)}")
    print(f"  Successful (OK): {ok_count}")
    print(f"  Failed/Skipped: {len(all_metrics) - ok_count}")
    print(f"  Total elements: {total_elements}")
    print(f"  Total characters: {total_chars:,}")

    # Element-type histogram across every document.
    combined_types = Counter()
    for m in all_metrics:
        combined_types.update(m["element_types"])

    print("\n  Element types across all docs:")
    for el_type, count in sorted(combined_types.items(), key=lambda kv: -kv[1]):
        print(f"    {el_type}: {count}")

    # Flatten per-document issues, truncated to the first ten.
    flat_issues = [
        f"{m['filename']}: {issue}"
        for m in all_metrics
        for issue in m["issues"]
    ]

    if flat_issues:
        print("\n  ⚠️ Issues found:")
        for issue in flat_issues[:10]:
            print(f"    - {issue}")
        if len(flat_issues) > 10:
            print(f"    ... and {len(flat_issues) - 10} more")
    else:
        print("\n  ✅ No issues detected")

    return all_metrics
172
+
173
+
174
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_spot_check.py /path/to/documents")
        print("\nExamples:")
        print("  python scripts/eval_spot_check.py ./tests/eval_data/documents")
        print("  python scripts/eval_spot_check.py ./report.pdf")
        sys.exit(1)

    target_path = argv[1]
    # --quiet suppresses per-document detail, leaving only the summary.
    verbose = "--quiet" not in argv

    run_spot_check(target_path, verbose=verbose)
tests/eval_data/documents/.gitkeep ADDED
File without changes
tests/eval_data/queries.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "description": "Test queries for retrieval evaluation",
3
+ "queries": [
4
+ {
5
+ "id": "q1",
6
+ "query": "Example query about your document content",
7
+ "relevant_chunks": ["document.pdf::0", "document.pdf::1"],
8
+ "keywords": ["expected", "keywords", "in", "answer"]
9
+ }
10
+ ],
11
+ "similarity_pairs": {
12
+ "similar": [
13
+ ["What is the total revenue?", "How much money did we make?"],
14
+ ["Describe the methodology", "What methods were used?"]
15
+ ],
16
+ "dissimilar": [
17
+ ["What is the revenue?", "Who founded the company?"],
18
+ ["Technical specifications", "Company history"]
19
+ ]
20
+ }
21
+ }