Spaces:
Sleeping
Sleeping
Add evaluation framework for Docling + RAG pipeline
- eval_spot_check.py: Manual parsing inspection
- eval_parsing.py: Automated structure metrics
- eval_retrieval.py: Precision/recall/MRR
- eval_embeddings.py: Semantic similarity tests
- tests/eval_data/: Test document structure
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- scripts/eval_embeddings.py +193 -0
- scripts/eval_parsing.py +214 -0
- scripts/eval_retrieval.py +222 -0
- scripts/eval_spot_check.py +185 -0
- tests/eval_data/documents/.gitkeep +0 -0
- tests/eval_data/queries.json +21 -0
scripts/eval_embeddings.py
ADDED
|
@@ -0,0 +1,193 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Embedding quality evaluation.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/eval_embeddings.py tests/eval_data/queries.json
|
| 7 |
+
|
| 8 |
+
Measures:
|
| 9 |
+
- Cosine similarity for similar text pairs (should be high)
|
| 10 |
+
- Cosine similarity for dissimilar text pairs (should be low)
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
import json
|
| 15 |
+
import numpy as np
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from typing import List, Tuple
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class EmbeddingMetrics:
    """Metrics for embedding quality."""
    # Average / minimum cosine similarity over the "similar" text pairs
    # (higher is better; the report treats avg >= 0.6 as GOOD).
    similar_pairs_avg: float
    similar_pairs_min: float
    # Average / maximum cosine similarity over the "dissimilar" pairs
    # (lower is better; the report treats avg <= 0.4 as GOOD).
    dissimilar_pairs_avg: float
    dissimilar_pairs_max: float
    separation: float  # similar_avg - dissimilar_avg
    # Per-pair results as (text1, text2, cosine_score) tuples, in input order.
    similar_results: List[Tuple[str, str, float]]
    dissimilar_results: List[Tuple[str, str, float]]
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors.

    Args:
        a: First vector (any sequence convertible by ``np.array``).
        b: Second vector of the same length.

    Returns:
        Cosine similarity in [-1, 1]; 0.0 when either vector has zero
        norm (the unguarded formula would divide by zero and yield NaN).
    """
    a = np.array(a)
    b = np.array(b)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        # Similarity is undefined for an all-zero vector; 0.0 keeps
        # downstream averaging sane instead of propagating NaN.
        return 0.0
    return float(np.dot(a, b) / denom)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def get_embedding(text: str, model=None) -> List[float]:
    """Get embedding for text using sentence-transformers.

    Args:
        text: Text to embed.
        model: Optional pre-loaded model exposing ``encode``. When None,
            a shared all-MiniLM-L6-v2 instance is loaded on first use and
            reused (the original reloaded the model on every call).

    Returns:
        The embedding as a plain list of floats.
    """
    if model is None:
        model = getattr(get_embedding, "_default_model", None)
        if model is None:
            # Lazy import: sentence-transformers is heavy and only needed here.
            from sentence_transformers import SentenceTransformer
            model = SentenceTransformer('all-MiniLM-L6-v2')
            # Cache on the function object so repeated calls reuse the model.
            get_embedding._default_model = model

    embedding = model.encode(text, convert_to_numpy=True)
    return embedding.tolist()
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _score_pairs(model, pairs, label, expectation, higher_is_better):
    """Score one group of text pairs and print per-pair results.

    Args:
        model: Loaded sentence-transformers model.
        pairs: Iterable of [text1, text2] pairs; malformed entries skipped.
        label: Group label for the printed header.
        expectation: Human-readable expected-threshold text.
        higher_is_better: True for "similar" pairs (good when score > 0.6),
            False for "dissimilar" pairs (good when score < 0.4).

    Returns:
        (scores, results) where results is a list of (text1, text2, score).
    """
    scores = []
    results = []

    print(f"\n📊 {label} ({len(pairs)} pairs)")
    print(f"   Expected: {expectation}")
    print()

    for pair in pairs:
        if len(pair) != 2:
            continue
        text1, text2 = pair
        emb1 = model.encode(text1, convert_to_numpy=True)
        emb2 = model.encode(text2, convert_to_numpy=True)
        # Reuse the module-level helper instead of inlining the formula.
        score = cosine_similarity(emb1, emb2)
        scores.append(score)
        results.append((text1, text2, score))

        if higher_is_better:
            status = "✅" if score > 0.6 else "⚠️" if score > 0.4 else "❌"
        else:
            status = "✅" if score < 0.4 else "⚠️" if score < 0.6 else "❌"
        print(f"   {status} {score:.3f}: \"{text1[:30]}...\" vs \"{text2[:30]}...\"")

    return scores, results


def evaluate_embeddings(queries_file: str) -> EmbeddingMetrics:
    """Evaluate embedding quality using similarity pairs.

    Loads "similarity_pairs" from *queries_file*, scores every similar and
    dissimilar text pair with all-MiniLM-L6-v2, prints a report, and returns
    the aggregated metrics (or None when no pairs are defined).

    The original duplicated the scoring loop for the two groups; both now
    go through _score_pairs.
    """
    with open(queries_file, 'r') as f:
        data = json.load(f)

    similarity_pairs = data.get("similarity_pairs", {})
    similar = similarity_pairs.get("similar", [])
    dissimilar = similarity_pairs.get("dissimilar", [])

    if not similar and not dissimilar:
        print("No similarity pairs found in queries file")
        print("Expected format:")
        print('''  "similarity_pairs": {
    "similar": [["text1", "text2"], ...],
    "dissimilar": [["text1", "text2"], ...]
  }''')
        return None

    print("\n" + "=" * 60)
    print("  EMBEDDING QUALITY EVALUATION")
    print("=" * 60)

    # Load the model once and share it across both pair groups.
    print("\nLoading embedding model...")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Model: all-MiniLM-L6-v2 (384 dimensions)")

    similar_scores, similar_results = _score_pairs(
        model, similar, "Similar Pairs",
        "cosine similarity > 0.6", higher_is_better=True)
    dissimilar_scores, dissimilar_results = _score_pairs(
        model, dissimilar, "Dissimilar Pairs",
        "cosine similarity < 0.4", higher_is_better=False)

    # Aggregate; plain floats keep the dataclass free of numpy scalars.
    metrics = EmbeddingMetrics(
        similar_pairs_avg=float(np.mean(similar_scores)) if similar_scores else 0.0,
        similar_pairs_min=float(np.min(similar_scores)) if similar_scores else 0.0,
        dissimilar_pairs_avg=float(np.mean(dissimilar_scores)) if dissimilar_scores else 0.0,
        dissimilar_pairs_max=float(np.max(dissimilar_scores)) if dissimilar_scores else 0.0,
        separation=(float(np.mean(similar_scores) - np.mean(dissimilar_scores))
                    if similar_scores and dissimilar_scores else 0.0),
        similar_results=similar_results,
        dissimilar_results=dissimilar_results
    )

    print("\n" + "-" * 60)
    print("  SUMMARY")
    print("-" * 60)

    if similar_scores:
        print(f"  Similar pairs avg:    {metrics.similar_pairs_avg:.3f}")
        print(f"  Similar pairs min:    {metrics.similar_pairs_min:.3f}")

    if dissimilar_scores:
        print(f"  Dissimilar pairs avg: {metrics.dissimilar_pairs_avg:.3f}")
        print(f"  Dissimilar pairs max: {metrics.dissimilar_pairs_max:.3f}")

    print(f"  Separation (similar - dissimilar): {metrics.separation:.3f}")

    # Quality assessment against fixed thresholds.
    print("\n📈 Quality Assessment")

    if metrics.similar_pairs_avg >= 0.6:
        print("  ✅ Similar pairs: GOOD (avg ≥ 0.6)")
    elif metrics.similar_pairs_avg >= 0.4:
        print("  ⚠️ Similar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Similar pairs: POOR (avg < 0.4)")

    if metrics.dissimilar_pairs_avg <= 0.4:
        print("  ✅ Dissimilar pairs: GOOD (avg ≤ 0.4)")
    elif metrics.dissimilar_pairs_avg <= 0.6:
        print("  ⚠️ Dissimilar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Dissimilar pairs: POOR (avg > 0.6)")

    if metrics.separation >= 0.3:
        print("  ✅ Separation: GOOD (≥ 0.3)")
    elif metrics.separation >= 0.15:
        print("  ⚠️ Separation: FAIR (0.15-0.3)")
    else:
        print("  ❌ Separation: POOR (< 0.15)")

    return metrics
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
if __name__ == "__main__":
    # CLI entry point: eval_embeddings.py <queries.json>
    if len(sys.argv) < 2:
        print("Usage: python scripts/eval_embeddings.py queries.json")
        print("\nExample:")
        print("  python scripts/eval_embeddings.py tests/eval_data/queries.json")
        sys.exit(1)

    queries_file = sys.argv[1]

    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = evaluate_embeddings(queries_file)

    # Non-zero exit when separation is below the FAIR floor (0.15), so a
    # CI job can fail on poor embedding quality.
    if metrics and metrics.separation < 0.15:
        sys.exit(1)
|
scripts/eval_parsing.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Automated parsing quality evaluation.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/eval_parsing.py tests/eval_data/documents
|
| 7 |
+
|
| 8 |
+
Measures:
|
| 9 |
+
- Element extraction counts
|
| 10 |
+
- Structure preservation (tables, headings)
|
| 11 |
+
- Format coverage
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
import json
from pathlib import Path
from collections import Counter
from dataclasses import asdict, dataclass, field
from typing import List, Dict, Any
|
| 20 |
+
|
| 21 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 22 |
+
|
| 23 |
+
from src.ingestion.docling_loader import (
|
| 24 |
+
load_documents_with_docling,
|
| 25 |
+
SUPPORTED_EXTENSIONS
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class ParsingMetrics:
    """Metrics for parsing quality evaluation.

    Counters are filled in by evaluate_parsing(). Mutable containers use
    ``field(default_factory=...)`` instead of the original ``None`` defaults
    plus ``__post_init__`` conversion, which also fixes the annotations
    (they were declared ``Dict[...] = None`` / ``List[...] = None``).
    """
    total_documents: int = 0          # all documents seen, including failures
    successful_documents: int = 0     # documents with status == "OK"
    failed_documents: int = 0         # documents with any other status
    total_elements: int = 0           # elements across successful documents
    total_chars: int = 0              # characters across successful documents
    elements_by_type: Dict[str, int] = field(default_factory=dict)
    formats_processed: Dict[str, int] = field(default_factory=dict)
    avg_elements_per_doc: float = 0.0
    avg_chars_per_doc: float = 0.0
    documents_with_tables: int = 0
    documents_with_headings: int = 0
    issues: List[str] = field(default_factory=list)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def evaluate_parsing(docs_dir: str) -> ParsingMetrics:
    """Evaluate parsing quality across all documents in *docs_dir* (recursive)."""
    parsed = load_documents_with_docling(docs_dir, recursive=True)

    result = ParsingMetrics()
    result.total_documents = len(parsed)

    type_tally = Counter()
    fmt_tally = Counter()

    for document in parsed:
        fmt_tally[document.format] += 1

        # Failed/skipped documents become issues and are excluded from
        # the element statistics.
        if document.status != "OK":
            result.failed_documents += 1
            result.issues.append(
                f"{document.filename}: {document.status} - {document.error}")
            continue

        n_elements = len(document.elements)
        result.successful_documents += 1
        result.total_elements += n_elements
        result.total_chars += document.chars

        per_doc = Counter(el.element_type for el in document.elements)
        type_tally.update(per_doc)

        # Structure-preservation signals.
        if per_doc.get("table", 0) > 0:
            result.documents_with_tables += 1
        if per_doc.get("heading", 0) > 0:
            result.documents_with_headings += 1

        # Suspiciously sparse extractions get flagged for review.
        if n_elements == 0:
            result.issues.append(f"{document.filename}: No elements extracted")
        elif n_elements < 3:
            result.issues.append(
                f"{document.filename}: Very few elements ({n_elements})")

    ok = result.successful_documents
    if ok > 0:
        result.avg_elements_per_doc = result.total_elements / ok
        result.avg_chars_per_doc = result.total_chars / ok

    result.elements_by_type = dict(type_tally)
    result.formats_processed = dict(fmt_tally)

    return result
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def print_report(metrics: ParsingMetrics):
    """Print evaluation report.

    Writes a human-readable report for *metrics* to stdout and returns the
    overall 0-100 score from calculate_quality_score() so the caller can
    use it as an exit-code signal.
    """

    print("\n" + "=" * 60)
    print("  PARSING QUALITY EVALUATION REPORT")
    print("=" * 60)

    # Document stats
    print("\n📄 Document Statistics")
    print(f"  Total documents: {metrics.total_documents}")
    print(f"  Successful: {metrics.successful_documents}")
    print(f"  Failed: {metrics.failed_documents}")

    # Guard against division by zero on an empty corpus.
    success_rate = (metrics.successful_documents / metrics.total_documents * 100
                    if metrics.total_documents > 0 else 0)
    print(f"  Success rate: {success_rate:.1f}%")

    # Format breakdown
    print("\n📁 Formats Processed")
    for fmt, count in sorted(metrics.formats_processed.items()):
        print(f"  {fmt}: {count}")

    # Element stats
    print("\n🔢 Element Statistics")
    print(f"  Total elements: {metrics.total_elements}")
    print(f"  Total characters: {metrics.total_chars:,}")
    print(f"  Avg elements/doc: {metrics.avg_elements_per_doc:.1f}")
    print(f"  Avg chars/doc: {metrics.avg_chars_per_doc:,.0f}")

    # Element types, most frequent first
    print("\n📊 Element Types")
    for el_type, count in sorted(metrics.elements_by_type.items(), key=lambda x: -x[1]):
        print(f"  {el_type}: {count}")

    # Structure detection
    print("\n🏗️ Structure Detection")
    print(f"  Documents with tables: {metrics.documents_with_tables}")
    print(f"  Documents with headings: {metrics.documents_with_headings}")

    # Issues (capped at 10 to keep the report readable)
    if metrics.issues:
        print("\n⚠️ Issues Found")
        for issue in metrics.issues[:10]:
            print(f"  - {issue}")
        if len(metrics.issues) > 10:
            print(f"  ... and {len(metrics.issues) - 10} more")
    else:
        print("\n✅ No issues detected")

    # Quality score
    print("\n📈 Quality Score")
    score = calculate_quality_score(metrics)
    print(f"  Overall: {score:.0f}/100")

    return score
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
def calculate_quality_score(metrics: ParsingMetrics) -> float:
    """Calculate overall quality score (0-100).

    Weighting: success rate up to 40 points, element extraction up to 30,
    structure detection (tables + headings) up to 20, plus a 10-point bonus
    when no issues were recorded. Capped at 100.
    """
    if metrics.total_documents == 0:
        return 0.0

    # Success rate (40 points max)
    points = (metrics.successful_documents / metrics.total_documents) * 40

    # Element extraction (30 points max), banded by average density.
    avg = metrics.avg_elements_per_doc
    if avg > 10:
        points += 30
    elif avg > 5:
        points += 20
    elif avg > 1:
        points += 10

    # Structure detection (20 points max).
    ok = metrics.successful_documents
    if ok > 0:
        structure_rate = (metrics.documents_with_tables
                          + metrics.documents_with_headings) / ok
        points += structure_rate * 10

    # No-issues bonus (10 points).
    if not metrics.issues:
        points += 10

    return min(points, 100)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
if __name__ == "__main__":
    # CLI entry point: eval_parsing.py <docs_dir> [--json]
    if len(sys.argv) < 2:
        print("Usage: python scripts/eval_parsing.py /path/to/documents")
        sys.exit(1)

    docs_dir = sys.argv[1]

    if not Path(docs_dir).is_dir():
        print(f"Error: Directory not found: {docs_dir}")
        sys.exit(1)

    metrics = evaluate_parsing(docs_dir)
    score = print_report(metrics)

    # Output JSON if requested (machine-readable dump of all metrics)
    if "--json" in sys.argv:
        print("\n" + json.dumps(asdict(metrics), indent=2))

    # Exit with error if score is too low, so CI can gate on quality
    if score < 50:
        sys.exit(1)
|
scripts/eval_retrieval.py
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Retrieval quality evaluation.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/eval_retrieval.py tests/eval_data/queries.json
|
| 7 |
+
|
| 8 |
+
Measures:
|
| 9 |
+
- Precision@k
|
| 10 |
+
- Recall@k
|
| 11 |
+
- Mean Reciprocal Rank (MRR)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import json
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from dataclasses import dataclass
|
| 18 |
+
from typing import List, Dict, Set, Optional
|
| 19 |
+
|
| 20 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class RetrievalMetrics:
    """Metrics for a single query."""
    query_id: str              # identifier from the queries file
    query: str                 # the query text
    precision_at_k: float      # relevant hits in top-k / k
    recall_at_k: float         # relevant hits in top-k / total relevant
    reciprocal_rank: float     # 1/rank of first relevant hit; 0.0 if none in top-k
    retrieved_ids: List[str]   # the top-k retrieved chunk ids, in rank order
    relevant_found: List[str]  # gold chunks that appeared in the top-k
    relevant_missed: List[str] # gold chunks absent from the top-k
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
class AggregateMetrics:
    """Aggregate metrics across all queries."""
    total_queries: int     # number of queries actually evaluated
    mean_precision: float  # average precision@k over all queries
    mean_recall: float     # average recall@k over all queries
    mrr: float  # Mean Reciprocal Rank
    queries_with_hits: int # queries with >= 1 relevant result in top-k
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def evaluate_single_query(
    query_id: str,
    query: str,
    relevant_chunks: Set[str],
    retrieved_chunks: List[str],
    k: int = 5
) -> RetrievalMetrics:
    """Evaluate retrieval for a single query.

    Computes precision@k, recall@k and the reciprocal rank of the first
    relevant hit among the top-k retrieved chunk ids.
    """
    ranked = retrieved_chunks[:k]
    ranked_set = set(ranked)

    # Gold chunks that made it into the top-k.
    hits = ranked_set & relevant_chunks

    # Precision@k: relevant in top-k / k; Recall@k: relevant in top-k / gold.
    precision = len(hits) / k if k > 0 else 0.0
    recall = len(hits) / len(relevant_chunks) if relevant_chunks else 0.0

    # Reciprocal rank of the first relevant result (0.0 when none).
    rr = next(
        (1.0 / (rank + 1)
         for rank, chunk_id in enumerate(ranked)
         if chunk_id in relevant_chunks),
        0.0,
    )

    return RetrievalMetrics(
        query_id=query_id,
        query=query,
        precision_at_k=precision,
        recall_at_k=recall,
        reciprocal_rank=rr,
        retrieved_ids=ranked,
        relevant_found=list(hits),
        relevant_missed=list(relevant_chunks - ranked_set)
    )
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def run_retrieval_eval(
    queries_file: str,
    k: int = 5,
    use_mock: bool = False
) -> AggregateMetrics:
    """Run retrieval evaluation from queries file.

    Args:
        queries_file: JSON file with a "queries" list; each entry needs
            "id", "query", and "relevant_chunks" (gold chunk ids).
        k: Cutoff for precision/recall over the top-k retrieved results.
        use_mock: When True (or when hybrid_search cannot be imported),
            fabricate results from the gold labels instead of querying
            the real retrieval stack.

    Returns:
        AggregateMetrics over all evaluated queries, or None when the file
        has no queries or none could be evaluated.
    """

    with open(queries_file, 'r') as f:
        data = json.load(f)

    queries = data.get("queries", [])

    if not queries:
        print("No queries found in file")
        return None

    # Import retrieval function
    if not use_mock:
        try:
            from src.retrieval.hybrid import hybrid_search
        except ImportError:
            # Fall back to mock mode so the script still runs without the stack.
            print("Warning: Could not import hybrid_search, using mock")
            use_mock = True

    all_metrics = []

    print("\n" + "=" * 60)
    print("  RETRIEVAL QUALITY EVALUATION")
    print("=" * 60)

    for q in queries:
        query_id = q.get("id", "unknown")
        query_text = q.get("query", "")
        relevant = set(q.get("relevant_chunks", []))

        if not relevant:
            # A query without gold labels cannot be scored.
            print(f"\n⚠️ Query {query_id}: No relevant chunks defined, skipping")
            continue

        print(f"\n📝 Query {query_id}: {query_text[:50]}...")

        # Get retrieval results
        if use_mock:
            # Mock results for testing without Pinecone
            retrieved = list(relevant)[:k] + ["mock::0", "mock::1"]
        else:
            try:
                results = hybrid_search(query_text, top_k=k)
                retrieved = [r.get("id", "") for r in results]
            except Exception as e:
                # A failed search counts as zero retrieved (scores drop to 0).
                print(f"  Error: {e}")
                retrieved = []

        # Evaluate
        metrics = evaluate_single_query(
            query_id=query_id,
            query=query_text,
            relevant_chunks=relevant,
            retrieved_chunks=retrieved,
            k=k
        )
        all_metrics.append(metrics)

        # Print per-query results
        print(f"  Precision@{k}: {metrics.precision_at_k:.2f}")
        print(f"  Recall@{k}: {metrics.recall_at_k:.2f}")
        print(f"  Reciprocal Rank: {metrics.reciprocal_rank:.2f}")
        if metrics.relevant_found:
            print(f"  ✅ Found: {metrics.relevant_found}")
        if metrics.relevant_missed:
            print(f"  ❌ Missed: {metrics.relevant_missed}")

    # Aggregate
    if not all_metrics:
        print("\nNo queries evaluated")
        return None

    aggregate = AggregateMetrics(
        total_queries=len(all_metrics),
        mean_precision=sum(m.precision_at_k for m in all_metrics) / len(all_metrics),
        mean_recall=sum(m.recall_at_k for m in all_metrics) / len(all_metrics),
        mrr=sum(m.reciprocal_rank for m in all_metrics) / len(all_metrics),
        queries_with_hits=sum(1 for m in all_metrics if m.reciprocal_rank > 0)
    )

    # Print summary
    print("\n" + "-" * 60)
    print("  SUMMARY")
    print("-" * 60)
    print(f"  Total queries: {aggregate.total_queries}")
    print(f"  Mean Precision@{k}: {aggregate.mean_precision:.2f}")
    print(f"  Mean Recall@{k}: {aggregate.mean_recall:.2f}")
    print(f"  MRR: {aggregate.mrr:.2f}")
    print(f"  Queries with hits: {aggregate.queries_with_hits}/{aggregate.total_queries}")

    # Quality assessment against fixed thresholds
    print("\n📊 Quality Assessment")
    if aggregate.mean_precision >= 0.6:
        print("  ✅ Precision: GOOD (≥60%)")
    elif aggregate.mean_precision >= 0.4:
        print("  ⚠️ Precision: FAIR (40-60%)")
    else:
        print("  ❌ Precision: POOR (<40%)")

    if aggregate.mrr >= 0.5:
        print("  ✅ MRR: GOOD (≥0.5)")
    elif aggregate.mrr >= 0.3:
        print("  ⚠️ MRR: FAIR (0.3-0.5)")
    else:
        print("  ❌ MRR: POOR (<0.3)")

    return aggregate
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
if __name__ == "__main__":
    # CLI entry point: eval_retrieval.py <queries.json> [--mock] [--k=N]
    if len(sys.argv) < 2:
        print("Usage: python scripts/eval_retrieval.py queries.json [--mock]")
        print("\nExample:")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json --mock")
        sys.exit(1)

    queries_file = sys.argv[1]
    use_mock = "--mock" in sys.argv
    k = 5

    # Parse k value if provided
    for arg in sys.argv:
        if arg.startswith("--k="):
            # NOTE(review): a non-integer value raises an uncaught ValueError
            # here — confirm whether a friendlier usage error is wanted.
            k = int(arg.split("=")[1])

    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = run_retrieval_eval(queries_file, k=k, use_mock=use_mock)

    # Non-zero exit when mean precision is below the FAIR floor (0.4),
    # so CI can gate on retrieval quality.
    if metrics and metrics.mean_precision < 0.4:
        sys.exit(1)
|
scripts/eval_spot_check.py
ADDED
|
@@ -0,0 +1,185 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Quick spot check for Docling parsing quality.
|
| 4 |
+
|
| 5 |
+
Usage:
|
| 6 |
+
python scripts/eval_spot_check.py /path/to/documents
|
| 7 |
+
python scripts/eval_spot_check.py /path/to/single/file.pdf
|
| 8 |
+
|
| 9 |
+
Outputs a visual summary of how Docling parsed each document.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import sys
|
| 13 |
+
import os
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from collections import Counter
|
| 16 |
+
|
| 17 |
+
# Add project root to path
|
| 18 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 19 |
+
|
| 20 |
+
from src.ingestion.docling_loader import (
|
| 21 |
+
load_document_with_docling,
|
| 22 |
+
load_documents_with_docling,
|
| 23 |
+
SUPPORTED_EXTENSIONS,
|
| 24 |
+
ParsedDocument
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def print_header(text: str, char: str = "="):
    """Print a header: a blank line, a 60-char rule, the title, another rule."""
    rule = char * 60
    print(f"\n{rule}\n {text}\n{rule}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def analyze_document(doc: ParsedDocument, verbose: bool = True) -> dict:
    """Analyze a single parsed document and return metrics.

    Args:
        doc: Parsed document produced by the Docling loader.
        verbose: When True, also print a human-readable breakdown with
            sample elements and a first-table preview.

    Returns:
        Dict with filename/format/status, size counters, per-type element
        counts, and a list of detected issue strings.
    """

    # Count elements by type
    type_counts = Counter(el.element_type for el in doc.elements)

    # Check for potential issues
    issues = []
    if doc.status != "OK":
        issues.append(f"Status: {doc.status} - {doc.error}")
    if len(doc.elements) == 0:
        issues.append("No elements extracted!")
    if doc.chars == 0:
        issues.append("Zero characters extracted!")
    if type_counts.get("table", 0) == 0 and doc.format == ".pdf":
        # PDFs often have tables - flag if none found
        issues.append("No tables detected (may be expected)")

    # Calculate metrics
    metrics = {
        "filename": doc.filename,
        "format": doc.format,
        "status": doc.status,
        "total_elements": len(doc.elements),
        "total_chars": doc.chars,
        "total_words": doc.words,
        "page_count": doc.page_count,
        "element_types": dict(type_counts),
        "issues": issues
    }

    if verbose:
        print_header(f"{doc.filename} ({doc.format})", "-")
        print(f"  Status: {doc.status}")
        print(f"  Elements: {len(doc.elements)}")
        print(f"  Characters: {doc.chars:,}")
        print(f"  Words: {doc.words:,}")
        if doc.page_count:
            print(f"  Pages: {doc.page_count}")

        print(f"\n  Element breakdown:")
        for el_type, count in sorted(type_counts.items()):
            print(f"    {el_type}: {count}")

        if issues:
            print(f"\n  ⚠️ Potential issues:")
            for issue in issues:
                print(f"    - {issue}")

        # Show sample elements (truncated to 80 chars, newlines flattened)
        print(f"\n  Sample elements (first 5):")
        for i, el in enumerate(doc.elements[:5]):
            text_preview = el.text[:80].replace('\n', ' ')
            if len(el.text) > 80:
                text_preview += "..."
            print(f"    [{el.element_type}] {text_preview}")

        # Show table preview if any (first 300 chars of the first table)
        tables = [el for el in doc.elements if el.element_type == "table"]
        if tables:
            print(f"\n  Table preview (first table):")
            table_text = tables[0].text[:300].replace('\n', '\n    ')
            print(f"    {table_text}")
            if len(tables[0].text) > 300:
                print("    ...")

    return metrics
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def run_spot_check(path: str, verbose: bool = True):
    """Spot-check Docling parsing for a single file or a directory tree.

    Parses the target(s), optionally printing per-document details, then
    prints an aggregate summary: success counts, element-type totals, and
    any potential issues flagged by ``analyze_document``.

    Args:
        path: File or directory to inspect. Directories are walked
            recursively.
        verbose: When True, per-document details are printed as each
            document is analyzed.

    Returns:
        List of per-document metrics dicts (empty when *path* does not
        exist).
    """
    target = Path(path)

    print_header("DOCLING PARSING SPOT CHECK")
    print(f" Path: {target}")
    print(f" Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")

    # Resolve the target into a list of parsed documents.
    if target.is_file():
        docs = [load_document_with_docling(str(target))]
    elif target.is_dir():
        docs = load_documents_with_docling(str(target), recursive=True)
        print(f" Found {len(docs)} documents")
    else:
        print(f" ERROR: Path not found: {target}")
        return []

    all_metrics = [analyze_document(doc, verbose=verbose) for doc in docs]

    # ---- Aggregate summary -------------------------------------------------
    print_header("SUMMARY")

    statuses = [m["status"] for m in all_metrics]
    ok_count = statuses.count("OK")
    total_elements = sum(m["total_elements"] for m in all_metrics)
    total_chars = sum(m["total_chars"] for m in all_metrics)

    print(f" Documents processed: {len(all_metrics)}")
    print(f" Successful (OK): {ok_count}")
    print(f" Failed/Skipped: {len(all_metrics) - ok_count}")
    print(f" Total elements: {total_elements}")
    print(f" Total characters: {total_chars:,}")

    # Merge every document's element-type histogram into one Counter.
    all_types = Counter()
    for m in all_metrics:
        all_types.update(m["element_types"])

    print("\n Element types across all docs:")
    # most_common() orders by descending count, ties in first-seen order —
    # identical to a stable sort on the negated count.
    for el_type, count in all_types.most_common():
        print(f" {el_type}: {count}")

    # Flatten issues, prefixing each with the originating filename.
    all_issues = [
        f"{m['filename']}: {issue}"
        for m in all_metrics
        for issue in m["issues"]
    ]

    if all_issues:
        print("\n ⚠️ Issues found:")
        for issue in all_issues[:10]:
            print(f" - {issue}")
        if len(all_issues) > 10:
            print(f" ... and {len(all_issues) - 10} more")
    else:
        print("\n ✅ No issues detected")

    return all_metrics
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
if __name__ == "__main__":
    # CLI entry point: first positional argument is the target path;
    # an optional --quiet flag anywhere in argv suppresses per-doc detail.
    cli_args = sys.argv[1:]

    if not cli_args:
        print("Usage: python scripts/eval_spot_check.py /path/to/documents")
        print("\nExamples:")
        print(" python scripts/eval_spot_check.py ./tests/eval_data/documents")
        print(" python scripts/eval_spot_check.py ./report.pdf")
        sys.exit(1)

    run_spot_check(cli_args[0], verbose="--quiet" not in cli_args)
|
tests/eval_data/documents/.gitkeep
ADDED
|
File without changes
|
tests/eval_data/queries.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"description": "Test queries for retrieval evaluation",
|
| 3 |
+
"queries": [
|
| 4 |
+
{
|
| 5 |
+
"id": "q1",
|
| 6 |
+
"query": "Example query about your document content",
|
| 7 |
+
"relevant_chunks": ["document.pdf::0", "document.pdf::1"],
|
| 8 |
+
"keywords": ["expected", "keywords", "in", "answer"]
|
| 9 |
+
}
|
| 10 |
+
],
|
| 11 |
+
"similarity_pairs": {
|
| 12 |
+
"similar": [
|
| 13 |
+
["What is the total revenue?", "How much money did we make?"],
|
| 14 |
+
["Describe the methodology", "What methods were used?"]
|
| 15 |
+
],
|
| 16 |
+
"dissimilar": [
|
| 17 |
+
["What is the revenue?", "Who founded the company?"],
|
| 18 |
+
["Technical specifications", "Company history"]
|
| 19 |
+
]
|
| 20 |
+
}
|
| 21 |
+
}
|