vn6295337 Claude Opus 4.5 committed on
Commit
c6a48e0
·
1 Parent(s): 3a9dfa1

Add evaluation framework for Docling + RAG pipeline

Browse files

- eval_spot_check.py: Manual parsing inspection
- eval_parsing.py: Automated structure metrics
- eval_retrieval.py: Precision/recall/MRR
- eval_embeddings.py: Semantic similarity tests
- tests/eval_data/: Test document structure

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

scripts/eval_embeddings.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Embedding quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_embeddings.py tests/eval_data/queries.json
7
+
8
+ Measures:
9
+ - Cosine similarity for similar text pairs (should be high)
10
+ - Cosine similarity for dissimilar text pairs (should be low)
11
+ """
12
+
13
+ import sys
14
+ import json
15
+ import numpy as np
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from typing import List, Tuple
19
+
20
+ sys.path.insert(0, str(Path(__file__).parent.parent))
21
+
22
+
23
@dataclass
class EmbeddingMetrics:
    """Summary statistics for one embedding-quality run.

    Holds aggregate cosine-similarity scores for pairs that should be
    close ("similar") and pairs that should be far apart ("dissimilar"),
    plus the raw per-pair results as (text1, text2, score) tuples.
    """
    similar_pairs_avg: float      # mean score over similar pairs
    similar_pairs_min: float      # worst (lowest) score among similar pairs
    dissimilar_pairs_avg: float   # mean score over dissimilar pairs
    dissimilar_pairs_max: float   # worst (highest) score among dissimilar pairs
    separation: float             # similar_pairs_avg - dissimilar_pairs_avg
    similar_results: List[Tuple[str, str, float]]
    dissimilar_results: List[Tuple[str, str, float]]
33
+
34
+
35
+ def cosine_similarity(a: List[float], b: List[float]) -> float:
36
+ """Calculate cosine similarity between two vectors."""
37
+ a = np.array(a)
38
+ b = np.array(b)
39
+ return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
40
+
41
+
42
def get_embedding(text: str, model=None) -> List[float]:
    """Embed *text* with sentence-transformers and return a plain list.

    If *model* is None, a fresh all-MiniLM-L6-v2 model is loaded on every
    call — callers embedding many texts should pass a shared model.
    """
    encoder = model
    if encoder is None:
        from sentence_transformers import SentenceTransformer
        encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder.encode(text, convert_to_numpy=True).tolist()
50
+
51
+
52
def evaluate_embeddings(queries_file: str) -> "EmbeddingMetrics | None":
    """Evaluate embedding quality using similarity pairs.

    Loads ``similarity_pairs`` from *queries_file*, scores every pair with
    the all-MiniLM-L6-v2 sentence-transformer, prints a report, and
    returns the aggregate metrics.

    Args:
        queries_file: Path to a JSON file with a ``similarity_pairs`` key
            holding ``similar`` and ``dissimilar`` lists of text pairs.

    Returns:
        EmbeddingMetrics, or None when the file defines no pairs
        (the original annotation hid this None path).
    """
    with open(queries_file, 'r') as f:
        data = json.load(f)

    similarity_pairs = data.get("similarity_pairs", {})
    similar = similarity_pairs.get("similar", [])
    dissimilar = similarity_pairs.get("dissimilar", [])

    if not similar and not dissimilar:
        print("No similarity pairs found in queries file")
        print("Expected format:")
        print('''  "similarity_pairs": {
    "similar": [["text1", "text2"], ...],
    "dissimilar": [["text1", "text2"], ...]
  }''')
        return None

    print("\n" + "=" * 60)
    print(" EMBEDDING QUALITY EVALUATION")
    print("=" * 60)

    # Load the model once and reuse it for every pair.
    print("\nLoading embedding model...")
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Model: all-MiniLM-L6-v2 (384 dimensions)")

    def _score_pairs(pairs, status_for):
        """Score each (text1, text2) pair; return (scores, results).

        The original duplicated this loop for similar and dissimilar pairs
        and re-derived the cosine formula inline; reuse cosine_similarity.
        """
        scores, results = [], []
        for pair in pairs:
            if len(pair) != 2:
                continue  # skip malformed entries
            text1, text2 = pair
            emb1 = model.encode(text1, convert_to_numpy=True)
            emb2 = model.encode(text2, convert_to_numpy=True)
            score = cosine_similarity(emb1, emb2)
            scores.append(score)
            results.append((text1, text2, score))
            print(f"  {status_for(score)} {score:.3f}: \"{text1[:30]}...\" vs \"{text2[:30]}...\"")
        return scores, results

    print(f"\n📊 Similar Pairs ({len(similar)} pairs)")
    print("   Expected: cosine similarity > 0.6")
    print()
    similar_scores, similar_results = _score_pairs(
        similar, lambda s: "✅" if s > 0.6 else "⚠️" if s > 0.4 else "❌")

    print(f"\n📊 Dissimilar Pairs ({len(dissimilar)} pairs)")
    print("   Expected: cosine similarity < 0.4")
    print()
    dissimilar_scores, dissimilar_results = _score_pairs(
        dissimilar, lambda s: "✅" if s < 0.4 else "⚠️" if s < 0.6 else "❌")

    # Aggregate; cast numpy scalars to plain floats to match the
    # dataclass's declared field types.
    metrics = EmbeddingMetrics(
        similar_pairs_avg=float(np.mean(similar_scores)) if similar_scores else 0.0,
        similar_pairs_min=float(np.min(similar_scores)) if similar_scores else 0.0,
        dissimilar_pairs_avg=float(np.mean(dissimilar_scores)) if dissimilar_scores else 0.0,
        dissimilar_pairs_max=float(np.max(dissimilar_scores)) if dissimilar_scores else 0.0,
        separation=(float(np.mean(similar_scores)) - float(np.mean(dissimilar_scores))
                    if similar_scores and dissimilar_scores else 0.0),
        similar_results=similar_results,
        dissimilar_results=dissimilar_results
    )

    print("\n" + "-" * 60)
    print(" SUMMARY")
    print("-" * 60)

    if similar_scores:
        print(f"  Similar pairs avg: {metrics.similar_pairs_avg:.3f}")
        print(f"  Similar pairs min: {metrics.similar_pairs_min:.3f}")

    if dissimilar_scores:
        print(f"  Dissimilar pairs avg: {metrics.dissimilar_pairs_avg:.3f}")
        print(f"  Dissimilar pairs max: {metrics.dissimilar_pairs_max:.3f}")

    print(f"  Separation (similar - dissimilar): {metrics.separation:.3f}")

    # Threshold-based quality verdicts.
    print("\n📈 Quality Assessment")

    if metrics.similar_pairs_avg >= 0.6:
        print("  ✅ Similar pairs: GOOD (avg ≥ 0.6)")
    elif metrics.similar_pairs_avg >= 0.4:
        print("  ⚠️ Similar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Similar pairs: POOR (avg < 0.4)")

    if metrics.dissimilar_pairs_avg <= 0.4:
        print("  ✅ Dissimilar pairs: GOOD (avg ≤ 0.4)")
    elif metrics.dissimilar_pairs_avg <= 0.6:
        print("  ⚠️ Dissimilar pairs: FAIR (avg 0.4-0.6)")
    else:
        print("  ❌ Dissimilar pairs: POOR (avg > 0.6)")

    if metrics.separation >= 0.3:
        print("  ✅ Separation: GOOD (≥ 0.3)")
    elif metrics.separation >= 0.15:
        print("  ⚠️ Separation: FAIR (0.15-0.3)")
    else:
        print("  ❌ Separation: POOR (< 0.15)")

    return metrics
175
+
176
+
177
if __name__ == "__main__":
    args = sys.argv[1:]
    if not args:
        print("Usage: python scripts/eval_embeddings.py queries.json")
        print("\nExample:")
        print("  python scripts/eval_embeddings.py tests/eval_data/queries.json")
        sys.exit(1)

    queries_file = args[0]
    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = evaluate_embeddings(queries_file)

    # Non-zero exit when separation is poor, so CI can gate on it.
    if metrics is not None and metrics.separation < 0.15:
        sys.exit(1)
scripts/eval_parsing.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Automated parsing quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_parsing.py tests/eval_data/documents
7
+
8
+ Measures:
9
+ - Element extraction counts
10
+ - Structure preservation (tables, headings)
11
+ - Format coverage
12
+ """
13
+
14
import sys
import json
from pathlib import Path
from collections import Counter
from dataclasses import dataclass, asdict, field
from typing import List, Dict, Any

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.ingestion.docling_loader import (
    load_documents_with_docling,
    SUPPORTED_EXTENSIONS
)
27
+
28
+
29
@dataclass
class ParsingMetrics:
    """Aggregated metrics for a parsing-quality evaluation run.

    All counters default to zero/empty so an instance can be built up
    incrementally by evaluate_parsing().
    """
    total_documents: int = 0
    successful_documents: int = 0
    failed_documents: int = 0
    total_elements: int = 0
    total_chars: int = 0
    # field(default_factory=...) replaces the original None-defaults plus
    # __post_init__ dance, and guarantees each instance gets its own
    # fresh container (the idiomatic dataclass pattern for mutable defaults).
    elements_by_type: Dict[str, int] = field(default_factory=dict)
    formats_processed: Dict[str, int] = field(default_factory=dict)
    avg_elements_per_doc: float = 0.0
    avg_chars_per_doc: float = 0.0
    documents_with_tables: int = 0
    documents_with_headings: int = 0
    issues: List[str] = field(default_factory=list)
52
+
53
+
54
def evaluate_parsing(docs_dir: str) -> ParsingMetrics:
    """Evaluate parsing quality across all documents in a directory.

    Loads every supported document under *docs_dir* (recursively) and
    folds per-document element counts, structure hits, and parsing issues
    into a single ParsingMetrics.
    """
    parsed_docs = load_documents_with_docling(docs_dir, recursive=True)

    metrics = ParsingMetrics()
    metrics.total_documents = len(parsed_docs)

    type_totals = Counter()
    fmt_totals = Counter()

    for parsed in parsed_docs:
        fmt_totals[parsed.format] += 1

        if parsed.status != "OK":
            metrics.failed_documents += 1
            metrics.issues.append(f"{parsed.filename}: {parsed.status} - {parsed.error}")
            continue

        metrics.successful_documents += 1
        metrics.total_elements += len(parsed.elements)
        metrics.total_chars += parsed.chars

        # Per-document element histogram feeds the global one.
        per_doc_types = Counter(el.element_type for el in parsed.elements)
        type_totals.update(per_doc_types)

        if per_doc_types.get("table", 0) > 0:
            metrics.documents_with_tables += 1
        if per_doc_types.get("heading", 0) > 0:
            metrics.documents_with_headings += 1

        # Flag suspiciously sparse extractions.
        element_count = len(parsed.elements)
        if element_count == 0:
            metrics.issues.append(f"{parsed.filename}: No elements extracted")
        elif element_count < 3:
            metrics.issues.append(f"{parsed.filename}: Very few elements ({element_count})")

    if metrics.successful_documents > 0:
        metrics.avg_elements_per_doc = metrics.total_elements / metrics.successful_documents
        metrics.avg_chars_per_doc = metrics.total_chars / metrics.successful_documents

    metrics.elements_by_type = dict(type_totals)
    metrics.formats_processed = dict(fmt_totals)

    return metrics
102
+
103
+
104
def print_report(metrics: ParsingMetrics):
    """Print a human-readable evaluation report; return the quality score."""
    print("\n" + "=" * 60)
    print(" PARSING QUALITY EVALUATION REPORT")
    print("=" * 60)

    # Document-level stats.
    print("\n📄 Document Statistics")
    print(f"  Total documents: {metrics.total_documents}")
    print(f"  Successful: {metrics.successful_documents}")
    print(f"  Failed: {metrics.failed_documents}")

    if metrics.total_documents > 0:
        success_rate = metrics.successful_documents / metrics.total_documents * 100
    else:
        success_rate = 0
    print(f"  Success rate: {success_rate:.1f}%")

    print("\n📁 Formats Processed")
    for fmt, count in sorted(metrics.formats_processed.items()):
        print(f"  {fmt}: {count}")

    print("\n🔢 Element Statistics")
    print(f"  Total elements: {metrics.total_elements}")
    print(f"  Total characters: {metrics.total_chars:,}")
    print(f"  Avg elements/doc: {metrics.avg_elements_per_doc:.1f}")
    print(f"  Avg chars/doc: {metrics.avg_chars_per_doc:,.0f}")

    # Element types, most frequent first.
    print("\n📊 Element Types")
    for el_type, count in sorted(metrics.elements_by_type.items(), key=lambda kv: -kv[1]):
        print(f"  {el_type}: {count}")

    print("\n🏗️ Structure Detection")
    print(f"  Documents with tables: {metrics.documents_with_tables}")
    print(f"  Documents with headings: {metrics.documents_with_headings}")

    # Issues are truncated to the first ten to keep the report short.
    if metrics.issues:
        print("\n⚠️ Issues Found")
        for issue in metrics.issues[:10]:
            print(f"  - {issue}")
        overflow = len(metrics.issues) - 10
        if overflow > 0:
            print(f"  ... and {overflow} more")
    else:
        print("\n✅ No issues detected")

    print("\n📈 Quality Score")
    score = calculate_quality_score(metrics)
    print(f"  Overall: {score:.0f}/100")

    return score
159
+
160
+
161
def calculate_quality_score(metrics: ParsingMetrics) -> float:
    """Combine parsing metrics into a single 0-100 quality score.

    Weights: success rate up to 40, element richness up to 30, structure
    detection up to 20, zero-issue bonus 10; capped at 100.
    """
    if metrics.total_documents == 0:
        return 0.0

    # Success rate contributes up to 40 points.
    score = (metrics.successful_documents / metrics.total_documents) * 40

    # Element extraction richness: tiered, up to 30 points.
    avg = metrics.avg_elements_per_doc
    if avg > 10:
        score += 30
    elif avg > 5:
        score += 20
    elif avg > 1:
        score += 10

    # Structure detection: table + heading rates, up to 20 points.
    if metrics.successful_documents > 0:
        table_rate = metrics.documents_with_tables / metrics.successful_documents
        heading_rate = metrics.documents_with_headings / metrics.successful_documents
        score += (table_rate + heading_rate) * 10

    # Bonus for a run with no flagged issues.
    if not metrics.issues:
        score += 10

    return min(score, 100)
192
+
193
+
194
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_parsing.py /path/to/documents")
        sys.exit(1)

    target_dir = argv[1]
    if not Path(target_dir).is_dir():
        print(f"Error: Directory not found: {target_dir}")
        sys.exit(1)

    parsing_metrics = evaluate_parsing(target_dir)
    overall_score = print_report(parsing_metrics)

    # Optional machine-readable dump of the raw metrics.
    if "--json" in argv:
        print("\n" + json.dumps(asdict(parsing_metrics), indent=2))

    # Non-zero exit lets CI gate on parsing quality.
    if overall_score < 50:
        sys.exit(1)
scripts/eval_retrieval.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Retrieval quality evaluation.
4
+
5
+ Usage:
6
+ python scripts/eval_retrieval.py tests/eval_data/queries.json
7
+
8
+ Measures:
9
+ - Precision@k
10
+ - Recall@k
11
+ - Mean Reciprocal Rank (MRR)
12
+ """
13
+
14
+ import sys
15
+ import json
16
+ from pathlib import Path
17
+ from dataclasses import dataclass
18
+ from typing import List, Dict, Set, Optional
19
+
20
+ sys.path.insert(0, str(Path(__file__).parent.parent))
21
+
22
+
23
@dataclass
class RetrievalMetrics:
    """Retrieval scores for a single query."""
    query_id: str
    query: str
    precision_at_k: float    # |relevant ∩ top-k| / k
    recall_at_k: float       # |relevant ∩ top-k| / |relevant|
    reciprocal_rank: float   # 1 / rank of first relevant hit (0 if none)
    retrieved_ids: List[str]
    relevant_found: List[str]
    relevant_missed: List[str]


@dataclass
class AggregateMetrics:
    """Scores averaged over every evaluated query."""
    total_queries: int
    mean_precision: float
    mean_recall: float
    mrr: float  # Mean Reciprocal Rank
    queries_with_hits: int


def evaluate_single_query(
    query_id: str,
    query: str,
    relevant_chunks: Set[str],
    retrieved_chunks: List[str],
    k: int = 5
) -> RetrievalMetrics:
    """Score one query's retrieval against its gold relevant chunk ids."""
    top_k = retrieved_chunks[:k]
    top_k_set = set(top_k)
    hits = top_k_set & relevant_chunks

    # Precision@k uses k as the denominator even if fewer were retrieved.
    precision = len(hits) / k if k > 0 else 0.0
    recall = len(hits) / len(relevant_chunks) if relevant_chunks else 0.0

    # Reciprocal rank of the first relevant result within the top-k.
    reciprocal_rank = next(
        (1.0 / (rank + 1) for rank, cid in enumerate(top_k) if cid in relevant_chunks),
        0.0,
    )

    return RetrievalMetrics(
        query_id=query_id,
        query=query,
        precision_at_k=precision,
        recall_at_k=recall,
        reciprocal_rank=reciprocal_rank,
        retrieved_ids=top_k,
        relevant_found=list(hits),
        relevant_missed=list(relevant_chunks - top_k_set),
    )
82
+
83
+
84
def run_retrieval_eval(
    queries_file: str,
    k: int = 5,
    use_mock: bool = False
) -> Optional[AggregateMetrics]:
    """Run retrieval evaluation from a queries JSON file.

    Args:
        queries_file: JSON file with a ``queries`` list; each query needs
            ``id``, ``query`` and ``relevant_chunks``.
        k: Cutoff for precision/recall/reciprocal-rank.
        use_mock: Fabricate retrieval results instead of calling the real
            retriever (useful without a vector store).

    Returns:
        AggregateMetrics, or None when no query could be evaluated.
        (Fixed annotation: the original promised a bare AggregateMetrics
        although both early-exit paths return None.)
    """
    with open(queries_file, 'r') as f:
        data = json.load(f)

    queries = data.get("queries", [])
    if not queries:
        print("No queries found in file")
        return None

    # Fall back to mock results when the real retriever isn't importable.
    if not use_mock:
        try:
            from src.retrieval.hybrid import hybrid_search
        except ImportError:
            print("Warning: Could not import hybrid_search, using mock")
            use_mock = True

    all_metrics = []

    print("\n" + "=" * 60)
    print(" RETRIEVAL QUALITY EVALUATION")
    print("=" * 60)

    for q in queries:
        query_id = q.get("id", "unknown")
        query_text = q.get("query", "")
        relevant = set(q.get("relevant_chunks", []))

        # A query without gold labels cannot be scored.
        if not relevant:
            print(f"\n⚠️ Query {query_id}: No relevant chunks defined, skipping")
            continue

        print(f"\n📝 Query {query_id}: {query_text[:50]}...")

        if use_mock:
            # Mock: all relevant ids plus two fillers, so metrics are non-trivial.
            retrieved = list(relevant)[:k] + ["mock::0", "mock::1"]
        else:
            try:
                results = hybrid_search(query_text, top_k=k)
                retrieved = [r.get("id", "") for r in results]
            except Exception as e:
                # A failing query scores zero rather than aborting the run.
                print(f"  Error: {e}")
                retrieved = []

        metrics = evaluate_single_query(
            query_id=query_id,
            query=query_text,
            relevant_chunks=relevant,
            retrieved_chunks=retrieved,
            k=k
        )
        all_metrics.append(metrics)

        print(f"  Precision@{k}: {metrics.precision_at_k:.2f}")
        print(f"  Recall@{k}: {metrics.recall_at_k:.2f}")
        print(f"  Reciprocal Rank: {metrics.reciprocal_rank:.2f}")
        if metrics.relevant_found:
            print(f"  ✅ Found: {metrics.relevant_found}")
        if metrics.relevant_missed:
            print(f"  ❌ Missed: {metrics.relevant_missed}")

    if not all_metrics:
        print("\nNo queries evaluated")
        return None

    n = len(all_metrics)
    aggregate = AggregateMetrics(
        total_queries=n,
        mean_precision=sum(m.precision_at_k for m in all_metrics) / n,
        mean_recall=sum(m.recall_at_k for m in all_metrics) / n,
        mrr=sum(m.reciprocal_rank for m in all_metrics) / n,
        queries_with_hits=sum(1 for m in all_metrics if m.reciprocal_rank > 0)
    )

    print("\n" + "-" * 60)
    print(" SUMMARY")
    print("-" * 60)
    print(f"  Total queries: {aggregate.total_queries}")
    print(f"  Mean Precision@{k}: {aggregate.mean_precision:.2f}")
    print(f"  Mean Recall@{k}: {aggregate.mean_recall:.2f}")
    print(f"  MRR: {aggregate.mrr:.2f}")
    print(f"  Queries with hits: {aggregate.queries_with_hits}/{aggregate.total_queries}")

    # Threshold-based quality verdicts.
    print("\n📊 Quality Assessment")
    if aggregate.mean_precision >= 0.6:
        print("  ✅ Precision: GOOD (≥60%)")
    elif aggregate.mean_precision >= 0.4:
        print("  ⚠️ Precision: FAIR (40-60%)")
    else:
        print("  ❌ Precision: POOR (<40%)")

    if aggregate.mrr >= 0.5:
        print("  ✅ MRR: GOOD (≥0.5)")
    elif aggregate.mrr >= 0.3:
        print("  ⚠️ MRR: FAIR (0.3-0.5)")
    else:
        print("  ❌ MRR: POOR (<0.3)")

    return aggregate
196
+
197
+
198
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_retrieval.py queries.json [--mock]")
        print("\nExample:")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json")
        print("  python scripts/eval_retrieval.py tests/eval_data/queries.json --mock")
        sys.exit(1)

    queries_file = argv[1]
    use_mock = "--mock" in argv

    # Default cutoff; overridable via a --k=N flag anywhere on the CLI.
    k = 5
    for arg in argv:
        if arg.startswith("--k="):
            k = int(arg.split("=")[1])

    if not Path(queries_file).exists():
        print(f"Error: File not found: {queries_file}")
        sys.exit(1)

    metrics = run_retrieval_eval(queries_file, k=k, use_mock=use_mock)

    # Non-zero exit when precision is poor, so CI can gate on it.
    if metrics and metrics.mean_precision < 0.4:
        sys.exit(1)
scripts/eval_spot_check.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Quick spot check for Docling parsing quality.
4
+
5
+ Usage:
6
+ python scripts/eval_spot_check.py /path/to/documents
7
+ python scripts/eval_spot_check.py /path/to/single/file.pdf
8
+
9
+ Outputs a visual summary of how Docling parsed each document.
10
+ """
11
+
12
+ import sys
13
+ import os
14
+ from pathlib import Path
15
+ from collections import Counter
16
+
17
+ # Add project root to path
18
+ sys.path.insert(0, str(Path(__file__).parent.parent))
19
+
20
+ from src.ingestion.docling_loader import (
21
+ load_document_with_docling,
22
+ load_documents_with_docling,
23
+ SUPPORTED_EXTENSIONS,
24
+ ParsedDocument
25
+ )
26
+
27
+
28
def print_header(text: str, char: str = "="):
    """Print *text* framed above and below by a 60-character rule of *char*."""
    rule = char * 60
    print(f"\n{rule}")
    print(f" {text}")
    print(rule)
33
+
34
+
35
def analyze_document(doc: ParsedDocument, verbose: bool = True) -> dict:
    """Analyze one parsed document and return a metrics dict.

    When *verbose*, also pretty-print the element breakdown, detected
    issues, a sample of elements, and a preview of the first table.
    """
    type_counts = Counter(el.element_type for el in doc.elements)

    # Heuristic issue detection.
    issues = []
    if doc.status != "OK":
        issues.append(f"Status: {doc.status} - {doc.error}")
    if len(doc.elements) == 0:
        issues.append("No elements extracted!")
    if doc.chars == 0:
        issues.append("Zero characters extracted!")
    if type_counts.get("table", 0) == 0 and doc.format == ".pdf":
        # PDFs frequently contain tables, so a zero count is worth flagging.
        issues.append("No tables detected (may be expected)")

    metrics = {
        "filename": doc.filename,
        "format": doc.format,
        "status": doc.status,
        "total_elements": len(doc.elements),
        "total_chars": doc.chars,
        "total_words": doc.words,
        "page_count": doc.page_count,
        "element_types": dict(type_counts),
        "issues": issues
    }

    if verbose:
        print_header(f"{doc.filename} ({doc.format})", "-")
        print(f"  Status: {doc.status}")
        print(f"  Elements: {len(doc.elements)}")
        print(f"  Characters: {doc.chars:,}")
        print(f"  Words: {doc.words:,}")
        if doc.page_count:
            print(f"  Pages: {doc.page_count}")

        print("\n  Element breakdown:")
        for el_type, count in sorted(type_counts.items()):
            print(f"    {el_type}: {count}")

        if issues:
            print("\n  ⚠️ Potential issues:")
            for issue in issues:
                print(f"    - {issue}")

        # Short previews keep the report readable.
        print("\n  Sample elements (first 5):")
        for el in doc.elements[:5]:
            preview = el.text[:80].replace('\n', ' ')
            if len(el.text) > 80:
                preview += "..."
            print(f"    [{el.element_type}] {preview}")

        tables = [el for el in doc.elements if el.element_type == "table"]
        if tables:
            print("\n  Table preview (first table):")
            snippet = tables[0].text[:300].replace('\n', '\n    ')
            print(f"    {snippet}")
            if len(tables[0].text) > 300:
                print("    ...")

    return metrics
102
+
103
+
104
def run_spot_check(path: str, verbose: bool = True):
    """Spot-check Docling parsing for a single file or a directory tree.

    Returns the list of per-document metrics dicts; an empty list when
    *path* does not exist.
    """
    target = Path(path)

    print_header("DOCLING PARSING SPOT CHECK")
    print(f"  Path: {target}")
    print(f"  Supported formats: {', '.join(sorted(SUPPORTED_EXTENSIONS))}")

    all_metrics = []

    if target.is_file():
        # Single-document mode.
        parsed = load_document_with_docling(str(target))
        all_metrics.append(analyze_document(parsed, verbose=verbose))
    elif target.is_dir():
        # Directory mode: load everything supported, recursively.
        parsed_docs = load_documents_with_docling(str(target), recursive=True)
        print(f"  Found {len(parsed_docs)} documents")
        for parsed in parsed_docs:
            all_metrics.append(analyze_document(parsed, verbose=verbose))
    else:
        print(f"  ERROR: Path not found: {target}")
        return []

    print_header("SUMMARY")

    ok_count = sum(1 for m in all_metrics if m["status"] == "OK")
    total_elements = sum(m["total_elements"] for m in all_metrics)
    total_chars = sum(m["total_chars"] for m in all_metrics)

    print(f"  Documents processed: {len(all_metrics)}")
    print(f"  Successful (OK): {ok_count}")
    print(f"  Failed/Skipped: {len(all_metrics) - ok_count}")
    print(f"  Total elements: {total_elements}")
    print(f"  Total characters: {total_chars:,}")

    # Element-type histogram across every document.
    combined_types = Counter()
    for m in all_metrics:
        combined_types.update(m["element_types"])

    print("\n  Element types across all docs:")
    for el_type, count in sorted(combined_types.items(), key=lambda kv: -kv[1]):
        print(f"    {el_type}: {count}")

    # Flatten per-document issues, truncated to the first ten.
    flat_issues = [
        f"{m['filename']}: {issue}"
        for m in all_metrics
        for issue in m["issues"]
    ]

    if flat_issues:
        print("\n  ⚠️ Issues found:")
        for issue in flat_issues[:10]:
            print(f"    - {issue}")
        if len(flat_issues) > 10:
            print(f"    ... and {len(flat_issues) - 10} more")
    else:
        print("\n  ✅ No issues detected")

    return all_metrics
172
+
173
+
174
if __name__ == "__main__":
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python scripts/eval_spot_check.py /path/to/documents")
        print("\nExamples:")
        print("  python scripts/eval_spot_check.py ./tests/eval_data/documents")
        print("  python scripts/eval_spot_check.py ./report.pdf")
        sys.exit(1)

    target_path = argv[1]
    # --quiet suppresses per-document detail, leaving only the summary.
    verbose = "--quiet" not in argv

    run_spot_check(target_path, verbose=verbose)
tests/eval_data/documents/.gitkeep ADDED
File without changes
tests/eval_data/queries.json ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "description": "Test queries for retrieval evaluation",
3
+ "queries": [
4
+ {
5
+ "id": "q1",
6
+ "query": "Example query about your document content",
7
+ "relevant_chunks": ["document.pdf::0", "document.pdf::1"],
8
+ "keywords": ["expected", "keywords", "in", "answer"]
9
+ }
10
+ ],
11
+ "similarity_pairs": {
12
+ "similar": [
13
+ ["What is the total revenue?", "How much money did we make?"],
14
+ ["Describe the methodology", "What methods were used?"]
15
+ ],
16
+ "dissimilar": [
17
+ ["What is the revenue?", "Who founded the company?"],
18
+ ["Technical specifications", "Company history"]
19
+ ]
20
+ }
21
+ }