Spaces:
Build error
Build error
Ilia Tambovtsev committed on
Commit ·
dfee524
1
Parent(s): 24e252a
feat: configure logging of all params, move preprocessing to be a feature of the storage
Browse files
- src/eval/eval_mlflow.py +46 -8
- src/rag/preprocess.py +5 -1
- src/rag/storage.py +170 -7
- src/run_evaluation.py +19 -3
src/eval/eval_mlflow.py
CHANGED
|
@@ -292,7 +292,7 @@ class MetricPresets:
|
|
| 292 |
|
| 293 |
LLM = ["llmrelevance"]
|
| 294 |
|
| 295 |
-
|
| 296 |
|
| 297 |
@classmethod
|
| 298 |
def get_preset(cls, name: str) -> List[str]:
|
|
@@ -354,6 +354,15 @@ class MlflowConfig(BaseModel):
|
|
| 354 |
logger.info(f"Using metrics: {self.metrics}")
|
| 355 |
return super().model_post_init(__context)
|
| 356 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
class RAGEvaluatorMlflow:
|
| 359 |
"""MLFlow-based evaluator for RAG pipeline"""
|
|
@@ -422,7 +431,7 @@ class RAGEvaluatorMlflow:
|
|
| 422 |
Dictionary mapping metric names to MetricResult objects
|
| 423 |
"""
|
| 424 |
# Log evaluation start
|
| 425 |
-
self._logger.info(f"Evaluating question: {question}")
|
| 426 |
|
| 427 |
results = {}
|
| 428 |
|
|
@@ -435,7 +444,7 @@ class RAGEvaluatorMlflow:
|
|
| 435 |
# Log metric result
|
| 436 |
log_msg = f"Metric {metric.name}: {result.score}"
|
| 437 |
if result.explanation:
|
| 438 |
-
log_msg += f" ({result.explanation})"
|
| 439 |
self._logger.info(log_msg)
|
| 440 |
|
| 441 |
except Exception as e:
|
|
@@ -570,12 +579,39 @@ class RAGEvaluatorMlflow:
|
|
| 570 |
|
| 571 |
for scorer in self.config.scorers:
|
| 572 |
self._logger.info(f"Evaluating with scorer: {scorer.id}")
|
| 573 |
-
with mlflow.start_run(run_name=f"scorer_{scorer.id}"):
|
| 574 |
-
mlflow.log_params(scorer.model_dump())
|
| 575 |
-
self._logger.debug(f"Logged scorer parameters: {scorer.model_dump()}")
|
| 576 |
|
| 577 |
-
|
| 578 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
|
| 580 |
# Initialize aggregation containers
|
| 581 |
results_log = []
|
|
@@ -594,6 +630,8 @@ class RAGEvaluatorMlflow:
|
|
| 594 |
if results_log
|
| 595 |
else len(questions_df)
|
| 596 |
)
|
|
|
|
|
|
|
| 597 |
|
| 598 |
# Process results
|
| 599 |
results_df = pd.DataFrame(results_log)
|
|
|
|
| 292 |
|
| 293 |
LLM = ["llmrelevance"]
|
| 294 |
|
| 295 |
+
ALL = BASIC + LLM
|
| 296 |
|
| 297 |
@classmethod
|
| 298 |
def get_preset(cls, name: str) -> List[str]:
|
|
|
|
| 354 |
logger.info(f"Using metrics: {self.metrics}")
|
| 355 |
return super().model_post_init(__context)
|
| 356 |
|
| 357 |
+
def get_log_params(self) -> Dict[str, Any]:
|
| 358 |
+
"""Get parameters for MLflow logging"""
|
| 359 |
+
return {
|
| 360 |
+
"experiment_name": self.experiment_name,
|
| 361 |
+
"n_judge_contexts": self.n_judge_contexts,
|
| 362 |
+
"metrics": ",".join(self.metrics),
|
| 363 |
+
"metric_args": self.metric_args,
|
| 364 |
+
}
|
| 365 |
+
|
| 366 |
|
| 367 |
class RAGEvaluatorMlflow:
|
| 368 |
"""MLFlow-based evaluator for RAG pipeline"""
|
|
|
|
| 431 |
Dictionary mapping metric names to MetricResult objects
|
| 432 |
"""
|
| 433 |
# Log evaluation start
|
| 434 |
+
self._logger.info(f"Evaluating question: '{question}'")
|
| 435 |
|
| 436 |
results = {}
|
| 437 |
|
|
|
|
| 444 |
# Log metric result
|
| 445 |
log_msg = f"Metric {metric.name}: {result.score}"
|
| 446 |
if result.explanation:
|
| 447 |
+
log_msg += f" ({result.explanation[:200]})"
|
| 448 |
self._logger.info(log_msg)
|
| 449 |
|
| 450 |
except Exception as e:
|
|
|
|
| 579 |
|
| 580 |
for scorer in self.config.scorers:
|
| 581 |
self._logger.info(f"Evaluating with scorer: {scorer.id}")
|
|
|
|
|
|
|
|
|
|
| 582 |
|
| 583 |
+
# Initialize retriever
|
| 584 |
+
retriever = self.config.get_retriever_with_scorer(scorer)
|
| 585 |
+
|
| 586 |
+
with mlflow.start_run(
|
| 587 |
+
run_name=f"scorer_{scorer.id}__retriever_{retriever.id}"
|
| 588 |
+
):
|
| 589 |
+
# Log preprocessor
|
| 590 |
+
preprocessor_id = (
|
| 591 |
+
retriever.storage.query_preprocessor.id
|
| 592 |
+
if retriever.storage.query_preprocessor
|
| 593 |
+
else "None"
|
| 594 |
+
)
|
| 595 |
+
mlflow.log_params({"preprocessing": preprocessor_id})
|
| 596 |
+
self._logger.info(f"Using preprocessor: {preprocessor_id}")
|
| 597 |
+
|
| 598 |
+
# Log config parameters
|
| 599 |
+
mlflow.log_params(
|
| 600 |
+
{f"config_{k}": v for k, v in self.config.get_log_params().items()}
|
| 601 |
+
)
|
| 602 |
+
self._logger.debug("Logged config parameters")
|
| 603 |
+
|
| 604 |
+
# Log scorer parameters
|
| 605 |
+
mlflow.log_params(
|
| 606 |
+
{f"scorer_{k}": v for k, v in scorer.model_dump().items()}
|
| 607 |
+
)
|
| 608 |
+
self._logger.debug("Logged scorer parameters")
|
| 609 |
+
|
| 610 |
+
# Initialize retriever and log its parameters
|
| 611 |
+
mlflow.log_params(
|
| 612 |
+
{f"retriever_{k}": v for k, v in retriever.get_log_params().items()}
|
| 613 |
+
)
|
| 614 |
+
self._logger.debug("Logged retriever parameters")
|
| 615 |
|
| 616 |
# Initialize aggregation containers
|
| 617 |
results_log = []
|
|
|
|
| 630 |
if results_log
|
| 631 |
else len(questions_df)
|
| 632 |
)
|
| 633 |
+
if n_errors > 1:
|
| 634 |
+
logger.error(f"{n_errors} while processing {retriever.id}")
|
| 635 |
|
| 636 |
# Process results
|
| 637 |
results_df = pd.DataFrame(results_log)
|
src/rag/preprocess.py
CHANGED
|
@@ -6,7 +6,7 @@ import nltk
|
|
| 6 |
from nltk.corpus import stopwords
|
| 7 |
|
| 8 |
|
| 9 |
-
class
|
| 10 |
"""Preprocesses search queries by removing common patterns and standardizing format."""
|
| 11 |
|
| 12 |
@dataclass
|
|
@@ -76,6 +76,10 @@ class QueryPreprocessor:
|
|
| 76 |
re.compile(p.pattern, re.IGNORECASE) for p in patterns
|
| 77 |
]
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
def remove_stopwords_from_text(self, text: str) -> str:
|
| 80 |
"""Remove stopwords while preserving protected terms."""
|
| 81 |
tokens = text.split()
|
|
|
|
| 6 |
from nltk.corpus import stopwords
|
| 7 |
|
| 8 |
|
| 9 |
+
class RegexQueryPreprocessor:
|
| 10 |
"""Preprocesses search queries by removing common patterns and standardizing format."""
|
| 11 |
|
| 12 |
@dataclass
|
|
|
|
| 76 |
re.compile(p.pattern, re.IGNORECASE) for p in patterns
|
| 77 |
]
|
| 78 |
|
| 79 |
+
@property
|
| 80 |
+
def id(self):
|
| 81 |
+
return self.__class__.__name__
|
| 82 |
+
|
| 83 |
def remove_stopwords_from_text(self, text: str) -> str:
|
| 84 |
"""Remove stopwords while preserving protected terms."""
|
| 85 |
tokens = text.split()
|
src/rag/storage.py
CHANGED
|
@@ -1,12 +1,13 @@
|
|
| 1 |
import asyncio
|
| 2 |
import logging
|
| 3 |
-
from collections import OrderedDict
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
import chromadb
|
| 9 |
import numpy as np
|
|
|
|
| 10 |
from chromadb.api.types import QueryResult
|
| 11 |
from chromadb.config import Settings
|
| 12 |
from datasets.utils import metadata
|
|
@@ -27,7 +28,7 @@ from src.chains.prompts import JsonH1AndGDPrompt
|
|
| 27 |
from src.config.model_setup import EmbeddingConfig
|
| 28 |
from src.config.navigator import Navigator
|
| 29 |
from src.rag import BaseScorer, HyperbolicScorer, ScorerTypes
|
| 30 |
-
from src.rag.preprocess import
|
| 31 |
from src.rag.score import ExponentialScorer, MinScorer
|
| 32 |
|
| 33 |
logger = logging.getLogger(__name__)
|
|
@@ -376,6 +377,7 @@ class ChromaSlideStore:
|
|
| 376 |
self,
|
| 377 |
collection_name: str = "pres1",
|
| 378 |
embedding_model: Embeddings = EmbeddingConfig().load_openai(),
|
|
|
|
| 379 |
):
|
| 380 |
"""Initialize ChromaDB storage"""
|
| 381 |
self.navigator = Navigator()
|
|
@@ -397,6 +399,9 @@ class ChromaSlideStore:
|
|
| 397 |
# self._api_key = os.getenv("OPENAI_API_KEY")
|
| 398 |
self._embeddings = embedding_model
|
| 399 |
|
|
|
|
|
|
|
|
|
|
| 400 |
# Initialize indexer
|
| 401 |
self._indexer = SlideIndexer(collection_name=collection_name)
|
| 402 |
|
|
@@ -461,8 +466,10 @@ class ChromaSlideStore:
|
|
| 461 |
Returns:
|
| 462 |
List of ScoredChunks sorted by similarity
|
| 463 |
"""
|
|
|
|
|
|
|
| 464 |
# Get query embedding
|
| 465 |
-
query_embedding = await self._embeddings.aembed_query(
|
| 466 |
|
| 467 |
# Query ChromaDB
|
| 468 |
result = self._collection.query(
|
|
@@ -798,6 +805,139 @@ class ChromaSlideStore:
|
|
| 798 |
await gather(*tasks)
|
| 799 |
logger.info(f"Completed processing presentation: '{presentation.name}'")
|
| 800 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 801 |
|
| 802 |
class PresentationRetriever(BaseModel):
|
| 803 |
"""Retriever for slide search that provides formatted context"""
|
|
@@ -806,16 +946,18 @@ class PresentationRetriever(BaseModel):
|
|
| 806 |
scorer: BaseScorer = ExponentialScorer()
|
| 807 |
n_contexts: int = -1
|
| 808 |
n_pages: int = -1
|
|
|
|
| 809 |
retrieve_page_contexts: bool = True
|
| 810 |
|
| 811 |
-
query_preprocessor: Optional[QueryPreprocessor] = QueryPreprocessor()
|
| 812 |
-
|
| 813 |
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 814 |
|
| 815 |
@property
|
| 816 |
def id(self) -> str:
|
| 817 |
return self.__class__.__name__.lower()
|
| 818 |
|
|
|
|
|
|
|
|
|
|
| 819 |
def format_slide(
|
| 820 |
self, slide: SearchResultPage, metadata: Optional[Dict[str, Any]] = None
|
| 821 |
) -> str:
|
|
@@ -883,10 +1025,9 @@ class PresentationRetriever(BaseModel):
|
|
| 883 |
Returns:
|
| 884 |
Dictionary with presentation results and formatted context
|
| 885 |
"""
|
| 886 |
-
q_storage = self.query_preprocessor(query) if self.query_preprocessor else query
|
| 887 |
|
| 888 |
results = self.storage.search_query_presentations(
|
| 889 |
-
query=
|
| 890 |
chunk_types=chunk_types,
|
| 891 |
n_results=n_results,
|
| 892 |
scorer=self.scorer,
|
|
@@ -931,6 +1072,15 @@ class PresentationRetriever(BaseModel):
|
|
| 931 |
def set_scorer(self, scorer: ScorerTypes):
|
| 932 |
self.scorer = scorer
|
| 933 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 934 |
|
| 935 |
class LLMPresentationRetriever(PresentationRetriever):
|
| 936 |
"""LLM-enhanced retriever that reranks results using structured relevance scoring"""
|
|
@@ -1144,6 +1294,19 @@ Output Formatting:
|
|
| 1144 |
|
| 1145 |
return dict(contexts=reranked)
|
| 1146 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1147 |
RetrieverTypes = Union[PresentationRetriever, LLMPresentationRetriever]
|
| 1148 |
|
| 1149 |
# def create_slides_database(
|
|
|
|
| 1 |
import asyncio
|
| 2 |
import logging
|
| 3 |
+
from collections import OrderedDict, defaultdict
|
| 4 |
from pathlib import Path
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple, Type, Union
|
| 6 |
from uuid import uuid4
|
| 7 |
|
| 8 |
import chromadb
|
| 9 |
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
from chromadb.api.types import QueryResult
|
| 12 |
from chromadb.config import Settings
|
| 13 |
from datasets.utils import metadata
|
|
|
|
| 28 |
from src.config.model_setup import EmbeddingConfig
|
| 29 |
from src.config.navigator import Navigator
|
| 30 |
from src.rag import BaseScorer, HyperbolicScorer, ScorerTypes
|
| 31 |
+
from src.rag.preprocess import RegexQueryPreprocessor
|
| 32 |
from src.rag.score import ExponentialScorer, MinScorer
|
| 33 |
|
| 34 |
logger = logging.getLogger(__name__)
|
|
|
|
| 377 |
self,
|
| 378 |
collection_name: str = "pres1",
|
| 379 |
embedding_model: Embeddings = EmbeddingConfig().load_openai(),
|
| 380 |
+
query_preprocessor: Optional[RegexQueryPreprocessor] = RegexQueryPreprocessor(),
|
| 381 |
):
|
| 382 |
"""Initialize ChromaDB storage"""
|
| 383 |
self.navigator = Navigator()
|
|
|
|
| 399 |
# self._api_key = os.getenv("OPENAI_API_KEY")
|
| 400 |
self._embeddings = embedding_model
|
| 401 |
|
| 402 |
+
# Initialize query preprocessor
|
| 403 |
+
self.query_preprocessor = query_preprocessor
|
| 404 |
+
|
| 405 |
# Initialize indexer
|
| 406 |
self._indexer = SlideIndexer(collection_name=collection_name)
|
| 407 |
|
|
|
|
| 466 |
Returns:
|
| 467 |
List of ScoredChunks sorted by similarity
|
| 468 |
"""
|
| 469 |
+
q_storage = self.query_preprocessor(query) if self.query_preprocessor else query
|
| 470 |
+
|
| 471 |
# Get query embedding
|
| 472 |
+
query_embedding = await self._embeddings.aembed_query(q_storage)
|
| 473 |
|
| 474 |
# Query ChromaDB
|
| 475 |
result = self._collection.query(
|
|
|
|
| 805 |
await gather(*tasks)
|
| 806 |
logger.info(f"Completed processing presentation: '{presentation.name}'")
|
| 807 |
|
| 808 |
+
def validate_presentations(self) -> Tuple[pd.DataFrame, List[str]]:
|
| 809 |
+
"""Validate that all presentation slides were properly stored.
|
| 810 |
+
|
| 811 |
+
Uses metadata from stored chunks to compare number of pages in presentations.
|
| 812 |
+
Result shows how many pages are in ChromaDB vs expected total pages.
|
| 813 |
+
|
| 814 |
+
Returns:
|
| 815 |
+
Tuple containing:
|
| 816 |
+
- DataFrame with presentations statistics:
|
| 817 |
+
Columns:
|
| 818 |
+
- presentation: Presentation name
|
| 819 |
+
- stored_pages: Number of pages found in ChromaDB
|
| 820 |
+
- chunks_per_page: Average chunks per page
|
| 821 |
+
- total_chunks: Total chunks for this presentation
|
| 822 |
+
- chunk_types: Set of unique chunk types
|
| 823 |
+
- min_page: First page number
|
| 824 |
+
- max_page: Last page number
|
| 825 |
+
- List of validation warnings if any inconsistencies found
|
| 826 |
+
"""
|
| 827 |
+
# Get all stored chunks
|
| 828 |
+
all_chunks = self._collection.get()
|
| 829 |
+
|
| 830 |
+
# Group chunks by presentation
|
| 831 |
+
pres_pages: Dict[str, Set[int]] = defaultdict(set) # Unique pages
|
| 832 |
+
pres_chunks: Dict[str, int] = defaultdict(int) # Total chunks
|
| 833 |
+
pres_types: Dict[str, Set[str]] = defaultdict(set) # Chunk types
|
| 834 |
+
|
| 835 |
+
# Process each chunk's metadata
|
| 836 |
+
for metadata in all_chunks["metadatas"]:
|
| 837 |
+
if not metadata:
|
| 838 |
+
continue
|
| 839 |
+
|
| 840 |
+
pdf_path = metadata.get("pdf_path", "")
|
| 841 |
+
if not pdf_path:
|
| 842 |
+
continue
|
| 843 |
+
|
| 844 |
+
# Extract presentation name from path
|
| 845 |
+
pres_name = Path(pdf_path).stem
|
| 846 |
+
|
| 847 |
+
# Track pages, chunks and types
|
| 848 |
+
page_num = int(metadata.get("page_num", -1))
|
| 849 |
+
if page_num >= 0:
|
| 850 |
+
pres_pages[pres_name].add(page_num)
|
| 851 |
+
|
| 852 |
+
chunk_type = metadata.get("chunk_type", "unknown")
|
| 853 |
+
pres_types[pres_name].add(chunk_type)
|
| 854 |
+
|
| 855 |
+
pres_chunks[pres_name] += 1
|
| 856 |
+
|
| 857 |
+
# Compile statistics and warnings
|
| 858 |
+
stats_data = []
|
| 859 |
+
warnings = []
|
| 860 |
+
|
| 861 |
+
for pres_name in pres_pages:
|
| 862 |
+
stored_pages = len(pres_pages[pres_name])
|
| 863 |
+
total_chunks = pres_chunks[pres_name]
|
| 864 |
+
chunks_per_page = total_chunks / stored_pages if stored_pages > 0 else 0
|
| 865 |
+
chunk_types = pres_types[pres_name]
|
| 866 |
+
pages = sorted(pres_pages[pres_name])
|
| 867 |
+
|
| 868 |
+
stats_data.append(
|
| 869 |
+
{
|
| 870 |
+
"presentation": pres_name,
|
| 871 |
+
"stored_pages": stored_pages,
|
| 872 |
+
"chunks_per_page": round(chunks_per_page, 2),
|
| 873 |
+
"total_chunks": total_chunks,
|
| 874 |
+
"chunk_types": chunk_types,
|
| 875 |
+
"min_page": min(pages) if pages else None,
|
| 876 |
+
"max_page": max(pages) if pages else None,
|
| 877 |
+
}
|
| 878 |
+
)
|
| 879 |
+
|
| 880 |
+
# Check for potential issues
|
| 881 |
+
if (
|
| 882 |
+
chunks_per_page < 3
|
| 883 |
+
): # Assuming we should have at least 3 chunks per page
|
| 884 |
+
warnings.append(
|
| 885 |
+
f"Low chunks per page ({chunks_per_page:.1f}) " f"for '{pres_name}'"
|
| 886 |
+
)
|
| 887 |
+
|
| 888 |
+
# Check for page number gaps
|
| 889 |
+
if pages:
|
| 890 |
+
expected_pages = set(range(min(pages), max(pages) + 1))
|
| 891 |
+
missing_pages = expected_pages - pres_pages[pres_name]
|
| 892 |
+
if missing_pages:
|
| 893 |
+
warnings.append(
|
| 894 |
+
f"Missing pages {sorted(missing_pages)} in '{pres_name}'"
|
| 895 |
+
)
|
| 896 |
+
|
| 897 |
+
# Check for missing chunk types
|
| 898 |
+
expected_types = {
|
| 899 |
+
"text_content",
|
| 900 |
+
"visual_content",
|
| 901 |
+
"topic_overview",
|
| 902 |
+
"conclusions_and_insights",
|
| 903 |
+
"layout_and_composition",
|
| 904 |
+
}
|
| 905 |
+
missing_types = expected_types - chunk_types
|
| 906 |
+
if missing_types:
|
| 907 |
+
warnings.append(f"Missing chunk types {missing_types} in '{pres_name}'")
|
| 908 |
+
|
| 909 |
+
# Create DataFrame from stats
|
| 910 |
+
stats_df = pd.DataFrame(stats_data).sort_values("presentation")
|
| 911 |
+
|
| 912 |
+
return stats_df, warnings
|
| 913 |
+
|
| 914 |
+
def validate_storage(self) -> Tuple[pd.DataFrame, List[str]]:
|
| 915 |
+
"""Helper function to run validation and display results.
|
| 916 |
+
|
| 917 |
+
Args:
|
| 918 |
+
store: ChromaSlideStore instance to validate
|
| 919 |
+
|
| 920 |
+
Returns:
|
| 921 |
+
Tuple of (statistics DataFrame, list of warnings)
|
| 922 |
+
"""
|
| 923 |
+
from IPython.display import display
|
| 924 |
+
|
| 925 |
+
stats_df, warnings = self.validate_presentations()
|
| 926 |
+
|
| 927 |
+
# Display statistics
|
| 928 |
+
print("\nPresentation Statistics:")
|
| 929 |
+
display(stats_df)
|
| 930 |
+
|
| 931 |
+
# Display warnings if any
|
| 932 |
+
if warnings:
|
| 933 |
+
print("\nWarnings:")
|
| 934 |
+
for warning in warnings:
|
| 935 |
+
print(f"- {warning}")
|
| 936 |
+
else:
|
| 937 |
+
print("\nNo validation warnings found.")
|
| 938 |
+
|
| 939 |
+
return stats_df, warnings
|
| 940 |
+
|
| 941 |
|
| 942 |
class PresentationRetriever(BaseModel):
|
| 943 |
"""Retriever for slide search that provides formatted context"""
|
|
|
|
| 946 |
scorer: BaseScorer = ExponentialScorer()
|
| 947 |
n_contexts: int = -1
|
| 948 |
n_pages: int = -1
|
| 949 |
+
n_query_results: int = 70
|
| 950 |
retrieve_page_contexts: bool = True
|
| 951 |
|
|
|
|
|
|
|
| 952 |
model_config = ConfigDict(arbitrary_types_allowed=True)
|
| 953 |
|
| 954 |
@property
|
| 955 |
def id(self) -> str:
|
| 956 |
return self.__class__.__name__.lower()
|
| 957 |
|
| 958 |
+
def set_n_query_results(self, n_query_results: int):
|
| 959 |
+
self.n_query_results = n_query_results
|
| 960 |
+
|
| 961 |
def format_slide(
|
| 962 |
self, slide: SearchResultPage, metadata: Optional[Dict[str, Any]] = None
|
| 963 |
) -> str:
|
|
|
|
| 1025 |
Returns:
|
| 1026 |
Dictionary with presentation results and formatted context
|
| 1027 |
"""
|
|
|
|
| 1028 |
|
| 1029 |
results = self.storage.search_query_presentations(
|
| 1030 |
+
query=query,
|
| 1031 |
chunk_types=chunk_types,
|
| 1032 |
n_results=n_results,
|
| 1033 |
scorer=self.scorer,
|
|
|
|
| 1072 |
def set_scorer(self, scorer: ScorerTypes):
|
| 1073 |
self.scorer = scorer
|
| 1074 |
|
| 1075 |
+
def get_log_params(self) -> Dict[str, Any]:
|
| 1076 |
+
"""Get parameters for MLflow logging"""
|
| 1077 |
+
return {
|
| 1078 |
+
"type": self.__class__.__name__,
|
| 1079 |
+
"n_contexts": self.n_contexts,
|
| 1080 |
+
"n_pages": self.n_pages,
|
| 1081 |
+
"retrieve_page_contexts": self.retrieve_page_contexts,
|
| 1082 |
+
}
|
| 1083 |
+
|
| 1084 |
|
| 1085 |
class LLMPresentationRetriever(PresentationRetriever):
|
| 1086 |
"""LLM-enhanced retriever that reranks results using structured relevance scoring"""
|
|
|
|
| 1294 |
|
| 1295 |
return dict(contexts=reranked)
|
| 1296 |
|
| 1297 |
+
def get_log_params(self) -> Dict[str, Any]:
|
| 1298 |
+
"""Get parameters for MLflow logging including LLM specifics"""
|
| 1299 |
+
params = super().get_log_params()
|
| 1300 |
+
params.update(
|
| 1301 |
+
{
|
| 1302 |
+
"llm_model": self.llm.model_name,
|
| 1303 |
+
"llm_temperature": self.llm.temperature,
|
| 1304 |
+
"top_k": self.top_k,
|
| 1305 |
+
}
|
| 1306 |
+
)
|
| 1307 |
+
return params
|
| 1308 |
+
|
| 1309 |
+
|
| 1310 |
RetrieverTypes = Union[PresentationRetriever, LLMPresentationRetriever]
|
| 1311 |
|
| 1312 |
# def create_slides_database(
|
src/run_evaluation.py
CHANGED
|
@@ -21,6 +21,7 @@ from src.eval.eval_mlflow import (
|
|
| 21 |
)
|
| 22 |
from src.eval.evaluate import LangsmithConfig, RAGEvaluatorLangsmith
|
| 23 |
from src.rag import ChromaSlideStore, PresentationRetriever
|
|
|
|
| 24 |
from src.rag.score import (
|
| 25 |
BaseScorer,
|
| 26 |
ExponentialScorer,
|
|
@@ -109,6 +110,7 @@ class EvaluationCLI:
|
|
| 109 |
model_name: Optional[str],
|
| 110 |
collection: str,
|
| 111 |
scorers: List[str],
|
|
|
|
| 112 |
temperature: float = 0.2,
|
| 113 |
) -> EvalComponents:
|
| 114 |
"""Initialize common evaluation components
|
|
@@ -137,8 +139,10 @@ class EvaluationCLI:
|
|
| 137 |
# Initialize components
|
| 138 |
llm = self.config.model_config.get_llm(provider, model_name, temperature)
|
| 139 |
embeddings = self.config.embedding_config.get_embeddings(provider)
|
|
|
|
|
|
|
| 140 |
storage = ChromaSlideStore(
|
| 141 |
-
collection_name=collection, embedding_model=embeddings
|
| 142 |
)
|
| 143 |
|
| 144 |
logger.info(f"Initialized storage collection: {collection}")
|
|
@@ -159,12 +163,17 @@ class EvaluationCLI:
|
|
| 159 |
def mlflow(
|
| 160 |
self,
|
| 161 |
retriever: str = "basic",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
provider: str = "vsegpt",
|
| 163 |
model_name: Optional[str] = None,
|
| 164 |
collection: str = "pres1",
|
| 165 |
experiment: str = "PresRetrieve_eval",
|
| 166 |
scorers: List[str] = ["default"],
|
| 167 |
metrics: List[str] = ["basic"],
|
|
|
|
| 168 |
n_questions: int = -1,
|
| 169 |
max_concurrent: int = 8,
|
| 170 |
rate_limit_timeout: float = -1,
|
|
@@ -201,7 +210,7 @@ class EvaluationCLI:
|
|
| 201 |
|
| 202 |
metrics: List of metric specifications
|
| 203 |
Options:
|
| 204 |
-
- Presets: 'basic', 'llm', '
|
| 205 |
- Individual: 'presentationmatch', 'presentationfound', 'pagematch', 'pagefound', 'presentationcount',
|
| 206 |
Default: ['basic']
|
| 207 |
|
|
@@ -251,9 +260,15 @@ class EvaluationCLI:
|
|
| 251 |
model_name=model_name,
|
| 252 |
collection=collection,
|
| 253 |
scorers=scorers,
|
|
|
|
| 254 |
temperature=temperature,
|
| 255 |
)
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
# Setup evaluation config
|
| 258 |
db_path = self.config.navigator.eval_runs / "mlruns.db"
|
| 259 |
artifacts_path = self.config.navigator.eval_artifacts
|
|
@@ -270,6 +285,7 @@ class EvaluationCLI:
|
|
| 270 |
else -1.0
|
| 271 |
)
|
| 272 |
),
|
|
|
|
| 273 |
write_to_google=write_to_google,
|
| 274 |
)
|
| 275 |
|
|
@@ -348,7 +364,7 @@ class EvaluationCLI:
|
|
| 348 |
questions_df = questions_df.sample(n_questions).reset_index()
|
| 349 |
logger.info(f"Selected {len(questions_df)} random questions")
|
| 350 |
|
| 351 |
-
evaluator.run_evaluation(
|
| 352 |
logger.info("LangSmith evaluation completed successfully")
|
| 353 |
|
| 354 |
except Exception as e:
|
|
|
|
| 21 |
)
|
| 22 |
from src.eval.evaluate import LangsmithConfig, RAGEvaluatorLangsmith
|
| 23 |
from src.rag import ChromaSlideStore, PresentationRetriever
|
| 24 |
+
from src.rag.preprocess import RegexQueryPreprocessor
|
| 25 |
from src.rag.score import (
|
| 26 |
BaseScorer,
|
| 27 |
ExponentialScorer,
|
|
|
|
| 110 |
model_name: Optional[str],
|
| 111 |
collection: str,
|
| 112 |
scorers: List[str],
|
| 113 |
+
preprocessing: Optional[str] = None,
|
| 114 |
temperature: float = 0.2,
|
| 115 |
) -> EvalComponents:
|
| 116 |
"""Initialize common evaluation components
|
|
|
|
| 139 |
# Initialize components
|
| 140 |
llm = self.config.model_config.get_llm(provider, model_name, temperature)
|
| 141 |
embeddings = self.config.embedding_config.get_embeddings(provider)
|
| 142 |
+
query_preprocessor = {"regex": RegexQueryPreprocessor()}.get(preprocessing) if preprocessing else None
|
| 143 |
+
|
| 144 |
storage = ChromaSlideStore(
|
| 145 |
+
collection_name=collection, embedding_model=embeddings, query_preprocessor=query_preprocessor
|
| 146 |
)
|
| 147 |
|
| 148 |
logger.info(f"Initialized storage collection: {collection}")
|
|
|
|
| 163 |
def mlflow(
|
| 164 |
self,
|
| 165 |
retriever: str = "basic",
|
| 166 |
+
n_query_results: int = 50,
|
| 167 |
+
n_contexts: int = -1,
|
| 168 |
+
n_pages: int = -1,
|
| 169 |
+
preprocessing: str = "regex",
|
| 170 |
provider: str = "vsegpt",
|
| 171 |
model_name: Optional[str] = None,
|
| 172 |
collection: str = "pres1",
|
| 173 |
experiment: str = "PresRetrieve_eval",
|
| 174 |
scorers: List[str] = ["default"],
|
| 175 |
metrics: List[str] = ["basic"],
|
| 176 |
+
n_judge_contexts: int = 8,
|
| 177 |
n_questions: int = -1,
|
| 178 |
max_concurrent: int = 8,
|
| 179 |
rate_limit_timeout: float = -1,
|
|
|
|
| 210 |
|
| 211 |
metrics: List of metric specifications
|
| 212 |
Options:
|
| 213 |
+
- Presets: 'basic', 'llm', 'all'
|
| 214 |
- Individual: 'presentationmatch', 'presentationfound', 'pagematch', 'pagefound', 'presentationcount',
|
| 215 |
Default: ['basic']
|
| 216 |
|
|
|
|
| 260 |
model_name=model_name,
|
| 261 |
collection=collection,
|
| 262 |
scorers=scorers,
|
| 263 |
+
preprocessing=preprocessing,
|
| 264 |
temperature=temperature,
|
| 265 |
)
|
| 266 |
|
| 267 |
+
# Set attributes
|
| 268 |
+
components.retriever.n_query_results = n_query_results
|
| 269 |
+
components.retriever.n_contexts = n_contexts
|
| 270 |
+
components.retriever.n_pages = n_pages
|
| 271 |
+
|
| 272 |
# Setup evaluation config
|
| 273 |
db_path = self.config.navigator.eval_runs / "mlruns.db"
|
| 274 |
artifacts_path = self.config.navigator.eval_artifacts
|
|
|
|
| 285 |
else -1.0
|
| 286 |
)
|
| 287 |
),
|
| 288 |
+
n_judge_contexts=n_judge_contexts,
|
| 289 |
write_to_google=write_to_google,
|
| 290 |
)
|
| 291 |
|
|
|
|
| 364 |
questions_df = questions_df.sample(n_questions).reset_index()
|
| 365 |
logger.info(f"Selected {len(questions_df)} random questions")
|
| 366 |
|
| 367 |
+
evaluator.run_evaluation()
|
| 368 |
logger.info("LangSmith evaluation completed successfully")
|
| 369 |
|
| 370 |
except Exception as e:
|