Spaces:
Build error
Build error
Ilia Tambovtsev committed on
Commit ·
24e252a
1
Parent(s): 0c02234
feat: fix async eval to be really async
Browse files
- src/eval/eval_mlflow.py +17 -9
- src/rag/storage.py +29 -7
src/eval/eval_mlflow.py
CHANGED
|
@@ -18,6 +18,7 @@ from langchain_core.prompts import PromptTemplate
|
|
| 18 |
from langchain_openai import ChatOpenAI
|
| 19 |
from pydantic import BaseModel, ConfigDict, Field
|
| 20 |
from tqdm import tqdm
|
|
|
|
| 21 |
|
| 22 |
from src.config import Config, load_spreadsheet
|
| 23 |
from src.config.logging import setup_logging
|
|
@@ -523,6 +524,7 @@ class RAGEvaluatorMlflow:
|
|
| 523 |
# Create semaphore within the async context
|
| 524 |
semaphore = asyncio.Semaphore(self._max_concurrent)
|
| 525 |
|
|
|
|
| 526 |
tasks = [
|
| 527 |
self.process_question(
|
| 528 |
retriever=retriever,
|
|
@@ -536,12 +538,13 @@ class RAGEvaluatorMlflow:
|
|
| 536 |
for idx, (_, row) in enumerate(questions_df.iterrows())
|
| 537 |
]
|
| 538 |
|
| 539 |
-
for
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
|
|
|
| 545 |
|
| 546 |
def run_evaluation(self, questions_df: pd.DataFrame) -> None:
|
| 547 |
"""Run evaluation with async LLM queries and controlled concurrency"""
|
|
@@ -585,6 +588,13 @@ class RAGEvaluatorMlflow:
|
|
| 585 |
)
|
| 586 |
)
|
| 587 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 588 |
# Process results
|
| 589 |
results_df = pd.DataFrame(results_log)
|
| 590 |
results_df["experiment_name"] = (
|
|
@@ -615,7 +625,5 @@ class RAGEvaluatorMlflow:
|
|
| 615 |
if values:
|
| 616 |
mean_value = sum(values) / len(values)
|
| 617 |
mlflow.log_metric(f"mean_{name}", mean_value)
|
|
|
|
| 618 |
self._logger.info(f"Mean {name}: {mean_value:.3f}")
|
| 619 |
-
|
| 620 |
-
|
| 621 |
-
|
|
|
|
| 18 |
from langchain_openai import ChatOpenAI
|
| 19 |
from pydantic import BaseModel, ConfigDict, Field
|
| 20 |
from tqdm import tqdm
|
| 21 |
+
from tqdm.asyncio import tqdm_asyncio
|
| 22 |
|
| 23 |
from src.config import Config, load_spreadsheet
|
| 24 |
from src.config.logging import setup_logging
|
|
|
|
| 524 |
# Create semaphore within the async context
|
| 525 |
semaphore = asyncio.Semaphore(self._max_concurrent)
|
| 526 |
|
| 527 |
+
# Create tasks for all questions
|
| 528 |
tasks = [
|
| 529 |
self.process_question(
|
| 530 |
retriever=retriever,
|
|
|
|
| 538 |
for idx, (_, row) in enumerate(questions_df.iterrows())
|
| 539 |
]
|
| 540 |
|
| 541 |
+
# Wait for all tasks to complete
|
| 542 |
+
await tqdm_asyncio.gather(
|
| 543 |
+
*tasks,
|
| 544 |
+
desc=f"Processing questions for '{retriever.scorer.id[:15]}' (max {self._max_concurrent} concurrent)",
|
| 545 |
+
total=len(questions_df),
|
| 546 |
+
dynamic_ncols=True, # Adjust width automatically
|
| 547 |
+
)
|
| 548 |
|
| 549 |
def run_evaluation(self, questions_df: pd.DataFrame) -> None:
|
| 550 |
"""Run evaluation with async LLM queries and controlled concurrency"""
|
|
|
|
| 588 |
)
|
| 589 |
)
|
| 590 |
|
| 591 |
+
# Calculate n_errors
|
| 592 |
+
n_errors = (
|
| 593 |
+
len(questions_df) - len(results_log)
|
| 594 |
+
if results_log
|
| 595 |
+
else len(questions_df)
|
| 596 |
+
)
|
| 597 |
+
|
| 598 |
# Process results
|
| 599 |
results_df = pd.DataFrame(results_log)
|
| 600 |
results_df["experiment_name"] = (
|
|
|
|
| 625 |
if values:
|
| 626 |
mean_value = sum(values) / len(values)
|
| 627 |
mlflow.log_metric(f"mean_{name}", mean_value)
|
| 628 |
+
mlflow.log_metric(f"error_rate", n_errors / len(questions_df))
|
| 629 |
self._logger.info(f"Mean {name}: {mean_value:.3f}")
|
|
|
|
|
|
|
|
|
src/rag/storage.py
CHANGED
|
@@ -445,7 +445,7 @@ class ChromaSlideStore:
|
|
| 445 |
"""Get embeddings for texts"""
|
| 446 |
return self._embeddings.embed_documents(texts)
|
| 447 |
|
| 448 |
-
def
|
| 449 |
self,
|
| 450 |
query: str,
|
| 451 |
n_results: int = 10,
|
|
@@ -462,14 +462,27 @@ class ChromaSlideStore:
|
|
| 462 |
List of ScoredChunks sorted by similarity
|
| 463 |
"""
|
| 464 |
# Get query embedding
|
| 465 |
-
query_embedding = self._embeddings.
|
| 466 |
|
| 467 |
# Query ChromaDB
|
| 468 |
result = self._collection.query(
|
| 469 |
query_embeddings=[query_embedding], n_results=n_results, where=where
|
| 470 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
return result
|
| 472 |
|
|
|
|
|
|
|
|
|
|
| 473 |
def _process_chroma_results(self, results: QueryResult) -> List[ScoredChunk]:
|
| 474 |
"""Convert ChromaDB results to list of (Document, score) tuples
|
| 475 |
|
|
@@ -490,7 +503,7 @@ class ChromaSlideStore:
|
|
| 490 |
|
| 491 |
return sorted(scored_chunks, key=lambda chunk: chunk.score)
|
| 492 |
|
| 493 |
-
def
|
| 494 |
self,
|
| 495 |
query: str,
|
| 496 |
chunk_types: Optional[List[str]] = None,
|
|
@@ -545,7 +558,10 @@ class ChromaSlideStore:
|
|
| 545 |
),
|
| 546 |
)
|
| 547 |
|
| 548 |
-
def
|
|
|
|
|
|
|
|
|
|
| 549 |
self,
|
| 550 |
query: str,
|
| 551 |
chunk_types: Optional[List[str]] = None,
|
|
@@ -566,7 +582,7 @@ class ChromaSlideStore:
|
|
| 566 |
List of search results with full slide context, deduplicated by slide_id
|
| 567 |
"""
|
| 568 |
# First perform regular search
|
| 569 |
-
search_results = self.
|
| 570 |
query=query,
|
| 571 |
chunk_types=chunk_types,
|
| 572 |
n_results=n_results, # * 3, # Get more to ensure different pages
|
|
@@ -621,7 +637,10 @@ class ChromaSlideStore:
|
|
| 621 |
|
| 622 |
return page_results # [:n_results]
|
| 623 |
|
| 624 |
-
def
|
|
|
|
|
|
|
|
|
|
| 625 |
self,
|
| 626 |
query: str,
|
| 627 |
chunk_types: Optional[List[str]] = None,
|
|
@@ -644,7 +663,7 @@ class ChromaSlideStore:
|
|
| 644 |
List of presentations with their matching slides, sorted by best match
|
| 645 |
"""
|
| 646 |
# Get initial search results with enough buffer for filtering
|
| 647 |
-
search_results = self.
|
| 648 |
query=query,
|
| 649 |
chunk_types=chunk_types,
|
| 650 |
n_results=n_results,
|
|
@@ -689,6 +708,9 @@ class ChromaSlideStore:
|
|
| 689 |
|
| 690 |
return ScoredPresentations(presentations=presentation_results, scorer=scorer)
|
| 691 |
|
|
|
|
|
|
|
|
|
|
| 692 |
def get_by_metadata(
|
| 693 |
self, metadata_filter: Dict, n_results: Optional[int] = None
|
| 694 |
) -> List[Document]:
|
|
|
|
| 445 |
"""Get embeddings for texts"""
|
| 446 |
return self._embeddings.embed_documents(texts)
|
| 447 |
|
| 448 |
+
async def aquery_storage(
|
| 449 |
self,
|
| 450 |
query: str,
|
| 451 |
n_results: int = 10,
|
|
|
|
| 462 |
List of ScoredChunks sorted by similarity
|
| 463 |
"""
|
| 464 |
# Get query embedding
|
| 465 |
+
query_embedding = await self._embeddings.aembed_query(query)
|
| 466 |
|
| 467 |
# Query ChromaDB
|
| 468 |
result = self._collection.query(
|
| 469 |
query_embeddings=[query_embedding], n_results=n_results, where=where
|
| 470 |
)
|
| 471 |
+
|
| 472 |
+
## Run ChromaDB query in executor to avoid blocking
|
| 473 |
+
# result = await asyncio.get_event_loop().run_in_executor(
|
| 474 |
+
# None,
|
| 475 |
+
# lambda: self._collection.query(
|
| 476 |
+
# query_embeddings=[query_embedding],
|
| 477 |
+
# n_results=n_results,
|
| 478 |
+
# where=where
|
| 479 |
+
# )
|
| 480 |
+
# )
|
| 481 |
return result
|
| 482 |
|
| 483 |
+
def query_storage(self, *args, **kwargs):
|
| 484 |
+
return asyncio.run(self.aquery_storage(*args, **kwargs))
|
| 485 |
+
|
| 486 |
def _process_chroma_results(self, results: QueryResult) -> List[ScoredChunk]:
|
| 487 |
"""Convert ChromaDB results to list of (Document, score) tuples
|
| 488 |
|
|
|
|
| 503 |
|
| 504 |
return sorted(scored_chunks, key=lambda chunk: chunk.score)
|
| 505 |
|
| 506 |
+
async def asearch_query(
|
| 507 |
self,
|
| 508 |
query: str,
|
| 509 |
chunk_types: Optional[List[str]] = None,
|
|
|
|
| 558 |
),
|
| 559 |
)
|
| 560 |
|
| 561 |
+
def search_query(self, *args, **kwargs):
|
| 562 |
+
return asyncio.run(self.asearch_query(*args, **kwargs))
|
| 563 |
+
|
| 564 |
+
async def asearch_query_pages(
|
| 565 |
self,
|
| 566 |
query: str,
|
| 567 |
chunk_types: Optional[List[str]] = None,
|
|
|
|
| 582 |
List of search results with full slide context, deduplicated by slide_id
|
| 583 |
"""
|
| 584 |
# First perform regular search
|
| 585 |
+
search_results = await self.asearch_query(
|
| 586 |
query=query,
|
| 587 |
chunk_types=chunk_types,
|
| 588 |
n_results=n_results, # * 3, # Get more to ensure different pages
|
|
|
|
| 637 |
|
| 638 |
return page_results # [:n_results]
|
| 639 |
|
| 640 |
+
def search_query_pages(self, *args, **kwargs):
|
| 641 |
+
return asyncio.run(self.asearch_query_pages(*args, **kwargs))
|
| 642 |
+
|
| 643 |
+
async def asearch_query_presentations(
|
| 644 |
self,
|
| 645 |
query: str,
|
| 646 |
chunk_types: Optional[List[str]] = None,
|
|
|
|
| 663 |
List of presentations with their matching slides, sorted by best match
|
| 664 |
"""
|
| 665 |
# Get initial search results with enough buffer for filtering
|
| 666 |
+
search_results = await self.asearch_query_pages(
|
| 667 |
query=query,
|
| 668 |
chunk_types=chunk_types,
|
| 669 |
n_results=n_results,
|
|
|
|
| 708 |
|
| 709 |
return ScoredPresentations(presentations=presentation_results, scorer=scorer)
|
| 710 |
|
| 711 |
+
def search_query_presentations(self, *args, **kwargs):
|
| 712 |
+
return asyncio.run(self.asearch_query_presentations(*args, **kwargs))
|
| 713 |
+
|
| 714 |
def get_by_metadata(
|
| 715 |
self, metadata_filter: Dict, n_results: Optional[int] = None
|
| 716 |
) -> List[Document]:
|