Spaces:
Sleeping
Sleeping
Upload src\evaluation.py with huggingface_hub
Browse files- src//evaluation.py +152 -0
src//evaluation.py
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation harness for testing the Context Thread Agent
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from src.parser import NotebookParser
|
| 8 |
+
from src.dependencies import ContextThreadBuilder
|
| 9 |
+
from src.indexing import FAISSIndexer
|
| 10 |
+
from src.retrieval import RetrievalEngine
|
| 11 |
+
from src.reasoning import ContextualAnsweringSystem
|
| 12 |
+
from src.intent import ContextThreadEnricher
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class EvaluationHarness:
    """Evaluation harness for notebook Q&A.

    Runs the full pipeline (parse -> build thread -> enrich -> index ->
    retrieve -> answer) over every notebook in a directory, scoring each
    answer by how many citations it produces.
    """

    def __init__(self, notebooks_dir: str):
        # Directory scanned for *.ipynb files by evaluate_all().
        self.notebooks_dir = Path(notebooks_dir)
        # Per-notebook result dicts, appended by evaluate_all().
        self.results: List[Dict] = []

    def evaluate_all(self, queries_per_notebook: int = 3) -> Dict:
        """Evaluate every *.ipynb in the directory and return aggregate stats."""
        notebook_files = list(self.notebooks_dir.glob("*.ipynb"))

        for nb_file in notebook_files:
            print(f"Evaluating {nb_file.name}...")
            result = self.evaluate_notebook(str(nb_file), queries_per_notebook)
            self.results.append(result)

        return self._aggregate_results()

    def evaluate_notebook(self, notebook_path: str, num_queries: int) -> Dict:
        """Run the full pipeline on one notebook and score sample queries.

        Returns a dict with the notebook name, cell count, number of queries
        tested, the per-query scores, and their mean.
        """
        # Load and parse the notebook into cells.
        parser = NotebookParser()
        result = parser.parse_file(notebook_path)
        cells = result['cells']

        # Build the context thread from the parsed cells.
        builder = ContextThreadBuilder(
            notebook_name=Path(notebook_path).stem,
            thread_id=f"eval_{id(self)}"  # unique per harness instance
        )
        builder.add_cells(cells)
        thread = builder.build()

        # Enrich the thread with intent metadata.
        enricher = ContextThreadEnricher()
        thread = enricher.enrich(thread)

        # Index all thread units for similarity retrieval.
        indexer = FAISSIndexer()
        indexer.add_multiple(thread.units)

        # Wire up retrieval and answering.
        engine = RetrievalEngine(thread, indexer)
        answering_system = ContextualAnsweringSystem(engine)

        # Generate sample queries from notebook content (heuristic).
        queries = self._generate_sample_queries(cells, num_queries)

        # Score each query: 0.2 per citation, capped at 1.0; failures score 0.
        scores = []
        for query in queries:
            try:
                response = answering_system.answer_question(query)
                score = min(len(response.citations) * 0.2, 1.0)
                scores.append(score)
            except Exception:
                # Best-effort evaluation: a crashing query counts as a miss
                # rather than aborting the whole notebook run.
                scores.append(0.0)

        return {
            'notebook': Path(notebook_path).name,
            'total_cells': len(cells),
            'queries_tested': len(queries),
            'avg_score': sum(scores) / len(scores) if scores else 0,
            'scores': scores
        }

    def _generate_sample_queries(self, cells: List, num_queries: int) -> List[str]:
        """Generate up to num_queries content-aware queries, padded with generics."""
        queries = []

        # Content-triggered queries via simple substring checks on cell source.
        if any('Q4' in cell.source for cell in cells):
            queries.append("Why did we remove Q4 data?")

        if any('plot' in cell.source.lower() for cell in cells):
            queries.append("What does the visualization show?")

        if any('model' in cell.source.lower() for cell in cells):
            queries.append("What model was used?")

        # Pad with generic fallbacks until the requested count is reached.
        generic_queries = [
            "What is the main purpose of this analysis?",
            "What data was used?",
            "What were the key findings?"
        ]

        while len(queries) < num_queries and generic_queries:
            queries.append(generic_queries.pop(0))

        return queries[:num_queries]

    def _aggregate_results(self) -> Dict:
        """Aggregate per-notebook results into overall statistics."""
        if not self.results:
            return {}

        total_notebooks = len(self.results)
        avg_score = sum(r['avg_score'] for r in self.results) / total_notebooks
        total_queries = sum(r['queries_tested'] for r in self.results)

        return {
            'total_notebooks': total_notebooks,
            'total_queries': total_queries,
            'average_score': avg_score,
            'results': self.results
        }

    def print_summary(self, summary: Dict):
        """Print a human-readable evaluation summary to stdout."""
        print("\n" + "="*50)
        print("EVALUATION SUMMARY")
        print("="*50)
        print(f"Notebooks evaluated: {summary.get('total_notebooks', 0)}")
        print(f"Total queries: {summary.get('total_queries', 0)}")
        # BUG FIX: was a bare print(".2%") which printed the literal text
        # ".2%" — restore the intended percentage-formatted f-string.
        print(f"Average score: {summary.get('average_score', 0):.2%}")

        for result in summary.get('results', []):
            print(f"\n{result['notebook']}:")
            print(f"  Cells: {result['total_cells']}")
            print(f"  Queries: {result['queries_tested']}")
            # BUG FIX: same mangled print(".2%") — restore the f-string.
            print(f"  Avg score: {result['avg_score']:.2%}")

    def save_results(self, output_file: str):
        """Save per-notebook results to a CSV file (one row per notebook)."""
        import csv

        with open(output_file, 'w', newline='') as f:
            writer = csv.DictWriter(
                f,
                fieldnames=['notebook', 'total_cells', 'queries_tested', 'avg_score']
            )
            writer.writeheader()
            for result in self.results:
                writer.writerow({
                    'notebook': result['notebook'],
                    'total_cells': result['total_cells'],
                    'queries_tested': result['queries_tested'],
                    'avg_score': result['avg_score']
                })
|