Spaces:

visualisable-ai
/

api

Paused

gary-boon Claude commited on Sep 16, 2025

Commit

ae9e159

1 Parent(s): 1d23728

Fix SWE-bench service to gracefully handle dataset loading failures

- Add fallback to mock data when HuggingFace dataset loading fails
- Improve error handling for deployment environments
- Add detailed mock problem statements for better testing
- Fix initialization errors on HuggingFace Spaces

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

backend/swe_bench_service.py +96 -26

backend/swe_bench_service.py CHANGED Viewed

@@ -94,6 +94,69 @@ class SWEBenchService:
         self.dataset_loaded = False
         self.metrics_cache: Dict[str, Any] = {}
     async def load_dataset(self, dataset_name: str = "princeton-nlp/SWE-bench_Lite"):
         """Load SWE-bench dataset from Hugging Face"""
         try:
@@ -101,38 +164,45 @@ class SWEBenchService:
             logger.info(f"Loading SWE-bench dataset: {dataset_name}")
-            # Load the dataset
-            dataset = load_dataset(dataset_name, split='test')
-            # Convert to our task format
-            for item in dataset:
-                task = SWEBenchTask(
-                    instance_id=item['instance_id'],
-                    repo=item['repo'],
-                    problem_statement=item['problem_statement'],
-                    base_commit=item['base_commit'],
-                    patch=item.get('patch'),
-                    test_patch=item.get('test_patch'),
-                    hints_text=item.get('hints_text'),
-                    created_at=item.get('created_at'),
-                    version=item.get('version'),
-                    FAIL_TO_PASS=item.get('FAIL_TO_PASS'),
-                    PASS_TO_PASS=item.get('PASS_TO_PASS')
-                )
-                self.tasks[task.instance_id] = task
-            self.dataset_loaded = True
-            logger.info(f"Loaded {len(self.tasks)} SWE-bench tasks")
             # Initialize metrics cache
             self._update_metrics_cache()
         except ImportError:
-            logger.error("datasets library not installed. Run: pip install datasets")
-            raise
         except Exception as e:
-            logger.error(f"Failed to load SWE-bench dataset: {e}")
-            raise
     def get_tasks(
         self,

         self.dataset_loaded = False
         self.metrics_cache: Dict[str, Any] = {}
+    def _load_mock_tasks(self):
+        """Load mock tasks when dataset isn't available"""
+        repos = [
+            "astropy/astropy", "django/django", "matplotlib/matplotlib",
+            "pandas-dev/pandas", "pytest-dev/pytest", "scikit-learn/scikit-learn"
+        ]
+        statements = [
+            """Modeling's `separability_matrix` does not compute separability correctly for nested CompoundModels
+Consider the following model:
+```python
+from astropy.modeling import models as m
+from astropy.modeling.separable import separable_matrix
+cm = m.Linear1D(10) & m.Linear1D(5)
+```
+It's separability matrix as you might expect is a diagonal:
+```python
+>>> separability_matrix(cm)
+array([[ True, False],
+       [False,  True]])
+```""",
+            """Please support header rows in RestructuredText output
+### Description
+It would be great if the RestructuredText output could have header rows for tables, similar to what MySQL does for pipe formatting.
+### Expected behavior
+According to the documentation for MyST parsers, the docutils RST table expects the first row to be treated as a header row.
+### Actual behavior
+The RST output treats the first row as a regular data row and doesn't mark it as a header.""",
+            """Issue when parsing empty lists/arrays in configuration
+When attempting to parse empty lists or arrays from configuration files, the parser incorrectly raises a ValueError instead of returning an empty list.
+```python
+>>> config.parse_list("[]")
+ValueError: invalid literal for int() with base 10: '[]'
+```
+Expected behavior: Should return an empty list []"""
+        ]
+        for i in range(100):  # Create 100 mock tasks
+            repo = repos[i % len(repos)]
+            task = SWEBenchTask(
+                instance_id=f"{repo.split('/')[1]}__{i+11000}",
+                repo=repo,
+                problem_statement=statements[i % len(statements)],
+                base_commit=f"commit_{i:04d}",
+                patch="# Mock patch\n+ line added\n- line removed",
+                FAIL_TO_PASS=["test_1", "test_2"] if i % 2 == 0 else ["test_a"],
+                PASS_TO_PASS=["test_pass_1", "test_pass_2"]
+            )
+            self.tasks[task.instance_id] = task
+        logger.info(f"Loaded {len(self.tasks)} mock SWE-bench tasks")
     async def load_dataset(self, dataset_name: str = "princeton-nlp/SWE-bench_Lite"):
         """Load SWE-bench dataset from Hugging Face"""
         try:
             logger.info(f"Loading SWE-bench dataset: {dataset_name}")
+            # Load the dataset with error handling
+            try:
+                dataset = load_dataset(dataset_name, split='test')
+                # Convert to our task format
+                for item in dataset:
+                    task = SWEBenchTask(
+                        instance_id=item['instance_id'],
+                        repo=item['repo'],
+                        problem_statement=item['problem_statement'],
+                        base_commit=item['base_commit'],
+                        patch=item.get('patch'),
+                        test_patch=item.get('test_patch'),
+                        hints_text=item.get('hints_text'),
+                        created_at=item.get('created_at'),
+                        version=item.get('version'),
+                        FAIL_TO_PASS=item.get('FAIL_TO_PASS'),
+                        PASS_TO_PASS=item.get('PASS_TO_PASS')
+                    )
+                    self.tasks[task.instance_id] = task
+                self.dataset_loaded = True
+                logger.info(f"Loaded {len(self.tasks)} SWE-bench tasks")
+            except Exception as dataset_error:
+                logger.warning(f"Could not load full dataset, using mock data: {dataset_error}")
+                self._load_mock_tasks()
+                self.dataset_loaded = True
             # Initialize metrics cache
             self._update_metrics_cache()
         except ImportError:
+            logger.error("datasets library not installed. Using mock data instead")
+            self._load_mock_tasks()
+            self.dataset_loaded = True
         except Exception as e:
+            logger.error(f"Failed to load SWE-bench dataset, using mock: {e}")
+            self._load_mock_tasks()
+            self.dataset_loaded = True
     def get_tasks(
         self,