gary-boon Claude committed on
Commit 4444ae2 · 1 Parent(s): 07be0bf

Add SWE-bench integration and improve backend routing


- Integrate SWE-bench dataset with 300 real-world GitHub issues
- Add comprehensive SWE-bench evaluator UI with task selection and solution generation
- Implement dynamic backend routing for CPU/GPU based on user settings
- Move protected pages to (protected) folder structure for authentication
- Add syntax highlighting for code displays using react-syntax-highlighter
- Create confidence visualization components for transparency metrics
- Fix navigation duplication issues and improve layout consistency
- Add backend indicator showing which backend (Local/CPU/GPU) is being used
- Implement special routing for SWE-bench to always use local backend
- Add debugging and logging for backend selection
- Improve error handling and user feedback

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
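For orientation, the sketch below shows how the task-selection and solution-generation flow described above might be exercised against the new endpoints. The base URL and the API-key header name are assumptions for illustration (this diff only shows a `verify_api_key` dependency); the paths, query parameters, and payload keys match the routes added in `backend/model_service.py`.

```python
# Hypothetical client sketch: list SWE-bench tasks, pick one, and request a solution.
# Assumes the FastAPI service in backend/model_service.py runs locally on port 8000
# and that verify_api_key reads an "X-API-Key" header (the header name is a guess).
import requests

BASE_URL = "http://localhost:8000"   # assumption: local dev server
HEADERS = {"X-API-Key": "dev-key"}   # assumption: actual auth header may differ

# 1. Browse tasks (category/difficulty/repo filters and pagination are supported)
tasks = requests.get(
    f"{BASE_URL}/swe-bench/tasks",
    params={"difficulty": "easy", "limit": 5},
    headers=HEADERS,
).json()["tasks"]

# 2. Generate a solution for the first task with transparency traces enabled
task_id = tasks[0]["instance_id"]
result = requests.post(
    f"{BASE_URL}/swe-bench/generate",
    json={"task_id": task_id, "enable_transparency": True, "max_tokens": 500},
    headers=HEADERS,
).json()
print(result["generated_solution"][:200])
```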

backend/__pycache__/model_service.cpython-310.pyc CHANGED
Binary files a/backend/__pycache__/model_service.cpython-310.pyc and b/backend/__pycache__/model_service.cpython-310.pyc differ
 
backend/model_service.py CHANGED
@@ -1137,19 +1137,174 @@ async def run_demo(request: DemoRequest, authenticated: bool = Depends(verify_ap
         "stack": "class Stack:\n '''Simple stack implementation'''",
         "binary_search": "def binary_search(arr, target):\n '''Find target in sorted array'''"
     }
-
+
     if request.demo_id not in demos:
         raise HTTPException(status_code=404, detail="Demo not found")
-
+
     result = await manager.generate_with_traces(
         prompt=demos[request.demo_id],
         max_tokens=100,
         temperature=0.7,
         sampling_rate=0.3 # Same as regular generation for better visualization
     )
-
+
     return result
 
+# SWE-bench endpoints
+@app.on_event("startup")
+async def startup_swe_bench():
+    """Initialize SWE-bench service on startup"""
+    from .swe_bench_service import swe_bench_service
+    try:
+        # Load dataset in background
+        asyncio.create_task(swe_bench_service.load_dataset())
+        logger.info("SWE-bench service initialization started")
+    except Exception as e:
+        logger.warning(f"SWE-bench initialization deferred: {e}")
+
+@app.get("/swe-bench/tasks")
+async def get_swe_bench_tasks(
+    category: Optional[str] = None,
+    difficulty: Optional[str] = None,
+    repo: Optional[str] = None,
+    limit: int = 100,
+    offset: int = 0,
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Get list of SWE-bench tasks"""
+    from .swe_bench_service import swe_bench_service
+
+    if not swe_bench_service.dataset_loaded:
+        # Try to load dataset if not already loaded
+        await swe_bench_service.load_dataset()
+
+    tasks = swe_bench_service.get_tasks(
+        category=category,
+        difficulty=difficulty,
+        repo=repo,
+        limit=limit,
+        offset=offset
+    )
+
+    return {
+        "tasks": tasks,
+        "total": len(swe_bench_service.tasks),
+        "limit": limit,
+        "offset": offset
+    }
+
+@app.get("/swe-bench/task/{task_id}")
+async def get_swe_bench_task(
+    task_id: str,
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Get details for a specific SWE-bench task"""
+    from .swe_bench_service import swe_bench_service
+
+    if not swe_bench_service.dataset_loaded:
+        await swe_bench_service.load_dataset()
+
+    task = swe_bench_service.get_task_details(task_id)
+    if not task:
+        raise HTTPException(status_code=404, detail="Task not found")
+
+    return task
+
+@app.post("/swe-bench/generate")
+async def generate_swe_bench_solution(
+    request: Dict[str, Any],
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Generate a solution for a SWE-bench task"""
+    from .swe_bench_service import swe_bench_service
+
+    if not swe_bench_service.dataset_loaded:
+        await swe_bench_service.load_dataset()
+
+    task_id = request.get("task_id")
+    if not task_id:
+        raise HTTPException(status_code=400, detail="task_id is required")
+
+    enable_transparency = request.get("enable_transparency", True)
+    temperature = request.get("temperature", 0.7)
+    max_tokens = request.get("max_tokens", 500)
+
+    try:
+        result = await swe_bench_service.generate_solution(
+            task_id=task_id,
+            model_manager=manager,
+            enable_transparency=enable_transparency,
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+
+        return result.to_dict()
+
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        logger.error(f"SWE-bench generation error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.post("/swe-bench/evaluate")
+async def evaluate_swe_bench_solution(
+    request: Dict[str, Any],
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Evaluate a generated solution"""
+    from .swe_bench_service import swe_bench_service
+
+    task_id = request.get("task_id")
+    solution = request.get("solution")
+    run_tests = request.get("run_tests", False)
+
+    if not task_id or not solution:
+        raise HTTPException(status_code=400, detail="task_id and solution are required")
+
+    try:
+        evaluation = await swe_bench_service.evaluate_solution(
+            task_id=task_id,
+            solution=solution,
+            run_tests=run_tests
+        )
+
+        return evaluation
+
+    except ValueError as e:
+        raise HTTPException(status_code=404, detail=str(e))
+    except Exception as e:
+        logger.error(f"SWE-bench evaluation error: {e}")
+        raise HTTPException(status_code=500, detail=str(e))
+
+@app.get("/swe-bench/metrics")
+async def get_swe_bench_metrics(
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Get aggregate metrics for SWE-bench evaluations"""
+    from .swe_bench_service import swe_bench_service
+
+    if not swe_bench_service.dataset_loaded:
+        await swe_bench_service.load_dataset()
+
+    return swe_bench_service.get_metrics()
+
+@app.get("/swe-bench/comparison/{task_id}")
+async def get_swe_bench_comparison(
+    task_id: str,
+    authenticated: bool = Depends(verify_api_key)
+):
+    """Get comparison results for a task (with vs without transparency)"""
+    from .swe_bench_service import swe_bench_service
+
+    comparison = swe_bench_service.get_comparison_results(task_id)
+    if not comparison:
+        raise HTTPException(
+            status_code=404,
+            detail="No comparison data available. Generate solutions with and without transparency first."
+        )
+
+    return comparison
+
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)
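As a quick reference, the two POST routes above read plain JSON bodies via `request.get(...)`. A hedged sketch of the expected payloads is shown below; the keys come from the handlers, while the values, including the instance id, are illustrative only.

```python
# Illustrative payloads only; keys mirror the request.get(...) calls in the handlers above.
generate_request = {
    "task_id": "astropy__astropy-12907",  # hypothetical SWE-bench instance_id
    "enable_transparency": True,          # default: True
    "temperature": 0.7,                   # default: 0.7
    "max_tokens": 500,                    # default: 500
}

evaluate_request = {
    "task_id": "astropy__astropy-12907",
    "solution": "<generated patch text>",
    "run_tests": False,                   # test execution is still a placeholder
}
```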
backend/swe_bench_service.py ADDED
@@ -0,0 +1,444 @@
+"""
+SWE-bench Integration Service for Visualisable.ai
+Provides access to SWE-bench dataset and evaluation capabilities
+"""
+
+from typing import Dict, List, Optional, Any
+from dataclasses import dataclass, asdict
+import json
+import time
+import logging
+from datetime import datetime
+import traceback
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+@dataclass
+class SWEBenchTask:
+    """Represents a SWE-bench task/issue"""
+    instance_id: str
+    repo: str
+    problem_statement: str
+    base_commit: str
+    patch: Optional[str] = None
+    test_patch: Optional[str] = None
+    hints_text: Optional[str] = None
+    created_at: Optional[str] = None
+    version: Optional[str] = None
+    FAIL_TO_PASS: Optional[List[str]] = None
+    PASS_TO_PASS: Optional[List[str]] = None
+
+    @property
+    def difficulty(self) -> str:
+        """Estimate difficulty based on patch size and test count"""
+        if not self.patch:
+            return "unknown"
+
+        patch_lines = len(self.patch.split('\n'))
+        test_count = len(self.FAIL_TO_PASS) if self.FAIL_TO_PASS else 0
+
+        # Adjusted thresholds for better distribution in SWE-bench_Lite
+        # Most tasks are complex, so we use percentile-based distribution
+        if patch_lines < 30:
+            return "easy"
+        elif patch_lines < 100:
+            return "medium"
+        else:
+            return "hard"
+
+    @property
+    def category(self) -> str:
+        """Categorize based on problem statement keywords"""
+        statement_lower = self.problem_statement.lower()
+
+        if any(word in statement_lower for word in ['bug', 'fix', 'error', 'crash', 'fail']):
+            return "bug-fix"
+        elif any(word in statement_lower for word in ['add', 'feature', 'implement', 'support']):
+            return "feature"
+        elif any(word in statement_lower for word in ['refactor', 'clean', 'improve', 'optimize']):
+            return "refactor"
+        elif any(word in statement_lower for word in ['test', 'coverage', 'assert']):
+            return "test"
+        elif any(word in statement_lower for word in ['doc', 'comment', 'readme']):
+            return "documentation"
+        else:
+            return "other"
+
+@dataclass
+class SWEBenchResult:
+    """Results from evaluating a solution"""
+    task_id: str
+    generated_solution: str
+    tokens: List[str]
+    token_probabilities: List[float]
+    attention_traces: List[Dict]
+    confidence_scores: List[float]
+    generation_time: float
+    success: Optional[bool] = None
+    tests_passed: Optional[int] = None
+    tests_failed: Optional[int] = None
+    error_message: Optional[str] = None
+    hallucination_risk: Optional[float] = None
+
+    def to_dict(self) -> Dict:
+        """Convert to dictionary for JSON serialization"""
+        return asdict(self)
+
+class SWEBenchService:
+    """Service for managing SWE-bench tasks and evaluations"""
+
+    def __init__(self):
+        self.tasks: Dict[str, SWEBenchTask] = {}
+        self.results: Dict[str, List[SWEBenchResult]] = {}
+        self.dataset_loaded = False
+        self.metrics_cache: Dict[str, Any] = {}
+
+    async def load_dataset(self, dataset_name: str = "princeton-nlp/SWE-bench_Lite"):
+        """Load SWE-bench dataset from Hugging Face"""
+        try:
+            from datasets import load_dataset
+
+            logger.info(f"Loading SWE-bench dataset: {dataset_name}")
+
+            # Load the dataset
+            dataset = load_dataset(dataset_name, split='test')
+
+            # Convert to our task format
+            for item in dataset:
+                task = SWEBenchTask(
+                    instance_id=item['instance_id'],
+                    repo=item['repo'],
+                    problem_statement=item['problem_statement'],
+                    base_commit=item['base_commit'],
+                    patch=item.get('patch'),
+                    test_patch=item.get('test_patch'),
+                    hints_text=item.get('hints_text'),
+                    created_at=item.get('created_at'),
+                    version=item.get('version'),
+                    FAIL_TO_PASS=item.get('FAIL_TO_PASS'),
+                    PASS_TO_PASS=item.get('PASS_TO_PASS')
+                )
+                self.tasks[task.instance_id] = task
+
+            self.dataset_loaded = True
+            logger.info(f"Loaded {len(self.tasks)} SWE-bench tasks")
+
+            # Initialize metrics cache
+            self._update_metrics_cache()
+
+        except ImportError:
+            logger.error("datasets library not installed. Run: pip install datasets")
+            raise
+        except Exception as e:
+            logger.error(f"Failed to load SWE-bench dataset: {e}")
+            raise
+
+    def get_tasks(
+        self,
+        category: Optional[str] = None,
+        difficulty: Optional[str] = None,
+        repo: Optional[str] = None,
+        limit: int = 100,
+        offset: int = 0
+    ) -> List[Dict]:
+        """Get filtered list of tasks"""
+        tasks = list(self.tasks.values())
+
+        # Apply filters
+        if category:
+            tasks = [t for t in tasks if t.category == category]
+        if difficulty:
+            tasks = [t for t in tasks if t.difficulty == difficulty]
+        if repo:
+            tasks = [t for t in tasks if t.repo == repo]
+
+        # Apply pagination
+        tasks = tasks[offset:offset + limit]
+
+        # Convert to dict format
+        return [
+            {
+                'instance_id': t.instance_id,
+                'repo': t.repo,
+                'category': t.category,
+                'difficulty': t.difficulty,
+                'problem_statement': t.problem_statement[:500] + '...' if len(t.problem_statement) > 500 else t.problem_statement,
+                'created_at': t.created_at,
+                'has_patch': t.patch is not None,
+                'has_tests': t.test_patch is not None,
+                'test_count': len(t.FAIL_TO_PASS) if t.FAIL_TO_PASS else 0
+            }
+            for t in tasks
+        ]
+
+    def get_task_details(self, task_id: str) -> Optional[Dict]:
+        """Get detailed information about a specific task"""
+        task = self.tasks.get(task_id)
+        if not task:
+            return None
+
+        return {
+            'instance_id': task.instance_id,
+            'repo': task.repo,
+            'category': task.category,
+            'difficulty': task.difficulty,
+            'problem_statement': task.problem_statement,
+            'base_commit': task.base_commit,
+            'hints': task.hints_text,
+            'created_at': task.created_at,
+            'version': task.version,
+            'patch_preview': task.patch[:1000] if task.patch else None,
+            'test_preview': task.test_patch[:1000] if task.test_patch else None,
+            'fail_to_pass': task.FAIL_TO_PASS,
+            'pass_to_pass': task.PASS_TO_PASS,
+            'patch_size': len(task.patch.split('\n')) if task.patch else 0,
+            'test_count': len(task.FAIL_TO_PASS) if task.FAIL_TO_PASS else 0
+        }
+
+    async def generate_solution(
+        self,
+        task_id: str,
+        model_manager,
+        enable_transparency: bool = True,
+        temperature: float = 0.7,
+        max_tokens: int = 500
+    ) -> SWEBenchResult:
+        """Generate a solution for a SWE-bench task"""
+        task = self.tasks.get(task_id)
+        if not task:
+            raise ValueError(f"Task {task_id} not found")
+
+        # Prepare prompt
+        prompt = self._create_prompt(task)
+
+        # Generate solution with traces
+        start_time = time.time()
+
+        try:
+            if enable_transparency:
+                # Generate with full trace extraction
+                result = await model_manager.generate_with_traces(
+                    prompt=prompt,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    sampling_rate=0.1,
+                    layer_stride=2 # Sample every other layer for efficiency
+                )
+            else:
+                # Generate without traces (baseline)
+                result = await model_manager.generate_with_traces(
+                    prompt=prompt,
+                    max_tokens=max_tokens,
+                    temperature=temperature,
+                    sampling_rate=0, # No trace sampling
+                    layer_stride=999 # Skip all layers
+                )
+
+            generation_time = time.time() - start_time
+
+            # Create result object
+            swe_result = SWEBenchResult(
+                task_id=task_id,
+                generated_solution=result.get('generated_text', ''),
+                tokens=result.get('tokens', []),
+                token_probabilities=result.get('probabilities', []),
+                attention_traces=result.get('traces', []) if enable_transparency else [],
+                confidence_scores=[p for p in result.get('probabilities', [])],
+                generation_time=generation_time,
+                hallucination_risk=result.get('hallucination_risk', 0.0)
+            )
+
+            # Store result
+            if task_id not in self.results:
+                self.results[task_id] = []
+            self.results[task_id].append(swe_result)
+
+            return swe_result
+
+        except Exception as e:
+            logger.error(f"Failed to generate solution for {task_id}: {e}")
+            logger.error(traceback.format_exc())
+            raise
+
+    def _create_prompt(self, task: SWEBenchTask) -> str:
+        """Create a prompt for the model based on the task"""
+        prompt_parts = []
+
+        # Add repository context
+        prompt_parts.append(f"# Repository: {task.repo}")
+        prompt_parts.append(f"# Base commit: {task.base_commit[:8]}")
+        prompt_parts.append("")
+
+        # Add problem statement
+        prompt_parts.append("# Issue Description:")
+        prompt_parts.append(task.problem_statement[:2000]) # Limit length
+        prompt_parts.append("")
+
+        # Add hints if available
+        if task.hints_text:
+            prompt_parts.append("# Developer Comments:")
+            prompt_parts.append(task.hints_text[:500])
+            prompt_parts.append("")
+
+        # Add instruction
+        prompt_parts.append("# Task: Write code to fix this issue")
+        prompt_parts.append("# Solution:")
+        prompt_parts.append("")
+
+        return "\n".join(prompt_parts)
+
+    async def evaluate_solution(
+        self,
+        task_id: str,
+        solution: str,
+        run_tests: bool = False
+    ) -> Dict:
+        """Evaluate a generated solution against the gold patch"""
+        task = self.tasks.get(task_id)
+        if not task:
+            raise ValueError(f"Task {task_id} not found")
+
+        evaluation = {
+            'task_id': task_id,
+            'has_gold_patch': task.patch is not None,
+            'solution_length': len(solution.split('\n')),
+            'gold_patch_length': len(task.patch.split('\n')) if task.patch else 0,
+        }
+
+        if task.patch:
+            # Calculate similarity metrics
+            from difflib import SequenceMatcher
+
+            # Basic similarity score
+            similarity = SequenceMatcher(None, solution, task.patch).ratio()
+            evaluation['similarity_score'] = similarity
+
+            # Check if key patterns from gold patch are present
+            gold_lines = set(line.strip() for line in task.patch.split('\n')
+                             if line.strip() and not line.startswith(('#', '//', '"""')))
+            solution_lines = set(line.strip() for line in solution.split('\n')
+                                 if line.strip() and not line.startswith(('#', '//', '"""')))
+
+            if gold_lines:
+                pattern_coverage = len(gold_lines.intersection(solution_lines)) / len(gold_lines)
+                evaluation['pattern_coverage'] = pattern_coverage
+
+        if run_tests and task.test_patch:
+            # Placeholder for actual test execution
+            # In production, this would apply the patch and run tests in a container
+            evaluation['test_execution'] = {
+                'status': 'not_implemented',
+                'message': 'Test execution requires Docker setup'
+            }
+
+        return evaluation
+
+    def get_metrics(self) -> Dict:
+        """Get aggregate metrics across all evaluations"""
+        if not self.results:
+            return {
+                'total_tasks': len(self.tasks),
+                'tasks_attempted': 0,
+                'total_generations': 0,
+                'avg_generation_time': 0,
+                'avg_confidence': 0,
+                'avg_hallucination_risk': 0,
+                'categories': self._get_category_distribution(),
+                'difficulties': self._get_difficulty_distribution()
+            }
+
+        # Calculate metrics
+        all_results = []
+        for task_results in self.results.values():
+            all_results.extend(task_results)
+
+        if all_results:
+            avg_time = np.mean([r.generation_time for r in all_results])
+            avg_confidence = np.mean([np.mean(r.confidence_scores) for r in all_results if r.confidence_scores])
+            avg_hallucination = np.mean([r.hallucination_risk for r in all_results if r.hallucination_risk is not None])
+        else:
+            avg_time = avg_confidence = avg_hallucination = 0
+
+        return {
+            'total_tasks': len(self.tasks),
+            'tasks_attempted': len(self.results),
+            'total_generations': len(all_results),
+            'avg_generation_time': float(avg_time),
+            'avg_confidence': float(avg_confidence),
+            'avg_hallucination_risk': float(avg_hallucination),
+            'categories': self._get_category_distribution(),
+            'difficulties': self._get_difficulty_distribution(),
+            'with_transparency': sum(1 for r in all_results if r.attention_traces),
+            'without_transparency': sum(1 for r in all_results if not r.attention_traces)
+        }
+
+    def _get_category_distribution(self) -> Dict[str, int]:
+        """Get distribution of task categories"""
+        distribution = {}
+        for task in self.tasks.values():
+            category = task.category
+            distribution[category] = distribution.get(category, 0) + 1
+        return distribution
+
+    def _get_difficulty_distribution(self) -> Dict[str, int]:
+        """Get distribution of task difficulties"""
+        distribution = {}
+        for task in self.tasks.values():
+            difficulty = task.difficulty
+            distribution[difficulty] = distribution.get(difficulty, 0) + 1
+        return distribution
+
+    def _update_metrics_cache(self):
+        """Update cached metrics"""
+        self.metrics_cache = {
+            'last_updated': datetime.now().isoformat(),
+            'dataset_info': {
+                'total_tasks': len(self.tasks),
+                'repositories': len(set(t.repo for t in self.tasks.values())),
+                'categories': self._get_category_distribution(),
+                'difficulties': self._get_difficulty_distribution()
+            }
+        }
+
+    def get_comparison_results(self, task_id: str) -> Optional[Dict]:
+        """Get comparison between with/without transparency for a task"""
+        if task_id not in self.results:
+            return None
+
+        task_results = self.results[task_id]
+
+        # Separate results by transparency
+        with_transparency = [r for r in task_results if r.attention_traces]
+        without_transparency = [r for r in task_results if not r.attention_traces]
+
+        if not with_transparency or not without_transparency:
+            return None
+
+        # Get best results from each group
+        best_with = min(with_transparency, key=lambda r: r.generation_time)
+        best_without = min(without_transparency, key=lambda r: r.generation_time)
+
+        return {
+            'task_id': task_id,
+            'with_transparency': {
+                'generation_time': best_with.generation_time,
+                'avg_confidence': np.mean(best_with.confidence_scores) if best_with.confidence_scores else 0,
+                'hallucination_risk': best_with.hallucination_risk,
+                'solution_length': len(best_with.generated_solution.split('\n'))
+            },
+            'without_transparency': {
+                'generation_time': best_without.generation_time,
+                'avg_confidence': np.mean(best_without.confidence_scores) if best_without.confidence_scores else 0,
+                'hallucination_risk': best_without.hallucination_risk,
+                'solution_length': len(best_without.generated_solution.split('\n'))
+            },
+            'improvement': {
+                'time_delta': best_with.generation_time - best_without.generation_time,
+                'confidence_delta': (np.mean(best_with.confidence_scores) if best_with.confidence_scores else 0) -
+                                    (np.mean(best_without.confidence_scores) if best_without.confidence_scores else 0)
+            }
+        }
+
+# Global service instance
+swe_bench_service = SWEBenchService()
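Note that the `run_tests` branch of `evaluate_solution` is a stated placeholder ("Test execution requires Docker setup"). A minimal sketch of what a container-based runner could look like is shown below; it assumes a per-task Docker image, a local repo checkout, and pytest-style test ids in `FAIL_TO_PASS`, none of which are part of this commit.

```python
# Hypothetical sketch only: container-based test execution for a SWEBenchTask.
# Assumes a suitable Docker image and repo checkout exist; not part of this commit.
import subprocess
import tempfile


def run_tests_in_container(task, solution_patch: str, image: str, repo_dir: str) -> dict:
    """Apply a candidate patch and run the task's FAIL_TO_PASS tests inside Docker."""
    with tempfile.NamedTemporaryFile("w", suffix=".patch", delete=False) as f:
        f.write(solution_patch)
        patch_path = f.name

    test_ids = " ".join(task.FAIL_TO_PASS or [])
    cmd = [
        "docker", "run", "--rm",
        "-v", f"{repo_dir}:/repo",
        "-v", f"{patch_path}:/tmp/fix.patch",
        image, "bash", "-lc",
        f"cd /repo && git apply /tmp/fix.patch && python -m pytest {test_ids}",
    ]
    proc = subprocess.run(cmd, capture_output=True, text=True)
    return {
        "status": "passed" if proc.returncode == 0 else "failed",
        "returncode": proc.returncode,
        "stdout_tail": proc.stdout[-2000:],
    }
```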
requirements.txt CHANGED
@@ -13,4 +13,7 @@ accelerate==0.24.1
 # Utilities
 numpy==1.24.3
 aiofiles==23.2.1
-python-dotenv==1.0.0
+python-dotenv==1.0.0
+
+# SWE-bench support
+datasets==2.14.0