Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- Dockerfile +1 -0
- README.md +1 -0
- dataqa_env/server/app.py +0 -21
- dataqa_env/server/tasks.py +19 -15
Dockerfile
CHANGED
|
@@ -32,4 +32,5 @@ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
|
|
| 32 |
|
| 33 |
EXPOSE 8000
|
| 34 |
|
|
|
|
| 35 |
CMD ["uvicorn", "dataqa_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
|
|
|
| 32 |
|
| 33 |
EXPOSE 8000
|
| 34 |
|
| 35 |
+
ENV ENABLE_WEB_INTERFACE=true
|
| 36 |
CMD ["uvicorn", "dataqa_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
|
README.md
CHANGED
|
@@ -8,6 +8,7 @@ pinned: false
|
|
| 8 |
app_port: 8000
|
| 9 |
tags:
|
| 10 |
- openenv
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
# DataQA Environment
|
|
|
|
| 8 |
app_port: 8000
|
| 9 |
tags:
|
| 10 |
- openenv
|
| 11 |
+
base_path: /web
|
| 12 |
---
|
| 13 |
|
| 14 |
# DataQA Environment
|
dataqa_env/server/app.py
CHANGED
|
@@ -30,27 +30,6 @@ def root():
|
|
| 30 |
}
|
| 31 |
|
| 32 |
|
| 33 |
-
@app.get("/debug/{task_id}")
|
| 34 |
-
def debug_task(task_id: str):
|
| 35 |
-
"""Debug endpoint β test task creation."""
|
| 36 |
-
import traceback
|
| 37 |
-
try:
|
| 38 |
-
from .tasks import get_task
|
| 39 |
-
except ImportError:
|
| 40 |
-
from dataqa_env.server.tasks import get_task
|
| 41 |
-
try:
|
| 42 |
-
t = get_task(task_id)
|
| 43 |
-
return {
|
| 44 |
-
"status": "ok",
|
| 45 |
-
"task_id": t.task_id,
|
| 46 |
-
"name": t.name,
|
| 47 |
-
"issues": len(t.planted_issues),
|
| 48 |
-
"csv_lines": len(t.corrupted_csv.strip().splitlines()),
|
| 49 |
-
}
|
| 50 |
-
except Exception as e:
|
| 51 |
-
return {"status": "error", "error": str(e), "traceback": traceback.format_exc()}
|
| 52 |
-
|
| 53 |
-
|
| 54 |
def main():
|
| 55 |
import uvicorn
|
| 56 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|
| 30 |
}
|
| 31 |
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
def main():
|
| 34 |
import uvicorn
|
| 35 |
uvicorn.run(app, host="0.0.0.0", port=8000)
|
dataqa_env/server/tasks.py
CHANGED
|
@@ -612,8 +612,10 @@ def create_task_alignment(seed: int = 42) -> Task:
|
|
| 612 |
difficulty=2.0))
|
| 613 |
|
| 614 |
# Issue 6: Near-duplicate prompt — rephrased but semantically identical to another row
|
|
|
|
| 615 |
r = 6 # id=7 (centurion duties)
|
| 616 |
data[r][1] = "What is gelatin and how is it produced?" # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
|
|
|
|
| 617 |
issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
|
| 618 |
description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' β wastes training compute and biases model",
|
| 619 |
difficulty=2.5))
|
|
@@ -640,15 +642,13 @@ def create_task_alignment(seed: int = 42) -> Task:
|
|
| 640 |
difficulty=3.0))
|
| 641 |
|
| 642 |
# Issue 10: Helpfulness score contradicts response quality
|
| 643 |
-
# Row 17 about most destructive disaster β
|
| 644 |
r = 16 # id=17
|
| 645 |
-
data[r][3] = "4" # helpfulness=4 but response is
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
# This is arguably correct but gives no context, no detail β helpfulness=4 and correctness=4
|
| 649 |
-
# for a 4-word answer to "most destructive natural disaster" is clearly inflated
|
| 650 |
issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
|
| 651 |
-
description="Helpfulness score is 4 but response is only
|
| 652 |
difficulty=2.5))
|
| 653 |
|
| 654 |
# Issue 11: Whitespace-only prompt (data pipeline artifact)
|
|
@@ -722,8 +722,8 @@ def create_task_coding(seed: int = 42) -> Task:
|
|
| 722 |
- language: string, one of [python, javascript, sql, java, cpp, rust, go]
|
| 723 |
- difficulty: string, one of [easy, medium, hard]
|
| 724 |
- response: string, non-empty, contains code that solves the instruction
|
| 725 |
-
- test_cases: string, non-empty, contains assertions or
|
| 726 |
-
- is_correct: boolean (true/false), whether the response correctly solves the instruction
|
| 727 |
- category: string, one of [algorithms, data_structures, strings, web, databases, design_patterns]"""
|
| 728 |
|
| 729 |
rules = """1. No missing values in any column
|
|
@@ -735,7 +735,8 @@ def create_task_coding(seed: int = 42) -> Task:
|
|
| 735 |
7. response must be syntactically valid code (no truncation or syntax errors)
|
| 736 |
8. test_cases must be relevant to the instruction
|
| 737 |
9. No duplicate instructions (same problem stated differently counts as duplicate)
|
| 738 |
-
10. category must match the actual nature of the problem
|
|
|
|
| 739 |
|
| 740 |
rows = _csv_to_rows(clean_csv)
|
| 741 |
header = rows[0]
|
|
@@ -814,13 +815,15 @@ def create_task_coding(seed: int = 42) -> Task:
|
|
| 814 |
description="Response uses eval() on user input β critical security vulnerability (code injection) but is_correct=true",
|
| 815 |
difficulty=3.0))
|
| 816 |
|
| 817 |
-
# Issue 9: Duplicate instruction β row 14
|
| 818 |
-
# Change instruction to
|
| 819 |
r = 13
|
| 820 |
data[r][1] = "Implement merge sort algorithm."
|
|
|
|
|
|
|
| 821 |
issues.append(PlantedIssue(
|
| 822 |
row=r + 1, col="instruction", issue_type="duplicate_row",
|
| 823 |
-
description="
|
| 824 |
difficulty=2.5))
|
| 825 |
|
| 826 |
# Issue 10: Wrong category — Dijkstra labeled as design_patterns (difficulty 1.5)
|
|
@@ -969,10 +972,11 @@ def create_task_toolcalling(seed: int = 42) -> Task:
|
|
| 969 |
description="Empty description field for summarize_text function",
|
| 970 |
difficulty=1.0))
|
| 971 |
|
| 972 |
-
# Issue 8: Duplicate function — row 17 (get_user_info) duplicates row 11 (create_user)
|
| 973 |
-
# Change function_name to create_user (
|
| 974 |
r = 16
|
| 975 |
data[r][1] = "create_user"
|
|
|
|
| 976 |
issues.append(PlantedIssue(
|
| 977 |
row=r + 1, col="function_name", issue_type="duplicate_row",
|
| 978 |
description="Duplicate function_name 'create_user' β already defined in row 11",
|
|
|
|
| 612 |
difficulty=2.0))
|
| 613 |
|
| 614 |
# Issue 6: Near-duplicate prompt — rephrased but semantically identical to another row
|
| 615 |
+
# Also change the response to a rephrased gelatin answer to avoid a secondary prompt-response mismatch
|
| 616 |
r = 6 # id=7 (centurion duties)
|
| 617 |
data[r][1] = "What is gelatin and how is it produced?" # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
|
| 618 |
+
data[r][2] = "Gelatin is a protein derived from collagen found in the bones and skin of animals mainly cows and pigs. The production process involves boiling these animal parts to extract collagen which is then processed and dried into powder or sheets for use in food and industrial applications."
|
| 619 |
issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
|
| 620 |
description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' β wastes training compute and biases model",
|
| 621 |
difficulty=2.5))
|
|
|
|
| 642 |
difficulty=3.0))
|
| 643 |
|
| 644 |
# Issue 10: Helpfulness score contradicts response quality
|
| 645 |
+
# Row 17 about most destructive disaster — already terse (2 sentences), inflate helpfulness to 4
|
| 646 |
r = 16 # id=17
|
| 647 |
+
data[r][3] = "4" # helpfulness=4 but response is brief and lacks detail for a complex historical question
|
| 648 |
+
# Only change the helpfulness score β keep original response and correctness intact
|
| 649 |
+
# to avoid creating unplanted secondary issues
|
|
|
|
|
|
|
| 650 |
issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
|
| 651 |
+
description="Helpfulness score is 4 but response is only 2 short sentences with no context or analysis β score inflated",
|
| 652 |
difficulty=2.5))
|
| 653 |
|
| 654 |
# Issue 11: Whitespace-only prompt (data pipeline artifact)
|
|
|
|
| 722 |
- language: string, one of [python, javascript, sql, java, cpp, rust, go]
|
| 723 |
- difficulty: string, one of [easy, medium, hard]
|
| 724 |
- response: string, non-empty, contains code that solves the instruction
|
| 725 |
+
- test_cases: string, non-empty, contains assertions, test commands, or setup notes for testing
|
| 726 |
+
- is_correct: boolean (true/false), whether the response correctly solves the instruction (security vulnerabilities count as incorrect)
|
| 727 |
- category: string, one of [algorithms, data_structures, strings, web, databases, design_patterns]"""
|
| 728 |
|
| 729 |
rules = """1. No missing values in any column
|
|
|
|
| 735 |
7. response must be syntactically valid code (no truncation or syntax errors)
|
| 736 |
8. test_cases must be relevant to the instruction
|
| 737 |
9. No duplicate instructions (same problem stated differently counts as duplicate)
|
| 738 |
+
10. category must match the actual nature of the problem
|
| 739 |
+
11. response must not contain critical security vulnerabilities (e.g., eval on user input, SQL injection)"""
|
| 740 |
|
| 741 |
rows = _csv_to_rows(clean_csv)
|
| 742 |
header = rows[0]
|
|
|
|
| 815 |
description="Response uses eval() on user input β critical security vulnerability (code injection) but is_correct=true",
|
| 816 |
difficulty=3.0))
|
| 817 |
|
| 818 |
+
# Issue 9: Duplicate instruction — row 14 becomes a near-copy of row 7 (merge sort)
|
| 819 |
+
# Change both instruction AND response to make it a true duplicate (no instruction-response mismatch)
|
| 820 |
r = 13
|
| 821 |
data[r][1] = "Implement merge sort algorithm."
|
| 822 |
+
data[r][4] = data[6][4] # Copy merge sort response from row 7
|
| 823 |
+
data[r][5] = data[6][5] # Copy test cases too
|
| 824 |
issues.append(PlantedIssue(
|
| 825 |
row=r + 1, col="instruction", issue_type="duplicate_row",
|
| 826 |
+
description="Row 14 is a near-duplicate of row 7 (same merge sort instruction and code)",
|
| 827 |
difficulty=2.5))
|
| 828 |
|
| 829 |
# Issue 10: Wrong category — Dijkstra labeled as design_patterns (difficulty 1.5)
|
|
|
|
| 972 |
description="Empty description field for summarize_text function",
|
| 973 |
difficulty=1.0))
|
| 974 |
|
| 975 |
+
# Issue 8: Duplicate function — row 17 (get_user_info) duplicates row 11 (create_user)
|
| 976 |
+
# Change function_name AND example_call to create_user (avoid secondary mismatch)
|
| 977 |
r = 16
|
| 978 |
data[r][1] = "create_user"
|
| 979 |
+
data[r][6] = '{"function": "create_user", "arguments": {"username": "jdoe", "email": "jdoe@example.com", "role": "user"}}'
|
| 980 |
issues.append(PlantedIssue(
|
| 981 |
row=r + 1, col="function_name", issue_type="duplicate_row",
|
| 982 |
description="Duplicate function_name 'create_user' β already defined in row 11",
|