varb15 committed on
Commit
cf05dbb
·
verified ·
1 Parent(s): 3ebb973

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. Dockerfile +1 -0
  2. README.md +1 -0
  3. dataqa_env/server/app.py +0 -21
  4. dataqa_env/server/tasks.py +19 -15
Dockerfile CHANGED
@@ -32,4 +32,5 @@ HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
32
 
33
  EXPOSE 8000
34
 
 
35
  CMD ["uvicorn", "dataqa_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
 
32
 
33
  EXPOSE 8000
34
 
35
+ ENV ENABLE_WEB_INTERFACE=true
36
  CMD ["uvicorn", "dataqa_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -8,6 +8,7 @@ pinned: false
8
  app_port: 8000
9
  tags:
10
  - openenv
 
11
  ---
12
 
13
  # DataQA Environment
 
8
  app_port: 8000
9
  tags:
10
  - openenv
11
+ base_path: /web
12
  ---
13
 
14
  # DataQA Environment
dataqa_env/server/app.py CHANGED
@@ -30,27 +30,6 @@ def root():
30
  }
31
 
32
 
33
- @app.get("/debug/{task_id}")
34
- def debug_task(task_id: str):
35
- """Debug endpoint β€” test task creation."""
36
- import traceback
37
- try:
38
- from .tasks import get_task
39
- except ImportError:
40
- from dataqa_env.server.tasks import get_task
41
- try:
42
- t = get_task(task_id)
43
- return {
44
- "status": "ok",
45
- "task_id": t.task_id,
46
- "name": t.name,
47
- "issues": len(t.planted_issues),
48
- "csv_lines": len(t.corrupted_csv.strip().splitlines()),
49
- }
50
- except Exception as e:
51
- return {"status": "error", "error": str(e), "traceback": traceback.format_exc()}
52
-
53
-
54
  def main():
55
  import uvicorn
56
  uvicorn.run(app, host="0.0.0.0", port=8000)
 
30
  }
31
 
32
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def main():
34
  import uvicorn
35
  uvicorn.run(app, host="0.0.0.0", port=8000)
dataqa_env/server/tasks.py CHANGED
@@ -612,8 +612,10 @@ def create_task_alignment(seed: int = 42) -> Task:
612
  difficulty=2.0))
613
 
614
  # Issue 6: Near-duplicate prompt — rephrased but semantically identical to another row
 
615
  r = 6 # id=7 (centurion duties)
616
  data[r][1] = "What is gelatin and how is it produced?" # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
 
617
  issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
618
  description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' — wastes training compute and biases model",
619
  difficulty=2.5))
@@ -640,15 +642,13 @@ def create_task_alignment(seed: int = 42) -> Task:
640
  difficulty=3.0))
641
 
642
  # Issue 10: Helpfulness score contradicts response quality
643
- # Row 17 about most destructive disaster — response is extremely terse for a complex question
644
  r = 16 # id=17
645
- data[r][3] = "4" # helpfulness=4 but response is just 2 sentences for a nuanced historical question
646
- data[r][4] = "4" # correctness=4 but the answer itself is debatable
647
- data[r][2] = "The 1556 Shaanxi earthquake."
648
- # This is arguably correct but gives no context, no detail — helpfulness=4 and correctness=4
649
- # for a 4-word answer to "most destructive natural disaster" is clearly inflated
650
  issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
651
- description="Helpfulness score is 4 but response is only 4 words ('The 1556 Shaanxi earthquake.') with no explanation — score inflated for an unhelpful response",
652
  difficulty=2.5))
653
 
654
  # Issue 11: Whitespace-only prompt (data pipeline artifact)
@@ -722,8 +722,8 @@ def create_task_coding(seed: int = 42) -> Task:
722
  - language: string, one of [python, javascript, sql, java, cpp, rust, go]
723
  - difficulty: string, one of [easy, medium, hard]
724
  - response: string, non-empty, contains code that solves the instruction
725
- - test_cases: string, non-empty, contains assertions or test descriptions
726
- - is_correct: boolean (true/false), whether the response correctly solves the instruction
727
  - category: string, one of [algorithms, data_structures, strings, web, databases, design_patterns]"""
728
 
729
  rules = """1. No missing values in any column
@@ -735,7 +735,8 @@ def create_task_coding(seed: int = 42) -> Task:
735
  7. response must be syntactically valid code (no truncation or syntax errors)
736
  8. test_cases must be relevant to the instruction
737
  9. No duplicate instructions (same problem stated differently counts as duplicate)
738
- 10. category must match the actual nature of the problem"""
 
739
 
740
  rows = _csv_to_rows(clean_csv)
741
  header = rows[0]
@@ -814,13 +815,15 @@ def create_task_coding(seed: int = 42) -> Task:
814
  description="Response uses eval() on user input — critical security vulnerability (code injection) but is_correct=true",
815
  difficulty=3.0))
816
 
817
- # Issue 9: Duplicate instruction — row 14 (quicksort) is semantically same as row 7 (merge sort)
818
- # Change instruction to match merge sort
819
  r = 13
820
  data[r][1] = "Implement merge sort algorithm."
 
 
821
  issues.append(PlantedIssue(
822
  row=r + 1, col="instruction", issue_type="duplicate_row",
823
- description="Instruction 'Implement merge sort algorithm' duplicates row 7 'Implement merge sort' (semantic duplicate)",
824
  difficulty=2.5))
825
 
826
  # Issue 10: Wrong category — Dijkstra labeled as design_patterns (difficulty 1.5)
@@ -969,10 +972,11 @@ def create_task_toolcalling(seed: int = 42) -> Task:
969
  description="Empty description field for summarize_text function",
970
  difficulty=1.0))
971
 
972
- # Issue 8: Duplicate function — row 17 (get_user_info) duplicates row 11 (create_user) in purpose
973
- # Change function_name to create_user (duplicate)
974
  r = 16
975
  data[r][1] = "create_user"
 
976
  issues.append(PlantedIssue(
977
  row=r + 1, col="function_name", issue_type="duplicate_row",
978
  description="Duplicate function_name 'create_user' — already defined in row 11",
 
612
  difficulty=2.0))
613
 
614
  # Issue 6: Near-duplicate prompt — rephrased but semantically identical to another row
615
+ # Also change the response to a rephrased gelatin answer to avoid a secondary prompt-response mismatch
616
  r = 6 # id=7 (centurion duties)
617
  data[r][1] = "What is gelatin and how is it produced?" # semantic duplicate of row 1 "What is gelatin made of and how is it made?"
618
+ data[r][2] = "Gelatin is a protein derived from collagen found in the bones and skin of animals mainly cows and pigs. The production process involves boiling these animal parts to extract collagen which is then processed and dried into powder or sheets for use in food and industrial applications."
619
  issues.append(PlantedIssue(row=r + 1, col="prompt", issue_type="duplicate_row",
620
  description="Prompt 'What is gelatin and how is it produced?' is a semantic duplicate of row 1 'What is gelatin made of and how is it made?' — wastes training compute and biases model",
621
  difficulty=2.5))
 
642
  difficulty=3.0))
643
 
644
  # Issue 10: Helpfulness score contradicts response quality
645
+ # Row 17 about most destructive disaster — already terse (2 sentences), inflate helpfulness to 4
646
  r = 16 # id=17
647
+ data[r][3] = "4" # helpfulness=4 but response is brief and lacks detail for a complex historical question
648
+ # Only change the helpfulness score — keep original response and correctness intact
649
+ # to avoid creating unplanted secondary issues
 
 
650
  issues.append(PlantedIssue(row=r + 1, col="helpfulness", issue_type="inconsistent_value",
651
+ description="Helpfulness score is 4 but response is only 2 short sentences with no context or analysis — score inflated",
652
  difficulty=2.5))
653
 
654
  # Issue 11: Whitespace-only prompt (data pipeline artifact)
 
722
  - language: string, one of [python, javascript, sql, java, cpp, rust, go]
723
  - difficulty: string, one of [easy, medium, hard]
724
  - response: string, non-empty, contains code that solves the instruction
725
+ - test_cases: string, non-empty, contains assertions, test commands, or setup notes for testing
726
+ - is_correct: boolean (true/false), whether the response correctly solves the instruction (security vulnerabilities count as incorrect)
727
  - category: string, one of [algorithms, data_structures, strings, web, databases, design_patterns]"""
728
 
729
  rules = """1. No missing values in any column
 
735
  7. response must be syntactically valid code (no truncation or syntax errors)
736
  8. test_cases must be relevant to the instruction
737
  9. No duplicate instructions (same problem stated differently counts as duplicate)
738
+ 10. category must match the actual nature of the problem
739
+ 11. response must not contain critical security vulnerabilities (e.g., eval on user input, SQL injection)"""
740
 
741
  rows = _csv_to_rows(clean_csv)
742
  header = rows[0]
 
815
  description="Response uses eval() on user input — critical security vulnerability (code injection) but is_correct=true",
816
  difficulty=3.0))
817
 
818
+ # Issue 9: Duplicate instruction — row 14 becomes a near-copy of row 7 (merge sort)
819
+ # Change both instruction AND response to make it a true duplicate (no instruction-response mismatch)
820
  r = 13
821
  data[r][1] = "Implement merge sort algorithm."
822
+ data[r][4] = data[6][4] # Copy merge sort response from row 7
823
+ data[r][5] = data[6][5] # Copy test cases too
824
  issues.append(PlantedIssue(
825
  row=r + 1, col="instruction", issue_type="duplicate_row",
826
+ description="Row 14 is a near-duplicate of row 7 (same merge sort instruction and code)",
827
  difficulty=2.5))
828
 
829
  # Issue 10: Wrong category — Dijkstra labeled as design_patterns (difficulty 1.5)
 
972
  description="Empty description field for summarize_text function",
973
  difficulty=1.0))
974
 
975
+ # Issue 8: Duplicate function — row 17 (get_user_info) duplicates row 11 (create_user)
976
+ # Change function_name AND example_call to create_user (avoid secondary mismatch)
977
  r = 16
978
  data[r][1] = "create_user"
979
+ data[r][6] = '{"function": "create_user", "arguments": {"username": "jdoe", "email": "jdoe@example.com", "role": "user"}}'
980
  issues.append(PlantedIssue(
981
  row=r + 1, col="function_name", issue_type="duplicate_row",
982
  description="Duplicate function_name 'create_user' — already defined in row 11",