Naman Gupta committed on
Commit
8d70360
·
unverified ·
2 Parent(s): 6b72bd2 3fc0eba

Merge pull request #7 from subhdotsol/fix

Browse files
Files changed (5) hide show
  1. .env.example +0 -36
  2. .gitignore +1 -0
  3. inference.py +40 -11
  4. llm/pipeline.py +4 -4
  5. tests/test_llm.py +2 -1
.env.example DELETED
@@ -1,36 +0,0 @@
1
- # Copy this file to .env and fill in your values.
2
- # Never commit .env to git — it's already in .gitignore.
3
-
4
- # ------------------------------------------------------------------
5
- # Groq (required — Person 3's LLM pipeline uses this)
6
- # Get your key at: https://console.groq.com → API Keys
7
- # ------------------------------------------------------------------
8
- GROQ_API_KEY=gsk_your_key_here
9
-
10
- # Which Groq model to use.
11
- # Fast + free options: llama-3.1-8b-instant, mixtral-8x7b-32768
12
- # Smarter but slower: llama-3.3-70b-versatile
13
- MODEL_NAME=llama-3.1-8b-instant
14
-
15
- # ------------------------------------------------------------------
16
- # Server settings
17
- # ------------------------------------------------------------------
18
-
19
- # Maximum number of attack turns per episode
20
- MAX_TURNS=10
21
-
22
- # Set to true to enable FastAPI debug mode and verbose logging
23
- DEBUG=false
24
-
25
- # How long to wait for a single Groq API call (seconds)
26
- LLM_TIMEOUT=30
27
-
28
- # How many times to retry a failed Groq call before giving up
29
- LLM_MAX_RETRIES=3
30
-
31
- # ------------------------------------------------------------------
32
- # HuggingFace (only needed if deploying to HF Spaces)
33
- # The inference.py attacker script uses this to call the HF API
34
- # ------------------------------------------------------------------
35
- HF_TOKEN=hf_your_token_here
36
- API_BASE_URL=https://api-inference.huggingface.co/models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -2,6 +2,7 @@ __pycache__/
2
  *.py[cod]
3
  *$py.class
4
  .venv/
 
5
  .env
6
  .pytest_cache/
7
  *.swp
 
2
  *.py[cod]
3
  *$py.class
4
  .venv/
5
+ venv
6
  .env
7
  .pytest_cache/
8
  *.swp
inference.py CHANGED
@@ -2,18 +2,23 @@ import os
2
  import asyncio
3
  import logging
4
  from openai import OpenAI
 
 
 
5
 
6
  logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger(__name__)
8
 
9
- API_BASE_URL = os.environ.get("API_BASE_URL", "https://rayugacodes-breach-os.hf.space")
10
- MODEL_NAME = os.environ.get("MODEL_NAME", "mistralai/Mistral-7B-Instruct-v0.3")
11
- HF_TOKEN = os.environ.get("HF_TOKEN", "")
12
 
 
13
  client = OpenAI(
14
- base_url = f"https://api-inference.huggingface.co/models/{MODEL_NAME}/v1",
15
- api_key = HF_TOKEN,
16
  )
 
 
17
 
18
  def generate_attack(defender_response: str, turn: int, previous_success: float) -> dict:
19
  strategies = [
@@ -42,7 +47,14 @@ def generate_attack(defender_response: str, turn: int, previous_success: float)
42
 
43
  async def run_episode(task: str = "easy") -> dict:
44
  import httpx
45
- async with httpx.AsyncClient(base_url=API_BASE_URL, timeout=60.0) as http:
 
 
 
 
 
 
 
46
  resp = await http.post("/reset")
47
  reset_data = resp.json()
48
  defender_resp = reset_data["observation"]["defender_response"]
@@ -52,27 +64,44 @@ async def run_episode(task: str = "easy") -> dict:
52
  while True:
53
  turn += 1
54
  action = generate_attack(defender_resp, turn, prev_success)
 
 
 
 
55
  resp = await http.post("/step", json=action)
56
  step_data = resp.json()
57
  obs = step_data["observation"]
 
58
  defender_resp = obs["defender_response"]
59
  prev_success = obs["attack_success_estimate"]
60
- if obs["episode_done"]: break
 
 
 
 
 
 
61
 
62
  grade_resp = await http.post("/grade")
63
- return {"task": task, "turns": turn, "grade": grade_resp.json()}
 
 
 
 
 
 
64
 
65
  async def main():
66
  import time
67
  start = time.time()
68
  for task in ["easy", "medium", "hard"]:
69
- logger.info(f"Running {task}...")
70
  try:
71
  await run_episode(task)
72
  except Exception as e:
73
  logger.error(f"Failed {task}: {e}")
74
- if time.time() - start > 1200:
75
- logger.warning("Exceeded 20 mins!")
 
76
 
77
  if __name__ == "__main__":
78
  asyncio.run(main())
 
2
  import asyncio
3
  import logging
4
  from openai import OpenAI
5
+ from dotenv import load_dotenv
6
+
7
+ load_dotenv()
8
 
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
+ # Backend server
13
+ SERVER_URL = "https://rayugacodes-breach-os.hf.space"
 
14
 
15
+ # Attacker LLM (Configured to Groq)
16
  client = OpenAI(
17
+ base_url = "https://api.groq.com/openai/v1",
18
+ api_key = os.environ.get("GROQ_API_KEY"),
19
  )
20
+ MODEL_NAME = "llama-3.1-8b-instant"
21
+
22
 
23
  def generate_attack(defender_response: str, turn: int, previous_success: float) -> dict:
24
  strategies = [
 
47
 
48
  async def run_episode(task: str = "easy") -> dict:
49
  import httpx
50
+ import json
51
+
52
+ print(f"\n{'-'*60}\nSTARTING EPISODE: [{task.upper()} TASK]\n{'-'*60}")
53
+
54
+ async with httpx.AsyncClient(base_url=SERVER_URL, timeout=60.0) as http:
55
+ # Hide httpx logs to keep it clean
56
+ logging.getLogger("httpx").setLevel(logging.WARNING)
57
+
58
  resp = await http.post("/reset")
59
  reset_data = resp.json()
60
  defender_resp = reset_data["observation"]["defender_response"]
 
64
  while True:
65
  turn += 1
66
  action = generate_attack(defender_resp, turn, prev_success)
67
+
68
+ print(f"\n[ TURN {turn} ] Strategy: {action['strategy_type']} | Intensity: {action['intensity']:.2f}")
69
+ print(f"ATTACKER: {action['framing']}\n")
70
+
71
  resp = await http.post("/step", json=action)
72
  step_data = resp.json()
73
  obs = step_data["observation"]
74
+
75
  defender_resp = obs["defender_response"]
76
  prev_success = obs["attack_success_estimate"]
77
+
78
+ print(f"DEFENDER: {defender_resp}\n")
79
+ print(f"[ METRICS ] Attack Success: {prev_success:.2f} | Defense Quality: {obs.get('defense_score', 0):.2f}")
80
+
81
+ if obs["episode_done"]:
82
+ print(f"\n*** EPISODE TERMINATED (Turn {turn}) ***\n")
83
+ break
84
 
85
  grade_resp = await http.post("/grade")
86
+
87
+ grade_data = grade_resp.json()
88
+ print(f"FINAL EPISODE GRADE ({task.upper()}):")
89
+ print(json.dumps(grade_data, indent=2))
90
+ print(f"{'-'*60}\n")
91
+
92
+ return {"task": task, "turns": turn, "grade": grade_data}
93
 
94
  async def main():
95
  import time
96
  start = time.time()
97
  for task in ["easy", "medium", "hard"]:
 
98
  try:
99
  await run_episode(task)
100
  except Exception as e:
101
  logger.error(f"Failed {task}: {e}")
102
+
103
+ total_time = time.time() - start
104
+ print(f"\nFULL RUN COMPLETED IN {total_time:.1f} SECONDS.")
105
 
106
  if __name__ == "__main__":
107
  asyncio.run(main())
llm/pipeline.py CHANGED
@@ -123,10 +123,10 @@ def grade_episode_with_llm(history: list[dict]) -> dict:
123
  logger.debug(f"Episode grader output:\n{raw_output}")
124
 
125
  # Parse each labeled score; use fallback for any that didn't parse
126
- scores = {
127
- dim: _extract_labeled_score(raw_output, dim) or fallback_scores[dim]
128
- for dim in fallback_scores
129
- }
130
 
131
  except Exception as error:
132
  logger.warning(f"Episode grader unavailable: {error} — using fallback scores")
 
123
  logger.debug(f"Episode grader output:\n{raw_output}")
124
 
125
  # Parse each labeled score; use fallback for any that didn't parse
126
+ scores = {}
127
+ for dim in fallback_scores:
128
+ val = _extract_labeled_score(raw_output, dim)
129
+ scores[dim] = val if val is not None else fallback_scores[dim]
130
 
131
  except Exception as error:
132
  logger.warning(f"Episode grader unavailable: {error} — using fallback scores")
tests/test_llm.py CHANGED
@@ -322,7 +322,8 @@ class TestGradeEpisodeWithLlm:
322
  assert result["robustness"] == 0.8
323
  assert result["clarity"] == 0.85
324
  assert result["helpfulness"] == 0.6
325
- assert result["overall"] == round((0.9 + 0.8 + 0.85 + 0.6) / 4, 3)
 
326
 
327
  def test_falls_back_to_defaults_on_api_error(self):
328
  from llm.pipeline import grade_episode_with_llm
 
322
  assert result["robustness"] == 0.8
323
  assert result["clarity"] == 0.85
324
  assert result["helpfulness"] == 0.6
325
+ expected_overall = round(sum([0.9, 0.8, 0.85, 0.6]) / 4, 3)
326
+ assert result["overall"] == expected_overall
327
 
328
  def test_falls_back_to_defaults_on_api_error(self):
329
  from llm.pipeline import grade_episode_with_llm