dmpantiu committed on
Commit
09f0030
·
verified ·
1 Parent(s): 4fbe36e

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. RESPONSES.TXT +0 -0
  2. scripts/qa_image_review.py +38 -16
  3. src/eurus/config.py +1 -1
RESPONSES.TXT ADDED
File without changes
scripts/qa_image_review.py CHANGED
@@ -105,21 +105,43 @@ QA_QUERIES = {
105
 
106
 
107
  REVIEW_SYSTEM_PROMPT = """\
108
- You are a senior scientific visualization reviewer for a climate/weather data agent.
109
  You will receive one or more PNG plots generated by an AI agent and the TASK that the agent was asked to complete.
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  Review each plot against the task and provide a structured assessment:
112
 
113
- 1. **Task Compliance** (1-10): Does the plot address what was asked?
114
- 2. **Scientific Accuracy** (1-10): Are axes labeled, units correct, colorbar present, projections reasonable?
115
- 3. **Visual Quality** (1-10): Is the plot publication-quality? Good resolution, readable labels, professional aesthetics?
116
- 4. **Spatial/Map Quality** (1-10): If it's a map — does it have coastlines, proper projection, geographic labels? If not a map, rate the chart type appropriateness.
117
- 5. **Overall Score** (1-10): Weighted average considering all factors.
 
 
 
 
118
 
119
  Also provide:
120
- - **Summary**: 1-2 sentence summary of what the plot shows.
121
- - **Strengths**: Key things done well.
122
- - **Issues**: Any problems, missing elements, or improvements needed.
 
 
123
 
124
  Respond ONLY in valid JSON with this exact structure:
125
  {
@@ -130,7 +152,7 @@ Respond ONLY in valid JSON with this exact structure:
130
  "overall_score": <int>,
131
  "summary": "<string>",
132
  "strengths": ["<string>", ...],
133
- "issues": ["<string>", ...]
134
  }
135
  """
136
 
@@ -162,7 +184,7 @@ def review_single_question(client: genai.Client, qid: int, task: str,
162
  img_bytes = f.read()
163
  parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))
164
 
165
- for attempt in range(4):
166
  try:
167
  response = client.models.generate_content(
168
  model=model,
@@ -170,7 +192,7 @@ def review_single_question(client: genai.Client, qid: int, task: str,
170
  config=types.GenerateContentConfig(
171
  system_instruction=REVIEW_SYSTEM_PROMPT,
172
  temperature=0.2,
173
- max_output_tokens=1000,
174
  ),
175
  )
176
  raw = response.text.strip()
@@ -194,12 +216,12 @@ def review_single_question(client: genai.Client, qid: int, task: str,
194
  except Exception as e:
195
  err_str = str(e)
196
  if "429" in err_str or "RESOURCE_EXHAUSTED" in err_str:
197
- wait = min(2 ** attempt * 5, 60)
198
- print(f"\n Rate limited, waiting {wait}s (attempt {attempt+1}/4)...", end="", flush=True)
199
  time.sleep(wait)
200
  else:
201
- if attempt < 3:
202
- time.sleep(2)
203
  continue
204
  return {"error": str(e)[:300]}
205
 
 
105
 
106
 
107
  REVIEW_SYSTEM_PROMPT = """\
108
+ You are a RUTHLESS, METICULOUS senior scientific visualization reviewer for a climate/weather data agent.
109
  You will receive one or more PNG plots generated by an AI agent and the TASK that the agent was asked to complete.
110
 
111
+ YOUR #1 JOB: For EVERY issue you find, describe it with EXACT SPECIFICITY.
112
+ Do NOT say "labels are unclear" — say EXACTLY which label, where it is, and what is wrong with it.
113
+ Do NOT say "colorbar could be better" — say EXACTLY what the colorbar shows, what it should show, and what the specific problem is.
114
+ Do NOT give vague feedback. Every single issue MUST pinpoint the EXACT location and EXACT problem in the figure.
115
+
116
+ CRITICAL: Be EXTREMELY SPECIFIC about problems. Point to EXACT elements:
117
+ - "The y-axis label says 'Value' but should say 'Temperature (°C)'"
118
+ - "The colorbar range is 270-310K but should be converted to °C for readability"
119
+ - "Coastlines are missing from the spatial map — there is no land/ocean boundary visible"
120
+ - "The title says 'January 2024' but the x-axis data only covers December 2023"
121
+ - "The legend overlaps with the data in the upper-right quadrant, obscuring the January peak"
122
+ - "Wind vectors are plotted but have no reference arrow showing the scale"
123
+ - "The projection is PlateCarree but should be a polar stereographic for Arctic data above 70°N"
124
+
125
+ For EACH problem: describe WHERE in the figure it is, WHAT exactly is wrong, and WHAT it should be instead.
126
+
127
  Review each plot against the task and provide a structured assessment:
128
 
129
+ 1. **Task Compliance** (1-10): Does the plot address EXACTLY what was asked? Check every single requirement in the task description. If the task says "two-panel" and there's only one panel, that is a major failure. If the task says "vs" comparison and only one dataset is shown, that is a failure. Be strict.
130
+
131
+ 2. **Scientific Accuracy** (1-10): Are ALL axes labeled with correct units? Is the colorbar present with proper units and range? Are values physically reasonable (e.g., SST not showing 0K)? Are projections appropriate for the region? Check EVERY axis, EVERY label, EVERY unit.
132
+
133
+ 3. **Visual Quality** (1-10): Is it publication-quality? Check: font sizes readable? Labels not overlapping data? Grid lines appropriate? Color scheme suitable (e.g., diverging for anomalies, sequential for absolute values)? Title descriptive and correct?
134
+
135
+ 4. **Spatial/Map Quality** (1-10): For maps — are coastlines drawn? Is the projection correct for the region? Are lat/lon gridlines present? Are geographic features identifiable? For non-maps — is the chart type appropriate?
136
+
137
+ 5. **Overall Score** (1-10): Weighted average. Be HARSH — a score of 8+ means near-perfect.
138
 
139
  Also provide:
140
+ - **Summary**: 1-2 sentence factual summary of what the plot actually shows.
141
+ - **Strengths**: Specific things done well. Be precise — not "good colors" but "diverging RdBu colormap correctly centered at zero for anomaly data".
142
+ - **Issues**: LIST EVERY SINGLE PROBLEM. Each issue MUST describe the EXACT element, its EXACT location in the figure, WHAT is wrong, and WHAT it should be. DO NOT BE VAGUE. This is the MOST IMPORTANT part of your review. Be exhaustive. Miss nothing.
143
+
144
+ I REPEAT: The "issues" field is the MOST CRITICAL part. Every issue must be SPECIFIC and ACTIONABLE. Generic feedback like "could be improved" is UNACCEPTABLE. Say EXACTLY what needs to change and WHERE.
145
 
146
  Respond ONLY in valid JSON with this exact structure:
147
  {
 
152
  "overall_score": <int>,
153
  "summary": "<string>",
154
  "strengths": ["<string>", ...],
155
+ "issues": ["<string — MUST be specific and exact, describing WHERE and WHAT>", ...]
156
  }
157
  """
158
 
 
184
  img_bytes = f.read()
185
  parts.append(types.Part.from_bytes(data=img_bytes, mime_type="image/png"))
186
 
187
+ for attempt in range(6):
188
  try:
189
  response = client.models.generate_content(
190
  model=model,
 
192
  config=types.GenerateContentConfig(
193
  system_instruction=REVIEW_SYSTEM_PROMPT,
194
  temperature=0.2,
195
+ max_output_tokens=4096,
196
  ),
197
  )
198
  raw = response.text.strip()
 
216
  except Exception as e:
217
  err_str = str(e)
218
  if "429" in err_str or "RESOURCE_EXHAUSTED" in err_str:
219
+ wait = min(2 ** attempt * 15, 120)
220
+ print(f"\n Rate limited, waiting {wait}s (attempt {attempt+1}/6)...", end="", flush=True)
221
  time.sleep(wait)
222
  else:
223
+ if attempt < 5:
224
+ time.sleep(3)
225
  continue
226
  return {"error": str(e)[:300]}
227
 
src/eurus/config.py CHANGED
@@ -506,7 +506,7 @@ class AgentConfig:
506
  # Data Settings
507
  data_source: str = "earthmover-public/era5-surface-aws"
508
  default_query_type: str = "temporal"
509
- max_download_size_gb: float = 5.0
510
 
511
  # Retrieval Settings
512
  max_retries: int = 5
 
506
  # Data Settings
507
  data_source: str = "earthmover-public/era5-surface-aws"
508
  default_query_type: str = "temporal"
509
+ max_download_size_gb: float = 15.0
510
 
511
  # Retrieval Settings
512
  max_retries: int = 5