Update src/streamlit_app.py
Browse files- src/streamlit_app.py +22 -14
src/streamlit_app.py
CHANGED
|
@@ -376,36 +376,44 @@ def evaluate_dataset_with_gpt(subtask: str, df: pd.DataFrame, client=openai_clie
|
|
| 376 |
column_info = {col: str(df[col].dtype) for col in selected_cols}
|
| 377 |
sample_rows = df.head(3)[selected_cols].to_dict(orient="records") # take 3 example rows
|
| 378 |
|
| 379 |
-
|
| 380 |
You are a data‑validation assistant. Decide whether the dataset below is useful for the research subtask.
|
| 381 |
|
| 382 |
===== TASK =====
|
| 383 |
-
Subtask: "{subtask}"
|
| 384 |
|
| 385 |
===== DATASET PREVIEW =====
|
| 386 |
-
Schema (first {len(selected_cols)} columns):
|
| 387 |
-
{json.dumps(column_info, indent=2)}
|
| 388 |
Sample rows (3 max):
|
| 389 |
-
{json.dumps(sample_rows, indent=2)}
|
| 390 |
|
| 391 |
===== OUTPUT INSTRUCTIONS (follow strictly) =====
|
| 392 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 393 |
• Write exactly two sentences, each no more than 30 words.
|
| 394 |
• Summarize what the dataset contains and why it helps the subtask.
|
| 395 |
• Do not mention column names or list individual rows.
|
| 396 |
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
•
|
|
|
|
|
|
|
| 401 |
- [Name of Source](URL)
|
| 402 |
• Then list 2–3 bullet points, each on its own line, starting with “- ” followed immediately by a URL likely to contain the needed data.
|
| 403 |
• No additional commentary.
|
| 404 |
|
| 405 |
-
|
| 406 |
-
|
| 407 |
-
General rules:
|
| 408 |
-
Plain text only — no code fences. Markdown link syntax (`[text](url)`) is allowed.
|
| 409 |
"""
|
| 410 |
|
| 411 |
rsp = client.chat.completions.create(
|
|
|
|
| 376 |
column_info = {col: str(df[col].dtype) for col in selected_cols}
|
| 377 |
sample_rows = df.head(3)[selected_cols].to_dict(orient="records") # take 3 example rows
|
| 378 |
|
| 379 |
+
prompt = f"""
|
| 380 |
You are a data‑validation assistant. Decide whether the dataset below is useful for the research subtask.
|
| 381 |
|
| 382 |
===== TASK =====
|
| 383 |
+
Subtask: "{subtask}"
|
| 384 |
|
| 385 |
===== DATASET PREVIEW =====
|
| 386 |
+
Schema (first {len(selected_cols)} columns):
|
| 387 |
+
{json.dumps(column_info, indent=2)}
|
| 388 |
Sample rows (3 max):
|
| 389 |
+
{json.dumps(sample_rows, indent=2)}
|
| 390 |
|
| 391 |
===== OUTPUT INSTRUCTIONS (follow strictly) =====
|
| 392 |
+
First, begin your response with one of the following labels on its own line:
|
| 393 |
+
Relevant
|
| 394 |
+
or
|
| 395 |
+
Irrelevant
|
| 396 |
+
|
| 397 |
+
Then follow the appropriate instruction below based on your decision.
|
| 398 |
+
|
| 399 |
+
---
|
| 400 |
+
|
| 401 |
+
If you choose "Relevant":
|
| 402 |
• Write exactly two sentences, each no more than 30 words.
|
| 403 |
• Summarize what the dataset contains and why it helps the subtask.
|
| 404 |
• Do not mention column names or list individual rows.
|
| 405 |
|
| 406 |
+
---
|
| 407 |
+
|
| 408 |
+
If you choose "Irrelevant":
|
| 409 |
+
• Write one or two sentences, each no more than 30 words, describing only what the dataset contains.
|
| 410 |
+
• Do NOT mention the subtask, relevance, suitability, limitations, or missing information (avoid phrases like “not related,” “does not focus,” “irrelevant,” etc.).
|
| 411 |
+
• After the sentences, output the header **Additionally, here are some external resources you might find helpful:** on a new line. Format your output in markdown as:
|
| 412 |
- [Name of Source](URL)
|
| 413 |
• Then list 2–3 bullet points, each on its own line, starting with “- ” followed immediately by a markdown link, in the format shown above, to a source likely to contain the needed data.
|
| 414 |
• No additional commentary.
|
| 415 |
|
| 416 |
+
Plain text only — no code fences. Markdown link syntax (`[text](url)`) is allowed.
|
|
|
|
|
|
|
|
|
|
| 417 |
"""
|
| 418 |
|
| 419 |
rsp = client.chat.completions.create(
|