Update src/streamlit_app.py
Browse files - src/streamlit_app.py +32 -24
src/streamlit_app.py
CHANGED
|
@@ -379,40 +379,48 @@ def evaluate_dataset_with_gpt(subtask: str, df: pd.DataFrame, client=openai_clie
|
|
| 379 |
sample_rows = df.head(3)[selected_cols].to_dict(orient="records") # take 3 example rows
|
| 380 |
|
| 381 |
prompt = f"""
|
| 382 |
-
You are a data
|
| 383 |
|
| 384 |
-
===== TASK =====
|
| 385 |
Subtask: "{{subtask}}"
|
| 386 |
|
| 387 |
-
===== DATASET PREVIEW =====
|
| 388 |
-
Schema
|
| 389 |
-
{{
|
| 390 |
-
Sample rows (20 max):
|
| 391 |
-
{{json.dumps(sample_rows, indent=20)}}
|
| 392 |
|
| 393 |
===== OUTPUT INSTRUCTIONS (follow strictly) =====
|
| 394 |
-
First, begin your response with one of the following labels on its own line:
|
| 395 |
-
Relevant
|
| 396 |
-
or
|
| 397 |
-
Irrelevant
|
| 398 |
|
| 399 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
|
| 401 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
|
| 403 |
-
|
| 404 |
-
• Write exactly two sentences, each no more than 30 words.
|
| 405 |
-
• Summarize what the dataset contains and why it helps the subtask.
|
| 406 |
-
• Do not mention column names or list individual rows.
|
| 407 |
|
| 408 |
-
|
|
|
|
|
|
|
|
|
|
| 409 |
|
| 410 |
-
|
| 411 |
-
•
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
"""
|
| 417 |
|
| 418 |
rsp = client.chat.completions.create(
|
|
|
|
| 379 |
sample_rows = df.head(3)[selected_cols].to_dict(orient="records") # take 3 example rows
|
| 380 |
|
| 381 |
prompt = f"""
|
| 382 |
+
You are a data-validation assistant. Your job is to determine whether the dataset below is useful for the research subtask.
|
| 383 |
|
| 384 |
+
===== TASK =====
|
| 385 |
Subtask: "{{subtask}}"
|
| 386 |
|
| 387 |
+
===== DATASET PREVIEW =====
|
| 388 |
+
Schema: {{schema}}
|
| 389 |
+
Example Rows: {{example_rows}}
|
|
|
|
|
|
|
| 390 |
|
| 391 |
===== OUTPUT INSTRUCTIONS (follow strictly) =====
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
|
| 393 |
+
Case A – Relevant:
|
| 394 |
+
• Write exactly two sentences, each no more than 30 words.
|
| 395 |
+
• Sentence 1: summarize what the dataset contains.
|
| 396 |
+
• Sentence 2: explain why it helps answer the subtask.
|
| 397 |
+
• Do not mention specific column names or list individual rows.
|
| 398 |
+
• Do NOT generate any additional explanation or markdown formatting.
|
| 399 |
|
| 400 |
+
Case B – Not Relevant:
|
| 401 |
+
• Write one or two sentences, each no more than 30 words, describing **only what the dataset contains**.
|
| 402 |
+
• Do **NOT** mention the subtask, usefulness, relevance, or missing information.
|
| 403 |
+
• Do **NOT** use words like “irrelevant,” “not related,” “not useful,” “not sufficient,” etc.
|
| 404 |
+
• After the sentence(s), output the exact header:
|
| 405 |
|
| 406 |
+
**Additionally, here are some external resources you might find helpful:**
|
|
|
|
|
|
|
|
|
|
| 407 |
|
| 408 |
+
• Then output 2–3 **real** resources in Markdown link format. Each must:
|
| 409 |
+
- Have a **real source name** (e.g., “MatWeb”, not “Name of Source”)
|
| 410 |
+
- Contain a **real, working URL** to a page or dataset related to the subtask
|
| 411 |
+
- Be formatted exactly like: `- [Source Name](https://example.com)`
|
| 412 |
|
| 413 |
+
• Do NOT use placeholder text like “Name of Source” or “URL”.
|
| 414 |
+
• Do NOT generate any commentary after the list.
|
| 415 |
+
|
| 416 |
+
Example for Case B:
|
| 417 |
+
|
| 418 |
+
The dataset contains technical specifications of commercial vehicles, such as engine types and dimensions.
|
| 419 |
+
|
| 420 |
+
**Additionally, here are some external resources you might find helpful:**
|
| 421 |
+
- [Polymer Property Database](https://polymerdatabase.com/)
|
| 422 |
+
- [MatWeb Materials Data Sheets](https://www.matweb.com/)
|
| 423 |
+
- [NIST Thermophysical Properties of Polymers](https://www.nist.gov/srd/nist-standard-reference-database-147)
|
| 424 |
"""
|
| 425 |
|
| 426 |
rsp = client.chat.completions.create(
|