Spaces:
Sleeping
Sleeping
dolphinium
committed on
Commit
·
bdcb123
1
Parent(s):
d0ea80f
Update the field definitions in the metadata with clearer usage instructions, and update the prompt to improve field usage during facet generation.
Browse files
app.py
CHANGED
|
@@ -101,15 +101,15 @@ field_metadata = [
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"field_name": "company_name",
|
| 104 |
-
"type": "string (exact match)",
|
| 105 |
"example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
|
| 106 |
-
"definition": "The canonical, standardized name of a company.
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"field_name": "company_name_s",
|
| 110 |
"type": "string (multi-valued, for searching)",
|
| 111 |
"example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
|
| 112 |
-
"definition": "A field containing all known names
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"field_name": "territory_hq_s",
|
|
@@ -121,61 +121,61 @@ field_metadata = [
|
|
| 121 |
"field_name": "therapeutic_category",
|
| 122 |
"type": "string (specific)",
|
| 123 |
"example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
|
| 124 |
-
"definition": "The specific disease or therapeutic area being targeted
|
| 125 |
},
|
| 126 |
{
|
| 127 |
"field_name": "therapeutic_category_s",
|
| 128 |
"type": "string (multi-valued, for searching)",
|
| 129 |
"example_values": ["cancer", "oncology", "infections", "cns"],
|
| 130 |
-
"definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches**
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"field_name": "compound_name",
|
| 134 |
-
"type": "string (exact match)",
|
| 135 |
"example_values": ["opdivo injection solution", "keytruda injection solution"],
|
| 136 |
-
"definition": "The specific, full trade
|
| 137 |
},
|
| 138 |
{
|
| 139 |
"field_name": "compound_name_s",
|
| 140 |
"type": "string (multi-valued, for searching)",
|
| 141 |
"example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
|
| 142 |
-
"definition": "A field
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"field_name": "molecule_name",
|
| 146 |
-
"type": "string (exact match)",
|
| 147 |
"example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
|
| 148 |
-
"definition": "The generic, non-proprietary name of the active molecule.
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"field_name": "molecule_name_s",
|
| 152 |
"type": "string (multi-valued, for searching)",
|
| 153 |
"example_values": ["cbd", "s1-220", "a1002n5s"],
|
| 154 |
-
"definition": "A field
|
| 155 |
},
|
| 156 |
{
|
| 157 |
"field_name": "highest_phase",
|
| 158 |
"type": "string (categorical)",
|
| 159 |
"example_values": ["marketed", "phase 2", "phase 1"],
|
| 160 |
-
"definition": "The highest stage of development a drug has ever reached.
|
| 161 |
},
|
| 162 |
{
|
| 163 |
"field_name": "drug_delivery_branch_s",
|
| 164 |
"type": "string (multi-valued, for searching)",
|
| 165 |
"example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
|
| 166 |
-
"definition": "The method of drug administration
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"field_name": "drug_delivery_branch",
|
| 170 |
-
"type": "string (categorical, specific)",
|
| 171 |
"example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
|
| 172 |
-
"definition": "The most specific category of drug delivery technology
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"field_name": "route_branch",
|
| 176 |
"type": "string (categorical)",
|
| 177 |
"example_values": ["injection", "oral", "topical", "inhalation"],
|
| 178 |
-
"definition": "The
|
| 179 |
},
|
| 180 |
{
|
| 181 |
"field_name": "molecule_api_group",
|
|
@@ -193,13 +193,13 @@ field_metadata = [
|
|
| 193 |
"field_name": "date",
|
| 194 |
"type": "date",
|
| 195 |
"example_values": ["2020-10-22T00:00:00Z"],
|
| 196 |
-
"definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries
|
| 197 |
},
|
| 198 |
{
|
| 199 |
"field_name": "date_year",
|
| 200 |
"type": "number (year)",
|
| 201 |
"example_values": [2020, 2021, 2022],
|
| 202 |
-
"definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')
|
| 203 |
},
|
| 204 |
{
|
| 205 |
"field_name": "total_deal_value_in_million",
|
|
@@ -244,11 +244,14 @@ You are an expert Solr query engineer who converts natural language questions in
|
|
| 244 |
### CONTEXT & RULES
|
| 245 |
|
| 246 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
| 247 |
-
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
|
| 248 |
-
3. **
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
---
|
| 254 |
### FIELD DEFINITIONS (Your Source of Truth)
|
|
|
|
| 101 |
},
|
| 102 |
{
|
| 103 |
"field_name": "company_name",
|
| 104 |
+
"type": "string (exact match, for faceting)",
|
| 105 |
"example_values": ["pfizer inc.", "astrazeneca plc", "roche"],
|
| 106 |
+
"definition": "The canonical, standardized name of a company. **Crucially, you MUST use this field for `terms` faceting** to group results by a unique company. Do NOT use this for searching."
|
| 107 |
},
|
| 108 |
{
|
| 109 |
"field_name": "company_name_s",
|
| 110 |
"type": "string (multi-valued, for searching)",
|
| 111 |
"example_values": ["pfizer inc.", "roche", "f. hoffmann-la roche ag", "nih"],
|
| 112 |
+
"definition": "A field containing all known names and synonyms for a company. **You MUST use this field for all `query` parameter searches involving a company name** to ensure comprehensive results. Do NOT use for `terms` faceting."
|
| 113 |
},
|
| 114 |
{
|
| 115 |
"field_name": "territory_hq_s",
|
|
|
|
| 121 |
"field_name": "therapeutic_category",
|
| 122 |
"type": "string (specific)",
|
| 123 |
"example_values": ["cancer, other", "cancer, nsclc metastatic", "alzheimer's"],
|
| 124 |
+
"definition": "The specific disease or therapeutic area being targeted. Use for very specific disease queries."
|
| 125 |
},
|
| 126 |
{
|
| 127 |
"field_name": "therapeutic_category_s",
|
| 128 |
"type": "string (multi-valued, for searching)",
|
| 129 |
"example_values": ["cancer", "oncology", "infections", "cns"],
|
| 130 |
+
"definition": "Broader, multi-valued therapeutic categories and their synonyms. **Use this field for broad category searches** in the `query` parameter."
|
| 131 |
},
|
| 132 |
{
|
| 133 |
"field_name": "compound_name",
|
| 134 |
+
"type": "string (exact match, for faceting)",
|
| 135 |
"example_values": ["opdivo injection solution", "keytruda injection solution"],
|
| 136 |
+
"definition": "The specific, full trade name of a drug. **Use this field for `terms` faceting** on compounds."
|
| 137 |
},
|
| 138 |
{
|
| 139 |
"field_name": "compound_name_s",
|
| 140 |
"type": "string (multi-valued, for searching)",
|
| 141 |
"example_values": ["nivolumab injection solution", "opdivo injection solution", "ono-4538 injection solution"],
|
| 142 |
+
"definition": "A field with all known trade names and synonyms for a drug. **Use this field for all `query` parameter searches** involving a compound name."
|
| 143 |
},
|
| 144 |
{
|
| 145 |
"field_name": "molecule_name",
|
| 146 |
+
"type": "string (exact match, for faceting)",
|
| 147 |
"example_values": ["cannabidiol", "paclitaxel", "pembrolizumab"],
|
| 148 |
+
"definition": "The generic, non-proprietary name of the active molecule. **Use this field for `terms` faceting** on molecules."
|
| 149 |
},
|
| 150 |
{
|
| 151 |
"field_name": "molecule_name_s",
|
| 152 |
"type": "string (multi-valued, for searching)",
|
| 153 |
"example_values": ["cbd", "s1-220", "a1002n5s"],
|
| 154 |
+
"definition": "A field with all known generic names and synonyms for a molecule. **Use this field for all `query` parameter searches** involving a molecule name."
|
| 155 |
},
|
| 156 |
{
|
| 157 |
"field_name": "highest_phase",
|
| 158 |
"type": "string (categorical)",
|
| 159 |
"example_values": ["marketed", "phase 2", "phase 1"],
|
| 160 |
+
"definition": "The highest stage of development a drug has ever reached."
|
| 161 |
},
|
| 162 |
{
|
| 163 |
"field_name": "drug_delivery_branch_s",
|
| 164 |
"type": "string (multi-valued, for searching)",
|
| 165 |
"example_values": ["injection", "parenteral", "oral", "injection, other", "oral, other"],
|
| 166 |
+
"definition": "The method of drug administration. **Use this for `query` parameter searches about route of administration** as it contains broader, search-friendly terms."
|
| 167 |
},
|
| 168 |
{
|
| 169 |
"field_name": "drug_delivery_branch",
|
| 170 |
+
"type": "string (categorical, specific, for faceting)",
|
| 171 |
"example_values": ["injection, other", "prefilled syringes", "np liposome", "oral enteric/delayed release"],
|
| 172 |
+
"definition": "The most specific category of drug delivery technology. **Use this field for `terms` faceting** on specific delivery technologies."
|
| 173 |
},
|
| 174 |
{
|
| 175 |
"field_name": "route_branch",
|
| 176 |
"type": "string (categorical)",
|
| 177 |
"example_values": ["injection", "oral", "topical", "inhalation"],
|
| 178 |
+
"definition": "The primary route of drug administration. Good for faceting on exact routes."
|
| 179 |
},
|
| 180 |
{
|
| 181 |
"field_name": "molecule_api_group",
|
|
|
|
| 193 |
"field_name": "date",
|
| 194 |
"type": "date",
|
| 195 |
"example_values": ["2020-10-22T00:00:00Z"],
|
| 196 |
+
"definition": "The full publication date and time in ISO 8601 format. Use for precise date range queries."
|
| 197 |
},
|
| 198 |
{
|
| 199 |
"field_name": "date_year",
|
| 200 |
"type": "number (year)",
|
| 201 |
"example_values": [2020, 2021, 2022],
|
| 202 |
+
"definition": "The 4-digit year of publication. **Use this for queries involving whole years** (e.g., 'in 2023', 'last year', 'since 2020')."
|
| 203 |
},
|
| 204 |
{
|
| 205 |
"field_name": "total_deal_value_in_million",
|
|
|
|
| 244 |
### CONTEXT & RULES
|
| 245 |
|
| 246 |
1. **Today's Date for Calculations**: {datetime.datetime.now().date().strftime("%Y-%m-%d")}
|
| 247 |
+
2. **Field Usage**: You MUST use the fields described in the 'Field Definitions' section. Pay close attention to the definitions to select the correct field.
|
| 248 |
+
3. **Facet vs. Query Field Distinction**: This is critical.
|
| 249 |
+
* For searching in the main `query` parameter, ALWAYS use the multi-valued search fields (ending in `_s`, like `company_name_s`) to get comprehensive results.
|
| 250 |
+
* For grouping in a `terms` facet, ALWAYS use the canonical, single-value field (e.g., `company_name`, `molecule_name`) to ensure unique and accurate grouping.
|
| 251 |
+
4. **No `count(*)`**: Do NOT use functions like `count(*)`. The default facet bucket count is sufficient for counting documents.
|
| 252 |
+
5. **Allowed Aggregations**: For statistical facets (`stats` or `stat` type), only use these functions: `sum`, `avg`, `min`, `max`, `unique`. The primary metric field is `total_deal_value_in_million`.
|
| 253 |
+
6. **Term Facet Limits**: Every `terms` facet MUST include a `limit` key. Default to `limit: 10` unless the user specifies a different number of top results.
|
| 254 |
+
7. **Output Format**: Your final output must be a single, raw JSON object and nothing else.
|
| 255 |
|
| 256 |
---
|
| 257 |
### FIELD DEFINITIONS (Your Source of Truth)
|