Spaces:
Sleeping
Sleeping
Create QueryMetadataExtractor.py
Browse files- QueryMetadataExtractor.py +52 -0
QueryMetadataExtractor.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from typing import Any, Dict, List
|
| 3 |
+
|
| 4 |
+
from haystack import Pipeline, component
|
| 5 |
+
from haystack.components.builders import PromptBuilder
|
| 6 |
+
from haystack.components.generators import OpenAIGenerator
|
| 7 |
+
|
| 8 |
+
@component()
class QueryMetadataExtractor:
    """Extract metadata values from a user query and express them as filters.

    The component wraps a small internal pipeline: a ``PromptBuilder`` renders
    a few-shot prompt from the query and the requested metadata field names,
    and an ``OpenAIGenerator`` (``gpt-3.5-turbo``) answers it. The first reply
    is parsed as JSON and every key/value pair becomes an equality condition
    on ``meta.<key>``; all conditions are combined under one ``AND`` operator.
    """

    def __init__(self):
        """Build the internal ``builder -> llm`` pipeline."""
        # The template text is written flush-left so that no source-code
        # indentation ends up inside the prompt sent to the model.
        prompt = """
You are part of an information system that processes users queries.
Given a user query you extract information from it that matches a given list of metadata fields.
The information to be extracted from the query must match the semantics associated with the given metadata fields.
The information that you extracted from the query will then be used as filters to narrow down the search space
when querying an index.
Just include the value of the extracted metadata without including the name of the metadata field.
The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
###
Example 1:
Query: "What was the revenue of Nvidia in 2022?"
Metadata fields: {"company", "year"}
Extracted metadata fields: {"company": "nvidia", "year": 2022}
###
Example 2:
Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?"
Metadata fields: {"disease", "year"}
Extracted metadata fields: {"disease": "Alzheimer", "year": 2023}
###
Example 3:
Query: "{{query}}"
Metadata fields: "{{metadata_fields}}"
Extracted metadata fields:
"""
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=OpenAIGenerator(model="gpt-3.5-turbo"))
        self.pipeline.connect("builder", "llm")

    # The returned "filters" value is a nested structure (a string operator
    # plus a list of condition dicts), so its values are not plain strings.
    @component.output_types(filters=Dict[str, Any])
    def run(self, query: str, metadata_fields: List[str]):
        """Ask the LLM for metadata found in ``query`` and build filters.

        Args:
            query: The user query to extract metadata from.
            metadata_fields: Names of the metadata fields to look for; they
                are rendered into the prompt as-is.

        Returns:
            ``{"filters": {"operator": "AND", "conditions": [...]}}`` where
            each condition is
            ``{"field": "meta.<key>", "operator": "==", "value": <value>}``
            for every key/value pair in the JSON object the LLM replied with.

        Raises:
            json.JSONDecodeError: If the first LLM reply is not valid JSON.
            ValueError: If the reply is valid JSON but not a JSON object.
        """
        result = self.pipeline.run({'builder': {'query': query, 'metadata_fields': metadata_fields}})
        metadata = json.loads(result['llm']['replies'][0])

        # The prompt asks for a JSON object; a list or scalar reply cannot be
        # mapped to field/value conditions, so fail with a clear message
        # instead of an AttributeError on ``.items()``.
        if not isinstance(metadata, dict):
            raise ValueError(
                f"Expected a JSON object from the LLM, got {type(metadata).__name__}: {metadata!r}"
            )

        # this can be done with specific data structures and in a more sophisticated way
        conditions = [
            {"field": f"meta.{key}", "operator": "==", "value": value}
            for key, value in metadata.items()
        ]

        return {"filters": {"operator": "AND", "conditions": conditions}}
|