dinhquangson committed on
Commit
f92376d
·
verified ·
1 Parent(s): 1d55f55

Create QueryMetadataExtractor.py

Browse files
Files changed (1) hide show
  1. QueryMetadataExtractor.py +52 -0
QueryMetadataExtractor.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import Dict, List
3
+
4
+ from haystack import Pipeline, component
5
+ from haystack.components.builders import PromptBuilder
6
+ from haystack.components.generators import OpenAIGenerator
7
+
8
@component()
class QueryMetadataExtractor:
    """Extract metadata values from a user query and turn them into filters.

    The component prompts an LLM with the query and the requested metadata
    fields, parses the JSON object found in the first reply, and converts each
    key/value pair into a ``meta.<key> == <value>`` condition. All conditions
    are combined with a logical ``AND``.
    """

    def __init__(self, model: str = "gpt-3.5-turbo"):
        """Build the internal prompt-builder -> LLM pipeline.

        Args:
            model: Name of the OpenAI model handed to ``OpenAIGenerator``.
                Defaults to the previously hard-coded ``"gpt-3.5-turbo"``.
        """
        prompt = """
        You are part of an information system that processes users queries.
        Given a user query you extract information from it that matches a given list of metadata fields.
        The information to be extracted from the query must match the semantics associated with the given metadata fields.
        The information that you extracted from the query will then be used as filters to narrow down the search space
        when querying an index.
        Just include the value of the extracted metadata without including the name of the metadata field.
        The extracted information in 'Extracted metadata' must be returned as a valid JSON structure.
        ###
        Example 1:
        Query: "What was the revenue of Nvidia in 2022?"
        Metadata fields: {"company", "year"}
        Extracted metadata fields: {"company": "nvidia", "year": 2022}
        ###
        Example 2:
        Query: "What were the most influential publications in 2023 regarding Alzheimer's disease?"
        Metadata fields: {"disease", "year"}
        Extracted metadata fields: {"disease": "Alzheimer", "year": 2023}
        ###
        Example 3:
        Query: "{{query}}"
        Metadata fields: "{{metadata_fields}}"
        Extracted metadata fields:
        """
        # Keep the init argument on the instance so it can be read back later.
        self.model = model
        self.pipeline = Pipeline()
        self.pipeline.add_component(name="builder", instance=PromptBuilder(prompt))
        self.pipeline.add_component(name="llm", instance=OpenAIGenerator(model=model))
        self.pipeline.connect("builder", "llm")

    @staticmethod
    def _parse_reply(reply: str) -> Dict:
        """Parse the LLM reply into a dict, tolerating a Markdown code fence.

        Args:
            reply: Raw text of the LLM reply.

        Returns:
            The JSON object contained in the reply.

        Raises:
            ValueError: If the reply is not valid JSON (``json.JSONDecodeError``
                is a ``ValueError`` subclass) or is not a JSON object.
        """
        text = reply.strip()
        if text.startswith("```") and text.endswith("```"):
            text = text[3:-3]
            # The opening fence may be followed by a language tag (e.g. "json")
            # on its own line; drop that line unless it already holds JSON.
            first_line, newline, remainder = text.partition("\n")
            if newline and not first_line.strip().startswith(("{", "[")):
                text = remainder

        metadata = json.loads(text)
        if not isinstance(metadata, dict):
            raise ValueError(
                f"Expected a JSON object with the extracted metadata, got: {reply!r}"
            )
        return metadata

    # NOTE(review): the returned filter dict also holds a list under
    # "conditions", so Dict[str, str] is narrower than the actual value. The
    # declared type is left unchanged so existing pipeline connections are not
    # affected -- confirm before widening it.
    @component.output_types(filters=Dict[str, str])
    def run(self, query: str, metadata_fields: List[str]):
        """Extract metadata from ``query`` and return it as a filter dict.

        Args:
            query: The user query to analyse.
            metadata_fields: Names of the metadata fields to look for.

        Returns:
            ``{"filters": {"operator": "AND", "conditions": [...]}}`` where each
            condition is ``{"field": "meta.<key>", "operator": "==", "value": <value>}``.

        Raises:
            ValueError: If the first LLM reply is not a valid JSON object.
        """
        result = self.pipeline.run({'builder': {'query': query, 'metadata_fields': metadata_fields}})
        metadata = self._parse_reply(result['llm']['replies'][0])

        # this can be done with specific data structures and in a more sophisticated way
        conditions = [
            {"field": f"meta.{key}", "operator": "==", "value": value}
            for key, value in metadata.items()
        ]
        return {"filters": {"operator": "AND", "conditions": conditions}}