AegionX committed on
Commit
c36f648
·
verified ·
1 Parent(s): b583310

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -0
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """This module contains utility functions for the project"""
2
+
3
+ import mmh3
4
+ from haystack import Document
5
+
6
+
7
def get_unique_docs(dataset, unique_docs: set, *, source: str = "QASports") -> list:
    """Build haystack Documents for contexts that have not been seen yet.

    Each record is kept only when its ``"context"`` is not None and its
    ``"context_id"`` is not already present in ``unique_docs``.

    Args:
        dataset: iterable of dictionaries with the keys ``"context"``,
            ``"context_id"``, ``"context_title"`` and ``"url"``.
        unique_docs: set of context ids that were already collected. It is
            updated in place with every id added by this call, so the same
            set can be passed to successive calls to de-duplicate across
            several datasets.
        source: value stored under the ``"source"`` key of each document's
            metadata. Defaults to ``"QASports"``.

    Returns:
        docs: list of haystack.Document, one per newly seen context id, in
            the order the records were encountered.
    """
    docs = []
    for record in dataset:
        context = record["context"]
        if context is None:
            # Records without a context carry nothing to index.
            continue

        context_id = record["context_id"]
        if context_id in unique_docs:
            continue

        # Register the id before building the document so later duplicates
        # (in this call or a following one) are skipped.
        unique_docs.add(context_id)
        docs.append(
            Document(
                content=context,
                meta={
                    "title": record["context_title"],
                    "context_id": context_id,
                    "url": record["url"],
                    "source": source,
                },
            )
        )
    return docs