Response-Quality-Assessment / src /data_processor /hotpot_qa_processor.py
Ryoya Awano
deploy: fix MedLFQA Marginal mode sample matching
19fc84f
import json
from src.rag.retrieval import DocDB
from src.utils.string_utils import extract_tag_content
from src.data_processor.raw_data_processor import DatasetProcessor
class HotpotQAProcessor(DatasetProcessor):
    """Dataset processor for HotpotQA-style query files.

    Converts raw JSONL query records into a compact query structure and
    collects the documents referenced by each query's provenance titles
    from a ``DocDB``.
    """

    def process_queries(self, input_file: str) -> list:
        """Parse a JSONL query file into a list of query dicts.

        Each non-blank line of ``input_file`` is decoded as one JSON object.
        Only the first entry of the record's ``"output"`` list is used.

        Args:
            input_file: Path to a UTF-8 encoded JSONL file. Every record must
                provide ``"input"`` and a non-empty ``"output"`` list whose
                first item has ``"answer"`` and ``"provenance"``.

        Returns:
            A list of dicts shaped as
            ``{"input": ..., "output": {"answer": ..., "provenance": [...]}}``
            where every provenance entry holds ``"title"`` and an integer
            ``"wikipedia_id"``.

        Raises:
            json.JSONDecodeError: If a non-blank line is not valid JSON.
            KeyError, IndexError: If a record lacks the expected fields.
            ValueError: If a ``wikipedia_id`` cannot be converted to ``int``.
        """
        queries = []
        with open(input_file, "r", encoding="utf-8") as file:
            for line in file:
                # Blank / whitespace-only lines (e.g. a trailing empty line)
                # are not records; json.loads would raise on them.
                if not line.strip():
                    continue
                data = json.loads(line)
                first_output = data["output"][0]
                queries.append(
                    {
                        "input": data["input"],
                        "output": {
                            "answer": first_output["answer"],
                            "provenance": [
                                {
                                    "title": item["title"],
                                    "wikipedia_id": int(item["wikipedia_id"]),
                                }
                                for item in first_output["provenance"]
                            ],
                        },
                    }
                )
        return queries

    def process_documents(
        self, query_file: str, db: DocDB, queries: list = None, **kwargs
    ) -> dict:
        """Collect the documents for every provenance title in the queries.

        Args:
            query_file: Path to a JSON file holding the queries; it is read
                (with ``json.load``) only when ``queries`` is ``None``.
            db: Document database used to look up text by title.
            queries: Optional pre-sampled queries, each shaped like the items
                returned by ``process_queries``. When given, ``query_file``
                is not opened.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict mapping each distinct provenance title to the value
            returned by ``_get_documents_per_query`` for that title.
        """
        documents = {}
        # If sampled queries are provided, use them instead of the queries in
        # the query_file.
        # NOTE (carried over from the original comment): for medlfqa, the
        # query file is mandatory.
        if queries is None:
            with open(query_file, "r", encoding="utf-8") as jsonfile:
                queries = json.load(jsonfile)

        for query in queries:
            for provenance in query["output"]["provenance"]:
                title = provenance["title"]
                # Look each title up once, even if several queries cite it.
                if title not in documents:
                    documents[title] = self._get_documents_per_query(title, db)
        return documents

    def _get_documents_per_query(self, title: str, db: DocDB) -> list:
        """Return the documents stored in ``db`` under ``title``.

        The ``"text"`` fields of all records returned by
        ``db.get_text_from_title`` are concatenated in order and passed to
        ``extract_tag_content``.

        Args:
            title: Title to look up.
            db: Document database to query.

        Returns:
            The result of ``extract_tag_content`` on the concatenated text
            (presumably a list of extracted passages -- confirm against
            ``extract_tag_content``), or an empty list if anything fails.
        """
        try:
            docs = db.get_text_from_title(title)
            contents = "".join(data["text"] for data in docs)
            return extract_tag_content(contents)
        except Exception as e:
            # Deliberate best-effort: a title that cannot be retrieved is
            # reported and mapped to an empty result rather than aborting
            # the whole run.
            print(f"Error retrieving documents for title {title}: {e}")
            return []