| import ast |
| import json |
|
|
| from src.rag.retrieval import DocDB |
| from src.utils.string_utils import extract_tag_content |
| from src.data_processor.raw_data_processor import DatasetProcessor |
|
|
|
|
class PopQAProcessor(DatasetProcessor):
    """Processes the PopQA dataset into query and document structures.

    PopQA records are JSONL lines carrying a ``question``, a stringified
    Python list in ``possible_answers``, and Wikipedia provenance fields
    (``s_wiki_title``, ``subj_id``).
    """

    def process_queries(self, input_file: str) -> list:
        """Read a PopQA JSONL file and convert each record to a query dict.

        Args:
            input_file: Path to the PopQA ``.jsonl`` file.

        Returns:
            A list of dicts, each with ``input`` (the question text) and
            ``output`` (joined candidate answers plus Wikipedia provenance).
        """
        queries = []
        with open(input_file, "r", encoding="utf-8") as file:
            for line in file:
                data = json.loads(line)
                queries.append(
                    {
                        "input": data["question"],
                        "output": {
                            # "possible_answers" is a stringified Python list
                            # (e.g. "['a', 'b']"); literal_eval parses it
                            # without the code-execution risk of eval().
                            "answer": ", or ".join(
                                ast.literal_eval(data["possible_answers"])
                            ),
                            "provenance": [
                                {
                                    "title": data["s_wiki_title"],
                                    "wikipedia_id": data["subj_id"],
                                }
                            ],
                        },
                    }
                )
        return queries

    def process_documents(
        self, query_file: str, db: DocDB, queries: list = None, **kwargs
    ) -> dict:
        """Fetch the document for each query's first provenance title.

        Args:
            query_file: Path to a JSON file of queries; only read when
                ``queries`` is not supplied.
            db: Document database used to look up article text by title.
            queries: Optional pre-loaded list of query dicts (as produced by
                :meth:`process_queries`). When given, ``query_file`` is not
                opened at all.
            **kwargs: Ignored; accepted for interface compatibility.

        Returns:
            Mapping from provenance title to its retrieved document list.
        """
        # BUG FIX: the original unconditionally re-read query_file after this
        # None-check, clobbering any caller-supplied `queries` and reading the
        # file twice when queries was None. Load only when not provided.
        if queries is None:
            with open(query_file, "r", encoding="utf-8") as jsonfile:
                queries = json.load(jsonfile)

        documents = {}
        for query in queries:
            title = query["output"]["provenance"][0]["title"]
            # Deduplicate: fetch each distinct title only once.
            if title not in documents:
                documents[title] = self._get_documents_per_query(title, db)
        return documents

    def _get_documents_per_query(self, title: str, db: DocDB) -> list:
        """Return the extracted document contents for a single title.

        Best-effort: any retrieval failure is reported and yields ``[]``.
        """
        try:
            docs = db.get_text_from_title(title)
            # Join all text chunks in one pass instead of quadratic `+=`.
            contents = "".join(data["text"] for data in docs)
            return extract_tag_content(contents)
        except Exception as e:
            # Deliberate broad catch: one bad title must not abort the run.
            print(f"Error retrieving documents for title {title}: {e}")
            return []
|
|