Spaces:
Sleeping
Sleeping
File size: 1,949 Bytes
29f88f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
from datasets import load_dataset
from typing import List, Dict, Any
def preprocess_context(text: str) -> str:
    """Collapse all whitespace (line breaks, tabs, repeated spaces) to single spaces.

    A ``None`` input is treated as an empty document and yields "".
    """
    return "" if text is None else " ".join(text.split())
def preprocess_table(table: Any) -> str:
    """Serialize a table into plain text for the retriever.

    A list-of-rows table (the common FinQA shape: list of lists/tuples of
    cells) is rendered one row per line with ``" | "`` between cells —
    far more readable for retrieval than the raw Python ``repr`` that a
    bare ``str()`` would produce.  Any other non-None value is
    stringified as before; ``None`` becomes "".

    Args:
        table: the raw table field from a dataset example (may be None).

    Returns:
        Plain-text rendering of the table, or "" when there is no table.
    """
    if table is None:
        return ""
    # Row-by-row rendering instead of repr: "[['a', 'b']]" -> "a | b".
    if isinstance(table, list) and all(isinstance(row, (list, tuple)) for row in table):
        return "\n".join(" | ".join(str(cell) for cell in row) for row in table)
    return str(table)
def create_documents(dataset: List[Dict[str, Any]], chunk_size: int = 500) -> List[Dict[str, Any]]:
    """
    Creates documents from context chunks and a table.

    Each example's cleaned context is concatenated with its serialized
    table (under a "Table:" header), then split into fixed-size character
    chunks.  An example whose combined text is empty yields no documents.

    Args:
        dataset: list of examples from the dataset.
        chunk_size: chunk size in characters.

    Returns:
        List of documents in the format:
        [{"id": ..., "text": ..., "metadata": {...}}, ...]
    """
    documents: List[Dict[str, Any]] = []
    for position, example in enumerate(dataset):
        context_text = preprocess_context(example.get('context'))
        table_text = preprocess_table(example.get('table'))
        full_text = context_text
        if table_text:
            full_text += "\nTable:\n" + table_text
        # Every other field is read with .get; do the same for 'id' and
        # fall back to the example's position instead of raising KeyError.
        example_id = example.get('id', position)
        for offset in range(0, len(full_text), chunk_size):
            documents.append({
                "id": f"{example_id}_{offset // chunk_size}",
                "text": full_text[offset:offset + chunk_size],
                "metadata": {
                    "question": example.get('question'),
                    "original_answer": example.get('original_answer'),
                    "file_name": example.get('file_name'),
                    "company_name": example.get('company_name'),
                    "report_year": example.get('report_year')
                }
            })
    return documents
def load_finqa_dataset(split: str = "train") -> List[Dict[str, Any]]:
    """Download the FinQA configuration of t2-ragbench and return the
    requested split converted into retriever documents."""
    dataset = load_dataset("G4KMU/t2-ragbench", "FinQA")
    return create_documents(dataset[split])
|