File size: 1,949 Bytes
29f88f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from datasets import load_dataset
from typing import List, Dict, Any


def preprocess_context(text: str) -> str:
    """Collapse all whitespace runs (newlines, tabs, repeated spaces) into single spaces.

    Args:
        text: raw context string, possibly None.

    Returns:
        Normalized single-line text, or "" when text is None.
    """
    if text is None:
        return ""
    tokens = text.split()
    return " ".join(tokens)


def preprocess_table(table: Any) -> str:
    """Convert the table to plain text for retriever.

    Generalizes the plain str() conversion: a table supplied as a list of
    rows (each row itself a list of cells) is rendered one row per line
    with " | " between cells, which is far more readable for retrieval
    than the Python repr of a nested list. Any other non-None value still
    falls back to str(), so existing callers are unaffected.

    Args:
        table: table data — a list of row lists, a string, None, or any
            other object.

    Returns:
        Plain-text rendering of the table, or "" when table is None.
    """
    if table is None:
        return ""
    # Render list-of-lists tables row by row instead of dumping the repr.
    if isinstance(table, list) and all(isinstance(row, list) for row in table):
        return "\n".join(" | ".join(str(cell) for cell in row) for row in table)
    return str(table)


def create_documents(dataset: List[Dict[str, Any]], chunk_size: int = 500) -> List[Dict[str, Any]]:
    """
    Creates documents from context chunks and a table.

    Each example's cleaned context (plus its table rendering, when
    non-empty) is split into fixed-size character chunks; every chunk
    becomes one document that carries the example's metadata. An example
    whose combined text is empty produces no documents.

    Args:
        dataset: list of examples from the dataset; each example must
            contain an 'id' key (a KeyError is raised otherwise).
        chunk_size: chunk size in characters; must be >= 1.

    Returns:
        List of documents in the format:
        [{"id": ..., "text": ..., "metadata": {...}}, ...]

    Raises:
        ValueError: if chunk_size < 1. Previously chunk_size=0 raised a
            cryptic range() error and a negative value silently produced
            zero documents (silent data loss).
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    documents = []
    for example in dataset:
        context_text = preprocess_context(example.get('context'))
        table_text = preprocess_table(example.get('table'))

        full_text = context_text
        if table_text:
            full_text += "\nTable:\n" + table_text

        # Fixed-size character chunks; the chunk index is encoded in the id
        # so chunks of one example remain distinguishable and ordered.
        for start in range(0, len(full_text), chunk_size):
            documents.append({
                "id": f"{example['id']}_{start // chunk_size}",
                "text": full_text[start:start + chunk_size],
                "metadata": {
                    "question": example.get('question'),
                    "original_answer": example.get('original_answer'),
                    "file_name": example.get('file_name'),
                    "company_name": example.get('company_name'),
                    "report_year": example.get('report_year')
                }
            })
    return documents


def load_finqa_dataset(split: str = "train") -> List[Dict[str, Any]]:
    """Load the FinQA subset of t2-ragbench and convert it to documents.

    Args:
        split: name of the dataset split to load (e.g. "train").

    Returns:
        List of chunked documents produced by create_documents().
    """
    dataset = load_dataset("G4KMU/t2-ragbench", "FinQA")
    split_examples = dataset[split]
    return create_documents(split_examples)