# FinQA → RAG document preprocessing utilities (scraped from a Hugging Face Space).
| from datasets import load_dataset | |
| from typing import List, Dict, Any | |
def preprocess_context(text: str) -> str:
    """Normalize whitespace in *text*: newlines, tabs, and runs of spaces
    collapse to single spaces. Returns "" when *text* is None.
    """
    return " ".join(text.split()) if text is not None else ""
def preprocess_table(table: Any) -> str:
    """Render a table as plain text for the retriever.

    A list/tuple-of-rows table (the FinQA format) is rendered one row per
    line with " | " between cells, instead of the previous bare ``str()``
    call, which produced the Python repr ("[['a', 'b'], ...]") — noisy
    text that hurts retrieval quality. Any other non-None value still
    falls back to ``str()``.

    Args:
        table: Table data (list of rows, string, or None).

    Returns:
        Plain-text rendering of the table; "" when *table* is None.
    """
    if table is None:
        return ""
    if isinstance(table, (list, tuple)):
        lines = []
        for row in table:
            if isinstance(row, (list, tuple)):
                lines.append(" | ".join(str(cell) for cell in row))
            else:
                lines.append(str(row))
        return "\n".join(lines)
    return str(table)
def create_documents(dataset: List[Dict[str, Any]], chunk_size: int = 500) -> List[Dict[str, Any]]:
    """Split each example's context (plus optional table text) into
    fixed-size character chunks wrapped as retriever documents.

    Args:
        dataset: List of FinQA-style example dicts.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        Documents shaped as ``{"id", "text", "metadata"}``, where the id
        is "<example id>_<chunk index>". An example with no text at all
        yields no documents.
    """
    meta_keys = ("question", "original_answer", "file_name", "company_name", "report_year")
    documents: List[Dict[str, Any]] = []
    for example in dataset:
        body = preprocess_context(example.get("context"))
        table_text = preprocess_table(example.get("table"))
        if table_text:
            body = body + "\nTable:\n" + table_text
        # One document per chunk_size-character slice of the combined text;
        # a fresh metadata dict is built for every chunk.
        for chunk_index, start in enumerate(range(0, len(body), chunk_size)):
            documents.append({
                "id": f"{example['id']}_{chunk_index}",
                "text": body[start:start + chunk_size],
                "metadata": {key: example.get(key) for key in meta_keys},
            })
    return documents
def load_finqa_dataset(split: str = "train") -> List[Dict[str, Any]]:
    """Download the FinQA subset of T2-RAGBench and return it chunked
    into retriever documents for the requested *split*.
    """
    finqa = load_dataset("G4KMU/t2-ragbench", "FinQA")
    return create_documents(finqa[split])