hsg_rag_eea / src /pipeline /utils /serializer.py
Pygmales
synched versions
698965e
raw
history blame
2.2 kB
from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer, ChunkingSerializerProvider
from docling_core.transforms.serializer.base import BaseTableSerializer, SerializationResult
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.types.doc.document import RichTableCell
class EnhancedTableSerializer(BaseTableSerializer):
    """Serialize a table as a bullet list keyed by each row's first column.

    The first row that contains any text is used as the header row. Every
    following row is emitted as ``- <first cell>:`` followed by one
    ``<header>: <value>`` sub-bullet per non-empty remaining cell, with a
    blank line between rows.
    """

    def serialize(self, *, item, doc_serializer, doc, **kwargs) -> SerializationResult:
        """Render the table ``item`` as nested bullet text.

        Args:
            item: Table item; ``item.self_ref`` and ``item.data.grid`` are read.
            doc_serializer: Document serializer, used to look up excluded
                references and to serialize rich (nested) table cells.
            doc: Document against which rich-cell references are resolved.
            **kwargs: Forwarded unchanged to ``doc_serializer``.

        Returns:
            A serialization result holding the bullet text. The text is empty
            when the table is excluded, has no grid, or has no non-blank row.
        """
        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
            return create_ser_result(text='')

        grid = item.data.grid
        if not grid:
            return create_ser_result(text='')

        # Collect the stripped text of every cell, dropping rows that are
        # entirely blank.
        row_cells = []
        for row in grid:
            clean_row = []
            for cell in row:
                if isinstance(cell, RichTableCell):
                    # Rich cells reference another document item; serialize
                    # that item to obtain the cell's text.
                    ser = doc_serializer.serialize(item=cell.ref.resolve(doc), **kwargs)
                    clean_row.append(ser.text.strip())
                else:
                    clean_row.append((cell.text or "").strip())
            if any(clean_row):
                row_cells.append(clean_row)

        # A grid whose cells are all blank leaves nothing to serialize;
        # without this guard the header lookup below raises IndexError.
        if not row_cells:
            return create_ser_result(text='')

        headers, *data_rows = row_cells

        lines = []
        for row in data_rows:
            # Cells were stripped above, so a falsy first cell means "no key".
            if len(row) < 2 or not row[0]:
                continue
            main_key = row[0].replace('\n', ' ')
            lines.append(f'- {main_key}:')
            for i, cell_text in enumerate(row[1:], start=1):
                value = cell_text.replace('\n', ' ')
                if not value:
                    continue
                # Rows may be wider than the header row; such values get an
                # empty label.
                sub_header = headers[i].replace('\n', ' ') if i < len(headers) else ""
                lines.append(f' - {sub_header}: {value}')
            # Blank separator between entries; the trailing one is removed below.
            lines.append("")

        final_text = "\n".join(lines).rstrip()
        return create_ser_result(text=final_text, span_source=item)
class EnhansedSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that renders tables with ``EnhancedTableSerializer``.

    NOTE(review): the class name is misspelled ("Enhansed"). It is kept as-is
    because code outside this file presumably imports it under this name.
    """

    def get_serializer(self, doc):
        """Build the chunking serializer for ``doc`` with the custom table serializer."""
        table_serializer = EnhancedTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=table_serializer)