Spaces:
Running
Running
File size: 2,200 Bytes
from docling_core.transforms.chunker.hierarchical_chunker import ChunkingDocSerializer, ChunkingSerializerProvider
from docling_core.transforms.serializer.base import BaseTableSerializer, SerializationResult
from docling_core.transforms.serializer.common import create_ser_result
from docling_core.types.doc.document import RichTableCell
class EnhancedTableSerializer(BaseTableSerializer):
    """Serialize a table item as a nested bullet list keyed by its first column.

    The first non-empty row of the table is used as the header row. Every
    following row becomes a top-level bullet (``- <first cell>:``) followed by
    one sub-bullet per non-empty remaining cell (`` - <header>: <value>``).
    """

    @staticmethod
    def _one_line(text):
        """Return *text* stripped, with embedded newlines replaced by spaces."""
        return text.strip().replace('\n', ' ')

    def serialize(self, *, item, doc_serializer, doc, **kwargs) -> SerializationResult:
        """Render ``item`` as key/value bullet text.

        Args:
            item: Table item to serialize; its ``data.grid`` is read row by row.
            doc_serializer: Document serializer, used to check excluded refs
                and to serialize the content referenced by rich cells.
            doc: Document against which rich-cell references are resolved.
            **kwargs: Forwarded to ``doc_serializer`` calls unchanged.

        Returns:
            A serialization result. Its text is empty when the item is
            excluded, the grid is empty, or no row contains any text.
        """
        if item.self_ref in doc_serializer.get_excluded_refs(**kwargs):
            return create_ser_result(text='')

        grid = item.data.grid
        if not grid:
            return create_ser_result(text='')

        # Collect the stripped text of every cell, dropping rows that are
        # entirely empty.
        row_cells = []
        for row in grid:
            clean_row = []
            for cell in row:
                if isinstance(cell, RichTableCell):
                    # Rich cells point at another document item; serialize
                    # that item and use its text.
                    ser = doc_serializer.serialize(item=cell.ref.resolve(doc), **kwargs)
                    clean_row.append(ser.text.strip())
                else:
                    clean_row.append((cell.text or "").strip())
            if any(clean_row):
                row_cells.append(clean_row)

        # A grid whose rows are all blank leaves nothing to use as a header
        # row; indexing row_cells[0] would raise IndexError.
        if not row_cells:
            return create_ser_result(text='')

        headers = row_cells[0]
        data_rows = row_cells[1:]

        lines = []
        for row in data_rows:
            # A row needs a non-empty key cell and at least one value cell.
            if len(row) < 2 or not row[0].strip():
                continue
            lines.append(f'- {self._one_line(row[0])}:')
            for i, raw_value in enumerate(row[1:], start=1):
                value = self._one_line(raw_value)
                if not value:
                    continue
                # Rows wider than the header row get an empty header label.
                sub_header = self._one_line(headers[i]) if i < len(headers) else ""
                lines.append(f' - {sub_header}: {value}')
            # Blank separator line between entries; the trailing one is
            # removed by rstrip() below.
            lines.append("")

        final_text = "\n".join(lines).rstrip()
        return create_ser_result(text=final_text, span_source=item)
class EnhansedSerializerProvider(ChunkingSerializerProvider):
    """Serializer provider that plugs in ``EnhancedTableSerializer`` for tables."""

    def get_serializer(self, doc):
        """Build a ``ChunkingDocSerializer`` for ``doc`` with the custom table serializer."""
        table_serializer = EnhancedTableSerializer()
        return ChunkingDocSerializer(doc=doc, table_serializer=table_serializer)
|