Spaces:

jawadsaghir12
/

google-mcp-server

Runtime error

App Files Files Community

google-mcp-server / gdocs /docs_structure.py

jawadsaghir12

new

0887862 19 days ago

raw

history blame contribute delete

12.1 kB

	"""
	Google Docs Document Structure Parsing and Analysis

	This module provides utilities for parsing and analyzing the structure
	of Google Docs documents, including finding tables, cells, and other elements.
	"""

	import logging
	from typing import Any, Optional

	logger = logging.getLogger(__name__)


	def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]:
	    """
	    Build a navigable summary of a raw Google Docs document.

	    Args:
	        doc_data: Raw document data from Google Docs API

	    Returns:
	        Dictionary with the keys ``title``, ``body`` (parsed elements in
	        document order), ``tables`` (the table entries of ``body``),
	        ``headers`` / ``footers`` (parsed segments keyed by their id) and
	        ``total_length`` (end index of the last parsed body element, or 0
	        when nothing was parsed).
	    """
	    parsed_body = []
	    for raw_element in doc_data.get("body", {}).get("content", []):
	        parsed = _parse_element(raw_element)
	        # Elements of an unrecognised kind come back as None and are dropped.
	        if parsed:
	            parsed_body.append(parsed)

	    # The table list shares its dict objects with the body list.
	    parsed_tables = [item for item in parsed_body if item["type"] == "table"]

	    # Document length is taken from the last element that was kept.
	    total_length = parsed_body[-1].get("end_index", 0) if parsed_body else 0

	    parsed_headers = {
	        header_id: _parse_segment(header_data)
	        for header_id, header_data in doc_data.get("headers", {}).items()
	    }
	    parsed_footers = {
	        footer_id: _parse_segment(footer_data)
	        for footer_id, footer_data in doc_data.get("footers", {}).items()
	    }

	    return {
	        "title": doc_data.get("title", ""),
	        "body": parsed_body,
	        "tables": parsed_tables,
	        "headers": parsed_headers,
	        "footers": parsed_footers,
	        "total_length": total_length,
	    }


	def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]:
	    """
	    Parse a single document element.

	    Args:
	        element: Element data from document

	    Returns:
	        Parsed element information carrying ``start_index``, ``end_index``
	        and ``type`` plus type-specific keys, or None when the element is
	        not a paragraph, table, section break or table of contents.
	    """
	    element_info = {
	        "start_index": element.get("startIndex", 0),
	        "end_index": element.get("endIndex", 0),
	    }

	    if "paragraph" in element:
	        paragraph = element["paragraph"]
	        element_info["type"] = "paragraph"
	        element_info["text"] = _extract_paragraph_text(paragraph)
	        element_info["style"] = paragraph.get("paragraphStyle", {})

	    elif "table" in element:
	        table = element["table"]
	        table_rows = table.get("tableRows", [])
	        element_info["type"] = "table"
	        element_info["rows"] = len(table_rows)
	        # The column count comes from the first row. A table whose
	        # "tableRows" is present but empty has no first row, so report
	        # zero columns instead of indexing [0] and raising IndexError.
	        element_info["columns"] = (
	            len(table_rows[0].get("tableCells", [])) if table_rows else 0
	        )
	        element_info["cells"] = _parse_table_cells(table)
	        element_info["table_style"] = table.get("tableStyle", {})

	    elif "sectionBreak" in element:
	        element_info["type"] = "section_break"
	        element_info["section_style"] = element["sectionBreak"].get("sectionStyle", {})

	    elif "tableOfContents" in element:
	        element_info["type"] = "table_of_contents"

	    else:
	        # Unknown element kinds are reported as "nothing parsed".
	        return None

	    return element_info


	def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]:
	    """
	    Describe every cell of a table: position, index range and text.

	    Args:
	        table: Table element data

	    Returns:
	        2D list (rows, then columns) of cell information dictionaries
	    """
	    grid = []
	    for row_number, table_row in enumerate(table.get("tableRows", [])):
	        parsed_row = []
	        for column_number, raw_cell in enumerate(table_row.get("tableCells", [])):
	            cell_elements = raw_cell.get("content", [])

	            # Fallback insertion point: one position past the cell start.
	            insert_at = raw_cell.get("startIndex", 0) + 1

	            # Prefer the start of the leading element of the first
	            # paragraph whose leading element carries a "startIndex".
	            for item in cell_elements:
	                if "paragraph" not in item:
	                    continue
	                leading = item["paragraph"].get("elements", [])
	                if leading and "startIndex" in leading[0]:
	                    insert_at = leading[0]["startIndex"]
	                    break

	            parsed_row.append(
	                {
	                    "row": row_number,
	                    "column": column_number,
	                    "start_index": raw_cell.get("startIndex", 0),
	                    "end_index": raw_cell.get("endIndex", 0),
	                    # Where to insert text in this cell
	                    "insertion_index": insert_at,
	                    "content": _extract_cell_text(raw_cell),
	                    "content_elements": cell_elements,
	                }
	            )
	        grid.append(parsed_row)
	    return grid


	def _extract_paragraph_text(paragraph: dict[str, Any]) -> str:
	    """Concatenate the content of every text run in a paragraph."""
	    # Elements without a "textRun" contribute nothing to the text.
	    return "".join(
	        piece["textRun"].get("content", "")
	        for piece in paragraph.get("elements", [])
	        if "textRun" in piece
	    )


	def _extract_cell_text(cell: dict[str, Any]) -> str:
	    """Join the text of all paragraphs found directly in a table cell."""
	    # Non-paragraph content of the cell is skipped.
	    return "".join(
	        _extract_paragraph_text(item["paragraph"])
	        for item in cell.get("content", [])
	        if "paragraph" in item
	    )


	def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]:
	    """
	    Summarise a document segment (header/footer).

	    Returns the segment's content together with the start index of its
	    first element and the end index of its last element; both indices are
	    0 when the segment has no content.
	    """
	    elements = segment_data.get("content")
	    if elements:
	        first_start = elements[0].get("startIndex", 0)
	        last_end = elements[-1].get("endIndex", 0)
	    else:
	        first_start = last_end = 0

	    return {
	        "content": segment_data.get("content", []),
	        "start_index": first_start,
	        "end_index": last_end,
	    }


	def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]:
	    """
	    List every table of the document body with its position and size.

	    Args:
	        doc_data: Raw document data from Google Docs API

	    Returns:
	        List of table information dictionaries, one per table in document
	        order; ``index`` is the table's 0-based position in that list.
	    """
	    parsed = parse_document_structure(doc_data)
	    return [
	        {
	            "index": position,
	            "start_index": entry["start_index"],
	            "end_index": entry["end_index"],
	            "rows": entry["rows"],
	            "columns": entry["columns"],
	            "cells": entry["cells"],
	        }
	        for position, entry in enumerate(parsed["tables"])
	    ]


	def get_table_cell_indices(
	    doc_data: dict[str, Any], table_index: int = 0
	) -> Optional[list[list[tuple[int, int]]]]:
	    """
	    Get content indices for all cells in a specific table.

	    For each cell the result is the index span of the leading element of
	    the cell's first paragraph when that element is a text run; otherwise
	    the cell's own boundaries moved inwards by one position on each side.

	    Args:
	        doc_data: Raw document data from Google Docs API
	        table_index: Index of the table (0-based)

	    Returns:
	        2D list of (start_index, end_index) tuples for each cell, or None
	        if table_index does not refer to a table (negative, or not smaller
	        than the number of tables).
	    """
	    tables = find_tables(doc_data)

	    # Negative values are rejected as well: list indexing would otherwise
	    # wrap them around to a table counted from the end, or raise
	    # IndexError when the document has no tables at all.
	    if table_index < 0 or table_index >= len(tables):
	        logger.warning(
	            f"Table index {table_index} not found. Document has {len(tables)} tables."
	        )
	        return None

	    table = tables[table_index]
	    cell_indices = []

	    for row in table["cells"]:
	        row_indices = []
	        for cell in row:
	            span = None

	            # Look for the first paragraph in the cell content.
	            first_para = next(
	                (
	                    element["paragraph"]
	                    for element in cell.get("content_elements") or []
	                    if "paragraph" in element
	                ),
	                None,
	            )

	            if first_para and first_para.get("elements"):
	                # Use the span of the paragraph's leading element, but
	                # only when it is a text run.
	                first_text_element = first_para["elements"][0]
	                if "textRun" in first_text_element:
	                    start_idx = first_text_element.get(
	                        "startIndex", cell["start_index"] + 1
	                    )
	                    end_idx = first_text_element.get("endIndex", start_idx + 1)
	                    span = (start_idx, end_idx)

	            if span is None:
	                # Fallback: use cell boundaries with safe margins
	                span = (cell["start_index"] + 1, cell["end_index"] - 1)

	            row_indices.append(span)
	        cell_indices.append(row_indices)

	    return cell_indices


	def find_element_at_index(
	    doc_data: dict[str, Any], index: int
	) -> Optional[dict[str, Any]]:
	    """
	    Locate the body element whose index range contains a position.

	    Args:
	        doc_data: Raw document data from Google Docs API
	        index: Position in the document

	    Returns:
	        A shallow copy of the first parsed body element with
	        ``start_index <= index < end_index``; for tables a
	        ``containing_cell`` entry is added when a cell covers the position.
	        None when no body element covers the position.
	    """
	    structure = parse_document_structure(doc_data)

	    for candidate in structure["body"]:
	        if not (candidate["start_index"] <= index < candidate["end_index"]):
	            continue

	        found = candidate.copy()

	        # For a table, also report which cell the position falls into.
	        if candidate["type"] == "table" and "cells" in candidate:
	            for row_number, row in enumerate(candidate["cells"]):
	                # Only the first matching cell of each row is considered.
	                hit = next(
	                    (
	                        (column_number, cell)
	                        for column_number, cell in enumerate(row)
	                        if cell["start_index"] <= index < cell["end_index"]
	                    ),
	                    None,
	                )
	                if hit is not None:
	                    column_number, cell = hit
	                    found["containing_cell"] = {
	                        "row": row_number,
	                        "column": column_number,
	                        "cell_start": cell["start_index"],
	                        "cell_end": cell["end_index"],
	                    }

	        return found

	    return None


	def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int:
	    """
	    Pick an insertion position located after a given index.

	    Args:
	        doc_data: Raw document data from Google Docs API
	        after_index: Index after which to find insertion point

	    Returns:
	        Start index of the first body paragraph that begins after
	        ``after_index``; when there is none, the document's total length
	        minus one, or 1 if that length is not positive.
	    """
	    structure = parse_document_structure(doc_data)

	    later_paragraph_starts = (
	        item["start_index"]
	        for item in structure["body"]
	        if item["type"] == "paragraph" and item["start_index"] > after_index
	    )
	    next_start = next(later_paragraph_starts, None)
	    if next_start is not None:
	        return next_start

	    # No later paragraph: fall back to the end of the document.
	    document_length = structure["total_length"]
	    if document_length > 0:
	        return document_length - 1
	    return 1


	def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]:
	    """
	    Collect simple size statistics about a document.

	    Args:
	        doc_data: Raw document data from Google Docs API

	    Returns:
	        Dictionary with element, table, paragraph and section-break counts,
	        the total length and header/footer presence flags. When the
	        document has tables, ``total_table_cells`` and ``largest_table``
	        (both counted as rows * columns) are included as well.
	    """
	    structure = parse_document_structure(doc_data)
	    body_elements = structure["body"]
	    table_entries = structure["tables"]

	    paragraph_count = len(
	        [item for item in body_elements if item.get("type") == "paragraph"]
	    )
	    section_break_count = len(
	        [item for item in body_elements if item.get("type") == "section_break"]
	    )

	    stats = {
	        "total_elements": len(body_elements),
	        "tables": len(table_entries),
	        "paragraphs": paragraph_count,
	        "section_breaks": section_break_count,
	        "total_length": structure["total_length"],
	        "has_headers": bool(structure["headers"]),
	        "has_footers": bool(structure["footers"]),
	    }

	    # Table statistics are only reported for documents that have tables.
	    if table_entries:
	        cells_per_table = [
	            entry["rows"] * entry["columns"] for entry in table_entries
	        ]
	        stats["total_table_cells"] = sum(cells_per_table)
	        stats["largest_table"] = max(cells_per_table, default=0)

	    return stats