Spaces:
Runtime error
Runtime error
| """ | |
| Google Docs Document Structure Parsing and Analysis | |
| This module provides utilities for parsing and analyzing the structure | |
| of Google Docs documents, including finding tables, cells, and other elements. | |
| """ | |
| import logging | |
| from typing import Any, Optional | |
| logger = logging.getLogger(__name__) | |
def parse_document_structure(doc_data: dict[str, Any]) -> dict[str, Any]:
    """
    Convert a raw Google Docs document into a flat, navigable summary.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with the keys "title", "body" (parsed body elements in
        document order), "tables" (the table entries of "body"), "headers"
        and "footers" (parsed segments keyed by their id) and
        "total_length" (end index of the last parsed body element, or 0
        when nothing was parsed)
    """
    parsed_body = []
    for raw_element in doc_data.get("body", {}).get("content", []):
        parsed = _parse_element(raw_element)
        # Elements of an unrecognised kind come back as None and are skipped.
        if parsed:
            parsed_body.append(parsed)

    # The table list shares its dict objects with the body list.
    parsed_tables = [item for item in parsed_body if item["type"] == "table"]

    # The document length is taken from the last element that was parsed.
    total_length = parsed_body[-1].get("end_index", 0) if parsed_body else 0

    parsed_headers = {
        segment_id: _parse_segment(segment)
        for segment_id, segment in doc_data.get("headers", {}).items()
    }
    parsed_footers = {
        segment_id: _parse_segment(segment)
        for segment_id, segment in doc_data.get("footers", {}).items()
    }

    return {
        "title": doc_data.get("title", ""),
        "body": parsed_body,
        "tables": parsed_tables,
        "headers": parsed_headers,
        "footers": parsed_footers,
        "total_length": total_length,
    }
def _parse_element(element: dict[str, Any]) -> Optional[dict[str, Any]]:
    """
    Parse a single document element.

    Recognised kinds are paragraphs, tables, section breaks and tables of
    contents; the result always carries "start_index", "end_index" and
    "type", plus kind-specific keys.

    Args:
        element: Element data from document

    Returns:
        Parsed element information, or None when the element is of none of
        the recognised kinds
    """
    element_info: dict[str, Any] = {
        "start_index": element.get("startIndex", 0),
        "end_index": element.get("endIndex", 0),
    }

    if "paragraph" in element:
        paragraph = element["paragraph"]
        element_info["type"] = "paragraph"
        element_info["text"] = _extract_paragraph_text(paragraph)
        element_info["style"] = paragraph.get("paragraphStyle", {})
    elif "table" in element:
        table = element["table"]
        table_rows = table.get("tableRows", [])
        element_info["type"] = "table"
        element_info["rows"] = len(table_rows)
        # The column count is read from the first row. A table whose row
        # list is present but empty must report 0 columns instead of
        # raising IndexError on the [0] lookup.
        element_info["columns"] = (
            len(table_rows[0].get("tableCells", [])) if table_rows else 0
        )
        element_info["cells"] = _parse_table_cells(table)
        element_info["table_style"] = table.get("tableStyle", {})
    elif "sectionBreak" in element:
        element_info["type"] = "section_break"
        element_info["section_style"] = element["sectionBreak"].get(
            "sectionStyle", {}
        )
    elif "tableOfContents" in element:
        element_info["type"] = "table_of_contents"
    else:
        return None

    return element_info
def _parse_table_cells(table: dict[str, Any]) -> list[list[dict[str, Any]]]:
    """
    Build a row-by-column grid describing every cell of a table.

    Args:
        table: Table element data

    Returns:
        2D list (rows, then columns) of cell dictionaries holding the cell's
        grid position, its start/end indices, the index at which text can be
        inserted, its extracted text and its raw content elements
    """
    grid = []
    for row_number, table_row in enumerate(table.get("tableRows", [])):
        parsed_row = []
        for column_number, table_cell in enumerate(table_row.get("tableCells", [])):
            cell_start = table_cell.get("startIndex", 0)
            cell_children = table_cell.get("content", [])

            # Fallback when no paragraph element exposes a start index:
            # one position past the cell's own start.
            insert_at = cell_start + 1
            for child in cell_children:
                if "paragraph" not in child:
                    continue
                paragraph_parts = child["paragraph"].get("elements", [])
                # Use the first paragraph whose leading element carries a
                # start index; paragraphs without one are passed over.
                if paragraph_parts and "startIndex" in paragraph_parts[0]:
                    insert_at = paragraph_parts[0]["startIndex"]
                    break

            parsed_row.append(
                {
                    "row": row_number,
                    "column": column_number,
                    "start_index": cell_start,
                    "end_index": table_cell.get("endIndex", 0),
                    "insertion_index": insert_at,
                    "content": _extract_cell_text(table_cell),
                    "content_elements": cell_children,
                }
            )
        grid.append(parsed_row)
    return grid
def _extract_paragraph_text(paragraph: dict[str, Any]) -> str:
    """Concatenate the content of every text run in a paragraph, in order."""
    # Elements without a "textRun" entry contribute nothing to the text.
    return "".join(
        part["textRun"].get("content", "")
        for part in paragraph.get("elements", [])
        if "textRun" in part
    )
def _extract_cell_text(cell: dict[str, Any]) -> str:
    """Join the text of every paragraph directly inside a table cell."""
    # Only direct paragraph children are read; other content elements
    # contribute nothing.
    paragraphs = (
        child["paragraph"]
        for child in cell.get("content", [])
        if "paragraph" in child
    )
    return "".join(_extract_paragraph_text(paragraph) for paragraph in paragraphs)
def _parse_segment(segment_data: dict[str, Any]) -> dict[str, Any]:
    """
    Summarise a document segment (header/footer).

    Returns the segment's raw content list together with the start index of
    its first content element and the end index of its last one; both
    indices are 0 when the segment has no content.
    """
    content = segment_data.get("content", [])
    if content:
        start_index = content[0].get("startIndex", 0)
        end_index = content[-1].get("endIndex", 0)
    else:
        start_index = 0
        end_index = 0
    return {
        "content": content,
        "start_index": start_index,
        "end_index": end_index,
    }
def find_tables(doc_data: dict[str, Any]) -> list[dict[str, Any]]:
    """
    List every table of the document body with its position and dimensions.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        One dictionary per table, in document order, holding its ordinal
        "index", "start_index", "end_index", "rows", "columns" and "cells"
    """
    parsed_tables = parse_document_structure(doc_data)["tables"]
    return [
        {
            "index": position,
            "start_index": parsed_table["start_index"],
            "end_index": parsed_table["end_index"],
            "rows": parsed_table["rows"],
            "columns": parsed_table["columns"],
            "cells": parsed_table["cells"],
        }
        for position, parsed_table in enumerate(parsed_tables)
    ]
def get_table_cell_indices(
    doc_data: dict[str, Any], table_index: int = 0
) -> Optional[list[list[tuple[int, int]]]]:
    """
    Get content indices for all cells in a specific table.

    For each cell, the range of the first element of its first paragraph is
    used when that element is a text run; otherwise the cell's own
    boundaries, shrunk by one position on each side, are used.

    Args:
        doc_data: Raw document data from Google Docs API
        table_index: Index of the table (0-based)

    Returns:
        2D list of (start_index, end_index) tuples for each cell, or None if
        table not found
    """
    tables = find_tables(doc_data)

    # Checking only the upper bound let an out-of-range negative index
    # (e.g. -1 on a document without tables) fall through to an IndexError.
    # Anything outside the valid indexing range now reports "not found";
    # in-range negative indices keep working as before.
    if not -len(tables) <= table_index < len(tables):
        logger.warning(
            "Table index %s not found. Document has %s tables.",
            table_index,
            len(tables),
        )
        return None

    cell_indices = []
    for row in tables[table_index]["cells"]:
        row_indices = []
        for cell in row:
            # First paragraph among the cell's raw content elements, if any.
            first_para = next(
                (
                    element["paragraph"]
                    for element in cell.get("content_elements", [])
                    if "paragraph" in element
                ),
                None,
            )
            para_elements = first_para.get("elements") if first_para else None
            if para_elements and "textRun" in para_elements[0]:
                # Use the range of the paragraph's leading text run.
                first_text_element = para_elements[0]
                start_idx = first_text_element.get(
                    "startIndex", cell["start_index"] + 1
                )
                end_idx = first_text_element.get("endIndex", start_idx + 1)
                row_indices.append((start_idx, end_idx))
                continue

            # Fallback: use cell boundaries with safe margins
            row_indices.append((cell["start_index"] + 1, cell["end_index"] - 1))
        cell_indices.append(row_indices)
    return cell_indices
def find_element_at_index(
    doc_data: dict[str, Any], index: int
) -> Optional[dict[str, Any]]:
    """
    Find what element exists at a given index in the document.

    An element matches when start_index <= index < end_index. For a table,
    the result additionally gets a "containing_cell" entry (row, column,
    cell_start, cell_end) when one of its cells covers the index.

    Args:
        doc_data: Raw document data from Google Docs API
        index: Position in the document

    Returns:
        A shallow copy of the parsed element at that position, or None when
        no body element covers the index
    """
    structure = parse_document_structure(doc_data)
    for element in structure["body"]:
        if not element["start_index"] <= index < element["end_index"]:
            continue

        element_copy = element.copy()
        if element["type"] == "table" and "cells" in element:
            # Stop at the first cell covering the index. The previous nested
            # loops only broke out of the column loop, so every remaining
            # row was still scanned and could overwrite the result.
            hit = next(
                (
                    (row_idx, col_idx, cell)
                    for row_idx, row in enumerate(element["cells"])
                    for col_idx, cell in enumerate(row)
                    if cell["start_index"] <= index < cell["end_index"]
                ),
                None,
            )
            if hit is not None:
                row_idx, col_idx, cell = hit
                element_copy["containing_cell"] = {
                    "row": row_idx,
                    "column": col_idx,
                    "cell_start": cell["start_index"],
                    "cell_end": cell["end_index"],
                }
        return element_copy
    return None
def get_next_paragraph_index(doc_data: dict[str, Any], after_index: int = 0) -> int:
    """
    Locate an insertion position that lies after a given index.

    Args:
        doc_data: Raw document data from Google Docs API
        after_index: Index after which to find insertion point

    Returns:
        Start index of the first body paragraph that begins strictly after
        after_index; when there is none, the document's total length minus
        one, or 1 if the total length is 0
    """
    structure = parse_document_structure(doc_data)

    following_start = next(
        (
            item["start_index"]
            for item in structure["body"]
            if item["type"] == "paragraph" and item["start_index"] > after_index
        ),
        None,
    )
    if following_start is not None:
        return following_start

    # No later paragraph: fall back to a position derived from the length.
    document_length = structure["total_length"]
    if document_length > 0:
        return document_length - 1
    return 1
def analyze_document_complexity(doc_data: dict[str, Any]) -> dict[str, Any]:
    """
    Summarise a document as a set of counts.

    Args:
        doc_data: Raw document data from Google Docs API

    Returns:
        Dictionary with element, table, paragraph and section-break counts,
        the total length and header/footer presence flags; when the document
        has tables it also holds "total_table_cells" and "largest_table",
        both computed as rows * columns per table
    """
    structure = parse_document_structure(doc_data)
    body_items = structure["body"]
    table_items = structure["tables"]

    def count_of(kind: str) -> int:
        # Number of parsed body elements of the given type.
        return sum(1 for item in body_items if item.get("type") == kind)

    stats = {
        "total_elements": len(body_items),
        "tables": len(table_items),
        "paragraphs": count_of("paragraph"),
        "section_breaks": count_of("section_break"),
        "total_length": structure["total_length"],
        "has_headers": bool(structure["headers"]),
        "has_footers": bool(structure["footers"]),
    }

    # Table statistics are only added when at least one table exists.
    if table_items:
        cell_counts = [item["rows"] * item["columns"] for item in table_items]
        stats["total_table_cells"] = sum(cell_counts)
        stats["largest_table"] = max(cell_counts, default=0)

    return stats