Spaces:
Sleeping
Sleeping
| import math | |
| import spacy | |
# Load the pre-trained spaCy English model (used by the verb-ratio feature).
# Try the installed model first and download it only when it is missing, so
# importing this module does not hit the network on every start.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
def verb_ratio(text):
    """Compute the share of verbs among the alphabetic tokens of a short text.

    Args:
        text : Sentence to analyse with the module-level spaCy pipeline ``nlp``
    Returns:
        Number of tokens tagged "VERB" (whose lowercased lemma is not
        "username") divided by the number of alphabetic tokens.
        0.0 when the parsed text has more than 5 tokens or no alphabetic token.
    """
    tokens = nlp(text)
    # Only short texts are scored; anything longer than 5 tokens gets 0.0.
    if len(tokens) > 5:
        return 0.0

    verbs = 0
    words = 0
    for tok in tokens:
        if tok.pos_ == "VERB" and tok.lemma_.lower() != "username":
            verbs += 1
        if tok.is_alpha:  # Count only valid words
            words += 1

    # Avoid division by zero when the text has no alphabetic token.
    if words > 0:
        return verbs / words
    return 0.0
def is_near_gray(r, g, b, threshold=30, min_val=50, max_val=200):
    """Tell whether an RGB color is a mid-range shade of gray.

    Args:
        r, g, b : Channel values of the color
        threshold : Maximum allowed difference between any two channels
        min_val : Minimum acceptable channel value (inclusive)
        max_val : Maximum acceptable channel value (inclusive)
    Returns:
        True if every channel lies in [min_val, max_val] and no two channels
        differ by more than threshold, otherwise False
    Notes:
        - With the defaults, very dark (<50) and very light (>200) grays are excluded.
    """
    channels = (r, g, b)
    # Reject colors with any channel outside the accepted brightness band.
    if not all(min_val <= value <= max_val for value in channels):
        return False
    # The largest pairwise channel difference is the max-min spread.
    return max(channels) - min(channels) <= threshold
def find_nearest_text_node(node, text_nodes):
    """Return the Euclidean distance from a node's center to the closest text node.

    Args:
        node : Dict whose "node" entry holds the box ("x", "y", "width", "height");
            missing values count as 0
        text_nodes : List of text nodes, each a dict with 'x' and 'y' coordinates
    Returns:
        Distance to the nearest text node, or 9999999.0 if text_nodes is empty
    """
    if not text_nodes:
        return 9999999.0  # Sentinel "far away" value when there is nothing to measure

    box = node.get("node", {})
    center_x = box.get("x", 0) + box.get("width", 0) / 2
    center_y = box.get("y", 0) + box.get("height", 0) / 2

    distances = [
        math.sqrt((center_x - other['x']) ** 2 + (center_y - other['y']) ** 2)
        for other in text_nodes
    ]
    # Seed with infinity so the reduction matches a running minimum from inf.
    return min([float('inf'), *distances])
def color_difference(color1, color2):
    """
    Measure how far apart two RGB colors are.

    The Euclidean distance between the colors in RGB space is divided by the
    distance between (0, 0, 0) and (255, 255, 255), so for 0-255 channels the
    result is 0 for identical colors and 1 for completely different ones.
    Returns 0 when either color is missing/empty.
    """
    if not color1 or not color2:
        return 0

    # Unpack the three channels of each color
    r1, g1, b1 = color1
    r2, g2, b2 = color2
    delta_r, delta_g, delta_b = r2 - r1, g2 - g1, b2 - b1

    # Largest possible distance between two 0-255 RGB colors
    full_scale = math.sqrt(3 * 255**2)
    return math.sqrt(delta_r ** 2 + delta_g ** 2 + delta_b ** 2) / full_scale
def collect_text_nodes(node):
    """Collect every TEXT node in the subtree rooted at node (root included).

    Args:
        node : Dict with the node's properties under "node" and its child
            nodes under "children"
    Returns:
        List of dicts in pre-order, one per node whose type is "TEXT", each
        holding 'x' and 'y' (center of the node's box, missing values count
        as 0) and 'text' (the node's characters with surrounding whitespace
        stripped)
    """
    node_data = node.get("node", {})
    found = []

    if node_data.get('type', '') == "TEXT":
        found.append({
            'x': node_data.get("x", 0) + node_data.get("width", 0) / 2,
            'y': node_data.get("y", 0) + node_data.get("height", 0) / 2,
            # "characters" may be present but null; treat that as empty text
            # instead of failing on None.strip().
            'text': (node_data.get('characters') or '').strip(),
        })

    # Recursively gather text nodes from the children, keeping document order
    for child in node.get("children", []):
        found.extend(collect_text_nodes(child))
    return found
def count_all_descendants(node):
    """Return how many nodes sit below node (children, grandchildren, ...).

    The node itself is not counted.
    """
    # Each child contributes itself plus everything beneath it.
    return sum(1 + count_all_descendants(kid) for kid in node.get("children", []))
def count_chars_to_end(node: dict) -> int:
    """Count the characters held by all descendants of node.

    The node's own "characters" are not included; only its children and
    everything beneath them are counted.

    Args:
        node : Dict with child nodes under "children"; each child keeps its
            text (if any) under child["node"]["characters"]
    Returns:
        Total number of characters in the descendants of node
    """
    total = 0
    for child in node.get("children", []):
        # "characters" may be missing or null on non-text nodes; both count
        # as empty text instead of raising on len(None).
        total += len(child.get("node", {}).get("characters") or "")
        total += count_chars_to_end(child)
    return total
def get_center_of_weight(node):
    """Measure how far the children's area-weighted x-center is from the parent's x-center.

    Each direct child is weighted by its area (width * height). The absolute
    horizontal offset between that weighted center and the parent's own
    x-center is divided by the parent's width (or by 1 when the width is
    missing or 0). Returns 0 offset when the children have no area.
    """
    parent = node.get("node", {})
    parent_width = parent.get("width", 0)
    parent_cx = parent.get("x", 0) + parent_width / 2

    area_sum = 0
    moment_sum = 0
    for child in node.get("children", []):
        box = child.get("node", {})
        box_width = box.get("width", 0)
        area = box_width * box.get("height", 0)
        moment_sum += area * (box.get("x", 0) + box_width / 2)
        area_sum += area

    # Without any child area, fall back to the parent's center (offset of 0).
    if area_sum:
        balance_x = moment_sum / area_sum
    else:
        balance_x = parent_cx

    return abs(parent_cx - balance_x) / (parent_width or 1)