toktik-pgx
/

markuplm-large

Model card Files Files and versions

Tigran Tokmajyan committed on Feb 19, 2025

Commit

38356e3

·

1 Parent(s): 8b15006

Introduce handler.py

Files changed (2) hide show

handler.py +76 -0
requirements.txt +3 -0

handler.py ADDED Viewed

	@@ -0,0 +1,76 @@

+from transformers import AutoModelForQuestionAnswering, AutoTokenizer, MarkupLMFeatureExtractor
+import torch
+from bs4 import BeautifulSoup  # For HTML parsing
+class EndpointHandler:
+    """Extractive question answering over HTML with a MarkupLM checkpoint.
+
+    Instances are callable: they take a request dict holding an HTML
+    document and a question and return the predicted answer text.
+    """
+
+    def __init__(self, path=""):
+        """Load the model, tokenizer and HTML feature extractor.
+
+        Args:
+            path: Location passed to ``from_pretrained`` for both the model
+                and the tokenizer.
+        """
+        self.model = AutoModelForQuestionAnswering.from_pretrained(path)
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        # Built with default settings rather than loaded from ``path``.
+        self.feature_extractor = MarkupLMFeatureExtractor()
+
+    def _parse_html(self, html):
+        """Split one HTML document into text nodes and their XPaths.
+
+        Parsing is delegated to ``MarkupLMFeatureExtractor`` (instead of a
+        hand-rolled tree walk) so that nodes carry text content and XPaths
+        carry sibling subscripts, which is the input MarkupLM is documented
+        to consume.
+
+        Args:
+            html: A single HTML document as a string.
+
+        Returns:
+            A ``(nodes, xpaths)`` pair of lists for that document, one XPath
+            per node.
+        """
+        features = self.feature_extractor(html)
+        # The extractor returns batched lists; unwrap the single document.
+        return features["nodes"][0], features["xpaths"][0]
+
+    def __call__(self, data):
+        """Answer a question about an HTML document.
+
+        Args:
+            data: Request payload. ``"html"`` and ``"question"`` are read
+                from a nested ``"inputs"`` dict when present, otherwise from
+                the top level of ``data``.
+
+        Returns:
+            ``{"answer": str}``. The answer is an empty string when the HTML
+            yields no text nodes or when the predicted span is empty.
+        """
+        payload = data.get("inputs", data)
+        if not isinstance(payload, dict):
+            payload = data
+        # ``or ""`` also covers an explicit null in the JSON body.
+        html = payload.get("html") or ""
+        question = payload.get("question") or ""
+
+        nodes, xpaths = self._parse_html(html)
+        if not nodes:
+            # Nothing to search in; skip tokenization and inference.
+            return {"answer": ""}
+
+        # Question goes in the first segment, the document's nodes in the
+        # second, with one XPath per node.
+        encoding = self.tokenizer(
+            text=question,
+            text_pair=nodes,
+            xpaths=xpaths,
+            return_tensors="pt",
+            truncation=True,
+            padding=True,
+        )
+
+        with torch.no_grad():
+            outputs = self.model(**encoding)
+
+        # Most likely start/end token positions; the end index is inclusive,
+        # hence the +1 to turn it into a slice bound.
+        answer_start = int(torch.argmax(outputs.start_logits))
+        answer_end = int(torch.argmax(outputs.end_logits)) + 1
+        if answer_end <= answer_start:
+            # The model predicted an end before the start: no usable span.
+            return {"answer": ""}
+
+        span_ids = encoding["input_ids"][0][answer_start:answer_end]
+        answer = self.tokenizer.decode(span_ids, skip_special_tokens=True).strip()
+        return {"answer": answer}

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+transformers
+torch
+beautifulsoup4