Tigran Tokmajyan committed on
Commit
38356e3
·
1 Parent(s): 8b15006

Introduce handler.py

Browse files
Files changed (2) hide show
  1. handler.py +76 -0
  2. requirements.txt +3 -0
handler.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoModelForQuestionAnswering, AutoTokenizer, MarkupLMFeatureExtractor
2
+ import torch
3
+ from bs4 import BeautifulSoup # For HTML parsing
4
+
5
class EndpointHandler:
    """Inference-endpoint handler for extractive question answering over HTML
    with a MarkupLM-style model.

    Expects a payload dict with keys:
        - "html":     raw HTML document to search (str)
        - "question": natural-language question (str)
    Returns a dict: {"answer": <decoded answer span as a string>}.
    """

    def __init__(self, path=""):
        """Load the model, tokenizer, and HTML feature extractor from *path*."""
        self.model = AutoModelForQuestionAnswering.from_pretrained(path)
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        # Converts raw HTML into text nodes and their XPath expressions,
        # which MarkupLM needs alongside the token ids.
        self.feature_extractor = MarkupLMFeatureExtractor()
        # Inference only: disable dropout etc.
        self.model.eval()

    def _parse_html(self, html):
        """Return (nodes, xpaths) extracted from *html* with BeautifulSoup.

        Kept for backward compatibility; ``__call__`` now relies on
        ``MarkupLMFeatureExtractor``, which produces properly indexed XPaths.
        """
        soup = BeautifulSoup(html, "html.parser")
        nodes = []
        xpaths = []
        # Walk every element tag and record its (simplified) XPath.
        for element in soup.descendants:
            if element.name:
                xpath = self._get_xpath(element)
                nodes.append(element.name)
                xpaths.append(xpath)
        return nodes, xpaths

    def _get_xpath(self, element):
        """Build a simplified XPath (tag names only, no positional indices)
        by walking up the parent chain of *element*."""
        parts = []
        while element.parent is not None:
            if element.name:
                parts.append(element.name)
            element = element.parent
        return "/".join(reversed(parts))

    def __call__(self, data):
        """Answer *data["question"]* from *data["html"]* and return the span."""
        html = data.get("html", "")
        question = data.get("question", "")

        # BUG FIX: MarkupLMFeatureExtractor takes raw HTML and *returns*
        # nodes/xpaths; the previous code passed nodes/xpaths *into* it and
        # merged its output with a plain text encoding, which does not form
        # valid MarkupLM model inputs.
        features = self.feature_extractor(html)
        nodes = features["nodes"][0]
        xpaths = features["xpaths"][0]

        # The MarkupLM tokenizer pairs the question (text) with the node
        # list (text_pair) and embeds the XPaths alongside the tokens, so a
        # single encoding carries everything the model needs.
        encoding = self.tokenizer(
            text=question,
            text_pair=nodes,
            xpaths=xpaths,
            return_tensors="pt",
            truncation=True,
            padding=True,
        )

        # Run inference without building autograd state.
        with torch.no_grad():
            outputs = self.model(**encoding)

        # Greedy span decoding: most likely start and end token positions.
        answer_start = int(torch.argmax(outputs.start_logits))
        answer_end = int(torch.argmax(outputs.end_logits)) + 1
        # Guard against a degenerate span where the predicted end precedes
        # the predicted start; fall back to a single-token answer.
        if answer_end <= answer_start:
            answer_end = answer_start + 1
        answer = self.tokenizer.convert_tokens_to_string(
            self.tokenizer.convert_ids_to_tokens(
                encoding["input_ids"][0][answer_start:answer_end]
            )
        )

        return {"answer": answer}
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ torch
3
+ beautifulsoup4