junqiu-dev committed
Commit 4f35bdd · 1 Parent(s): 8ccc65f

Remove Usage section from README

Files changed (1)
  1. README.md +0 -143
README.md CHANGED
@@ -19,149 +19,6 @@ pipeline_tag: text-classification
 
 The OpenSearch semantic highlighter is a trained classifier that takes a document and query as input and returns a binary score for each sentence in the document indicating its relevance to the query.
 
-## Usage
-
-The model is intended to be used within the OpenSearch cluster. However, for illustrative purposes, we include an example of how it can be used outside of OpenSearch:
-
-```python
-import nltk
-import torch
-import numpy as np
-from datasets import Dataset
-from functools import partial
-from torch.utils.data import DataLoader
-from dataclasses import dataclass, field
-from typing import Any, Dict, List, Union
-from torch.nn.utils.rnn import pad_sequence
-from transformers import AutoTokenizer
-from highlighter_model_tracing import TraceableBertTaggerForSentenceExtractionWithBackoff
-
-@dataclass
-class DataCollatorWithPadding:
-    pad_kvs: Dict[str, Union[int, float]] = field(default_factory=dict)
-
-    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
-        first = features[0]
-        batch = {}
-
-        # pad and collate keys in self.pad_kvs
-        for key, pad_value in self.pad_kvs.items():
-            if key in first and first[key] is not None:
-                batch[key] = pad_sequence(
-                    [torch.tensor(f[key]) for f in features],
-                    batch_first=True,
-                    padding_value=pad_value,
-                )
-
-        # collate remaining keys assuming that the values can be stacked
-        for k, v in first.items():
-            if k not in self.pad_kvs and v is not None and isinstance(v, torch.Tensor):
-                batch[k] = torch.stack([f[k] for f in features])
-
-        return batch
-
-
-def prepare_input_features(
-    tokenizer, examples, max_seq_length=512, stride=128, padding=False
-):
-
-    # jointly tokenize questions and context
-    tokenized_examples = tokenizer(
-        examples["question"],
-        examples["context"],
-        truncation="only_second",
-        max_length=max_seq_length,
-        stride=stride,
-        return_overflowing_tokens=True,
-        padding=padding,
-        is_split_into_words=True,
-    )
-
-    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
-    tokenized_examples["example_id"] = []
-    tokenized_examples["word_ids"] = []
-    tokenized_examples["sentence_ids"] = []
-
-    # process model inputs
-    for i, sample_index in enumerate(sample_mapping):
-        word_ids = tokenized_examples.word_ids(i)
-        word_level_sentence_ids = examples["word_level_sentence_ids"][sample_index]
-
-        sequence_ids = tokenized_examples.sequence_ids(i)
-        token_start_index = 0
-        while sequence_ids[token_start_index] != 1:
-            token_start_index += 1
-
-        sentences_ids = [-100] * token_start_index
-        for word_idx in word_ids[token_start_index:]:
-            if word_idx is not None:
-                sentences_ids.append(word_level_sentence_ids[word_idx])
-            else:
-                sentences_ids.append(-100)
-
-        tokenized_examples["sentence_ids"].append(sentences_ids)
-        tokenized_examples["example_id"].append(examples["id"][sample_index])
-        tokenized_examples["word_ids"].append(word_ids)
-
-    return tokenized_examples
-
-
-# example highlighting case, from OpenSearch documentation
-query = "When does OpenSearch use text reanalysis for highlighting?"
-document = "To highlight the search terms, the highlighter needs the start and end character offsets of each term. The offsets mark the term's position in the original text. The highlighter can obtain the offsets from the following sources: Postings: When documents are indexed, OpenSearch creates an inverted search index—a core data structure used to search for documents. Postings represent the inverted search index and store the mapping of each analyzed term to the list of documents in which it occurs. If you set the index_options parameter to offsets when mapping a text field, OpenSearch adds each term's start and end character offsets to the inverted index. During highlighting, the highlighter reruns the original query directly on the postings to locate each term. Thus, storing offsets makes highlighting more efficient for large fields because it does not require reanalyzing the text. Storing term offsets requires additional disk space, but uses less disk space than storing term vectors. Text reanalysis: In the absence of both postings and term vectors, the highlighter reanalyzes text in order to highlight it. For every document and every field that needs highlighting, the highlighter creates a small in-memory index and reruns the original query through Lucene's query execution planner to access low-level match information for the current document. Reanalyzing the text works well in most use cases. However, this method is more memory and time intensive for large fields."
-
-# sentence-level parsing
-sentence_ids = []
-context = []
-document_sents = nltk.sent_tokenize(document)
-for sent_id, sent in enumerate(document_sents):
-    sent_words = sent.split(' ')
-    context += sent_words
-    sentence_ids += [sent_id] * len(sent_words)
-
-# format example highlighting case as a Dataset
-example_dataset = Dataset.from_dict({'question': [[query]],
-                                     'context': [context],
-                                     'word_level_sentence_ids': [sentence_ids],
-                                     'id': [0]})
-
-# prepare to featurize the raw text data
-base_model_id = "bert-base-uncased"
-tokenizer = AutoTokenizer.from_pretrained(base_model_id)
-collator = DataCollatorWithPadding(
-    pad_kvs={
-        "input_ids": 0,
-        "token_type_ids": 0,
-        "attention_mask": 0,
-        "sentence_ids": -100,
-        "sentence_labels": -100,
-    }
-)
-preprocess_fn = partial(prepare_input_features, tokenizer)
-
-# featurize
-example_dataset = example_dataset.map(
-    preprocess_fn,
-    batched=True,
-    remove_columns=example_dataset.column_names,
-    desc="Preparing model inputs",
-)
-loader = DataLoader(example_dataset, batch_size=1, collate_fn=collator)
-batch = next(iter(loader))
-
-# load model and get sentence highlights
-model = TraceableBertTaggerForSentenceExtractionWithBackoff.from_pretrained('opensearch-project/opensearch-semantic-highlighter-v1')
-highlights = model(
-    batch["input_ids"],
-    batch["attention_mask"],
-    batch["token_type_ids"],
-    batch["sentence_ids"],
-)
-highlighted_sentences = [document_sents[x] for x in highlights[0]]
-print(highlighted_sentences)
-```
-
-
 ## License
 
 This project is licensed under the [Apache v2.0 License](https://github.com/opensearch-project/neural-search/blob/main/LICENSE).