Update app.py
app.py CHANGED
@@ -13,8 +13,32 @@ class NumpyEncoder(json.JSONEncoder):
         return json.JSONEncoder.default(self, obj)
 
 def text_to_embedding(text):
-
-
+    # Tokenize the input text
+    tokens = model.tokenize(text)
+
+    # Check if the token count exceeds the model's maximum sequence length
+    if len(tokens) > model.max_seq_length:
+
+        # Split the input text into chunks
+        chunks = []
+        for i in range(0, len(tokens), model.max_seq_length):
+            chunk = tokens[i:i + model.max_seq_length]
+            chunks.append(model.tokenizer.convert_tokens_to_string(chunk))
+
+        # Encode each chunk and store the embeddings
+        embeddings = []
+        for chunk in chunks:
+            embedding = model.encode(chunk)
+            embeddings.append(embedding)
+
+        # Calculate the average embedding
+        avg_embedding = np.mean(embeddings, axis=0)
+
+    else:
+        # If the token count is within the limit, just encode the input text
+        avg_embedding = model.encode(text)
+
+    return json.dumps(avg_embedding, cls=NumpyEncoder)
 
 inputs = gr.inputs.Textbox(default="Type text here.")
 outputs = gr.outputs.Textbox()
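The hunk touches only text_to_embedding and the two Textbox definitions. For orientation, below is a minimal sketch of the rest of app.py that the new code appears to assume. The json/numpy/gradio usage and the NumpyEncoder fallback are visible in the diff context; the SentenceTransformer checkpoint name, the ndarray branch of NumpyEncoder, and the gr.Interface call are assumptions for illustration and are not part of this commit.

# Sketch only: the checkpoint name, the ndarray branch of NumpyEncoder, and the
# Interface wiring are assumptions; in the real file the placeholder function is
# the chunk-and-average version added in the hunk above.
import json

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed checkpoint

class NumpyEncoder(json.JSONEncoder):
    # Serialize NumPy arrays as plain lists so json.dumps can handle embeddings
    def default(self, obj):
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return json.JSONEncoder.default(self, obj)

def text_to_embedding(text):
    # Placeholder body: the commit replaces this with the chunking version above
    return json.dumps(model.encode(text), cls=NumpyEncoder)

inputs = gr.inputs.Textbox(default="Type text here.")
outputs = gr.outputs.Textbox()

# Assumed wiring: expose the function as a Gradio demo (legacy gr.inputs/gr.outputs API)
gr.Interface(fn=text_to_embedding, inputs=inputs, outputs=outputs).launch()

Averaging the per-chunk embeddings, as the new function does, is a common way to approximate a single fixed-size vector for text longer than the encoder's max_seq_length: it avoids silent truncation at the cost of blurring long-range context across chunk boundaries.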
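Since the function returns a JSON string, a caller can recover the vector with json.loads plus np.array; a quick round trip against the sketch above (the input string is arbitrary):

# Round trip: the function returns JSON; json.loads + np.array recover the vector
vec = np.array(json.loads(text_to_embedding("Type text here.")))
print(vec.shape)  # dimensionality depends on the chosen model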