Spaces:
Running
on
Zero
Running
on
Zero
Liam Dyer
committed on
kerfuffles
Browse files
app.py
CHANGED
|
@@ -13,15 +13,6 @@ model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")
|
|
| 13 |
model.to(device="cuda")
|
| 14 |
|
| 15 |
|
| 16 |
-
def chunk(text, max_length=512):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    The final piece carries any remainder shorter than *max_length*.
    An empty input yields a single empty chunk, matching the original
    while-loop behavior.
    """
    if not text:
        return [text]
    return [text[start:start + max_length]
            for start in range(0, len(text), max_length)]
|
| 23 |
-
|
| 24 |
-
|
| 25 |
@spaces.GPU
|
| 26 |
def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
|
| 27 |
query_embeddings = model.encode(queries, prompt_name="query")
|
|
@@ -118,6 +109,15 @@ def convert(input_file) -> str:
|
|
| 118 |
return convert_pandoc(input_file, input_file)
|
| 119 |
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
@spaces.GPU
|
| 122 |
def predict(queries, documents, max_characters) -> list[list[str]]:
|
| 123 |
queries = queries.split("\n")
|
|
@@ -131,7 +131,7 @@ def predict(queries, documents, max_characters) -> list[list[str]]:
|
|
| 131 |
return [[doc] for doc, _ in converted_docs]
|
| 132 |
|
| 133 |
# Embed the documents in 512 character chunks
|
| 134 |
-
chunked_docs = [
|
| 135 |
embedded_docs = [embed(queries, chunks) for chunks in chunked_docs]
|
| 136 |
|
| 137 |
# Get a structure like {query: [(doc_idx, chunk_idx, score), (doc_idx, chunk_idx, score), ...]}
|
|
|
|
| 13 |
model.to(device="cuda")
|
| 14 |
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
@spaces.GPU
|
| 17 |
def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
|
| 18 |
query_embeddings = model.encode(queries, prompt_name="query")
|
|
|
|
| 109 |
return convert_pandoc(input_file, input_file)
|
| 110 |
|
| 111 |
|
| 112 |
+
def chunk_to_length(text, max_length=512):
    """Split *text* into consecutive pieces of at most *max_length* characters.

    The final piece carries any remainder shorter than *max_length*.
    An empty input yields a single empty chunk, matching the original
    while-loop behavior.
    """
    if not text:
        return [text]
    return [text[start:start + max_length]
            for start in range(0, len(text), max_length)]
|
| 119 |
+
|
| 120 |
+
|
| 121 |
@spaces.GPU
|
| 122 |
def predict(queries, documents, max_characters) -> list[list[str]]:
|
| 123 |
queries = queries.split("\n")
|
|
|
|
| 131 |
return [[doc] for doc, _ in converted_docs]
|
| 132 |
|
| 133 |
# Embed the documents in 512 character chunks
|
| 134 |
+
chunked_docs = [chunk_to_length(doc, 512) for doc in converted_docs]
|
| 135 |
embedded_docs = [embed(queries, chunks) for chunks in chunked_docs]
|
| 136 |
|
| 137 |
# Get a structure like {query: [(doc_idx, chunk_idx, score), (doc_idx, chunk_idx, score), ...]}
|