Spaces:
Sleeping
Sleeping
Upload app.py
Browse files
app.py
CHANGED
|
@@ -4,6 +4,7 @@ import os
|
|
| 4 |
import json
|
| 5 |
import torch
|
| 6 |
from transformers import AutoTokenizer, AutoModel
|
|
|
|
| 7 |
|
| 8 |
# --- Path Configuration ---
|
| 9 |
# Get the absolute path of the directory containing this script
|
|
@@ -25,8 +26,14 @@ else:
|
|
| 25 |
|
| 26 |
# --- Model and Tokenizer Loading ---
|
| 27 |
try:
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
except Exception as e:
|
| 31 |
raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}")
|
| 32 |
|
|
@@ -39,28 +46,110 @@ with open(data_path, "r") as f:
|
|
| 39 |
for line in f:
|
| 40 |
dataset.append(json.loads(line))
|
| 41 |
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
# Pre-compute corpus embeddings
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
def find_similar(prompt, top_k):
|
| 50 |
start_time = time.time()
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
|
| 59 |
end_time = time.time()
|
| 60 |
|
| 61 |
results = []
|
| 62 |
-
for
|
| 63 |
-
|
|
|
|
| 64 |
|
| 65 |
return results, f"{(end_time - start_time) * 1000:.2f} ms"
|
| 66 |
|
|
@@ -71,7 +160,7 @@ iface = gr.Interface(
|
|
| 71 |
gr.Slider(1, 20, value=5, step=1, label="Top K")
|
| 72 |
],
|
| 73 |
outputs=[
|
| 74 |
-
gr.Dataframe(headers=["
|
| 75 |
gr.Textbox(label="Time Taken")
|
| 76 |
],
|
| 77 |
title="RSFT Alice Embeddings (Transformers)",
|
|
|
|
| 4 |
import json
|
| 5 |
import torch
|
| 6 |
from transformers import AutoTokenizer, AutoModel
|
| 7 |
+
from sentence_transformers import SentenceTransformer, util
|
| 8 |
|
| 9 |
# --- Path Configuration ---
|
| 10 |
# Get the absolute path of the directory containing this script
|
|
|
|
| 26 |
|
| 27 |
# --- Model and Tokenizer Loading ---
|
| 28 |
# Checkpoint used for both corpus and query embeddings.
# Other candidates previously listed here:
#   "sentence-transformers/all-MiniLM-L6-v2"
#   "Qwen/Qwen3-Embedding-0.6B"
model_path = "sentence-transformers/multi-qa-mpnet-base-cos-v1"

try:
    # Only the constructor call can fail, so it is the only statement guarded.
    model = SentenceTransformer(model_path)
except Exception as e:
    # Chain the original exception so its traceback is preserved for debugging.
    raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}") from e
|
| 39 |
|
|
|
|
| 46 |
for line in f:
|
| 47 |
dataset.append(json.loads(line))
|
| 48 |
|
| 49 |
+
# --- Sentence Chunking Helpers ---
|
| 50 |
+
import re
|
| 51 |
+
|
| 52 |
+
def split_into_sentences(text):
    """Split a paragraph into sentences at terminal punctuation.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``!`` or ``?``. Unlike a "capital letter ... punctuation" match,
    this keeps sentences that start with a digit, quote or lowercase letter,
    and keeps trailing text that has no closing punctuation, so no part of
    the input is silently discarded.

    Args:
        text: The paragraph to split. ``None`` or an empty/blank string
            yields an empty list.

    Returns:
        A list of non-empty sentence strings in their original order, with
        the separating whitespace removed.
    """
    if not text:
        return []
    # The lookbehind keeps the punctuation attached to the sentence it ends;
    # requiring whitespace afterwards avoids splitting inside "3.14".
    pieces = re.split(r'(?<=[.!?])\s+', text.strip())
    return [piece for piece in pieces if piece]
|
| 58 |
+
|
| 59 |
+
def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
    """Split each document into overlapping windows of consecutive sentences.

    Sentences come from ``split_into_sentences``. Documents for which it
    returns nothing are skipped. A document with fewer than ``chunk_size``
    sentences is emitted as a single chunk holding the whole document text.

    Args:
        corpus_documents: Iterable of document strings.
        chunk_size: Number of sentences per chunk; must be at least 1.
        overlap: Number of sentences shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts, one per chunk, with the keys ``"text"``,
        ``"original_doc_idx"``, ``"start_sentence_idx"`` and
        ``"end_sentence_idx"`` (inclusive).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check a zero stride fails inside ``range`` and a negative
            stride silently produces no chunks.
    """
    if chunk_size < 1 or not 0 <= overlap < chunk_size:
        raise ValueError(
            "chunk_size must be >= 1 and overlap must satisfy "
            f"0 <= overlap < chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    stride = chunk_size - overlap
    chunked_corpus = []
    for doc_idx, doc_text in enumerate(corpus_documents):
        sentences = split_into_sentences(doc_text)
        if not sentences:
            continue

        total = len(sentences)

        # If there are fewer sentences than chunk_size, use the whole document as one chunk.
        if total < chunk_size:
            chunked_corpus.append({
                "text": doc_text,
                "original_doc_idx": doc_idx,
                "start_sentence_idx": 0,
                "end_sentence_idx": total - 1,
            })
            continue

        last_start = total - chunk_size
        starts = list(range(0, last_start + 1, stride))
        # When the stride does not land exactly on the final window, add that
        # window so the trailing sentences are not left out of every chunk.
        if starts[-1] != last_start:
            starts.append(last_start)

        for start in starts:
            chunked_corpus.append({
                "text": " ".join(sentences[start:start + chunk_size]),
                "original_doc_idx": doc_idx,
                "start_sentence_idx": start,
                "end_sentence_idx": start + chunk_size - 1,
            })
    return chunked_corpus
|
| 86 |
+
|
| 87 |
+
def process_documents_for_chunking(documents):
    """Chunk ``documents`` and also return the chunk texts as a flat list.

    Returns:
        A pair ``(chunk_records, chunk_texts)``: the list produced by
        ``create_overlapped_chunks(documents)`` and, in the same order, the
        ``"text"`` value of each of those records.
    """
    chunk_records = create_overlapped_chunks(documents)
    chunk_texts = []
    for record in chunk_records:
        chunk_texts.append(record["text"])
    return chunk_records, chunk_texts
|
| 91 |
|
| 92 |
# Pre-compute corpus embeddings
|
| 93 |
+
# Collect the "positive" passage of every dataset record and embed them all
# up front; find_similar() scores each query against these embeddings.
original_corpus = []
for record in dataset:
    original_corpus.append(record["positive"])

# Alternative kept for reference: index sentence chunks instead of whole
# passages by encoding the flat text list returned by
# process_documents_for_chunking(original_corpus).
corpus_embeddings = model.encode(original_corpus)
|
| 97 |
+
|
| 98 |
+
# def find_similar(prompt, top_k):
|
| 99 |
+
# start_time = time.time()
|
| 100 |
+
|
| 101 |
+
# prompt_embedding = model.encode(prompt)
|
| 102 |
+
# scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
|
| 103 |
+
|
| 104 |
+
# # Pair scores with the chunked corpus data
|
| 105 |
+
# scored_chunks = []
|
| 106 |
+
# for i, score in enumerate(scores):
|
| 107 |
+
# scored_chunks.append({
|
| 108 |
+
# "score": score,
|
| 109 |
+
# "text": chunked_corpus_data[i]["text"],
|
| 110 |
+
# "original_doc_idx": chunked_corpus_data[i]["original_doc_idx"]
|
| 111 |
+
# })
|
| 112 |
+
|
| 113 |
+
# # Sort by decreasing score
|
| 114 |
+
# scored_chunks = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)
|
| 115 |
+
|
| 116 |
+
# results = []
|
| 117 |
+
# for item in scored_chunks[:top_k]:
|
| 118 |
+
# # Return the original document text, not just the chunk
|
| 119 |
+
# original_doc_text = original_corpus[item["original_doc_idx"]]
|
| 120 |
+
# results.append((item["score"], original_doc_text))
|
| 121 |
+
|
| 122 |
+
# end_time = time.time()
|
| 123 |
+
|
| 124 |
+
# return results, f"{(end_time - start_time) * 1000:.2f} ms"
|
| 125 |
+
|
| 126 |
+
# with torch.no_grad():
|
| 127 |
+
# encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
|
| 128 |
+
# corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)
|
| 129 |
|
| 130 |
def find_similar(prompt, top_k):
    """Return the corpus passages most similar to ``prompt``.

    The prompt is embedded with the module-level ``model`` and compared to
    the precomputed ``corpus_embeddings`` with ``util.dot_score``.

    Args:
        prompt: Query text.
        top_k: Maximum number of passages to return. Coerced with ``int()``
            so a float value (e.g. from a slider widget) can be used as a
            slice bound; negative values are treated as zero.

    Returns:
        A pair ``(results, elapsed)``. ``results`` is a list of
        ``(score, passage)`` tuples in decreasing score order, at most
        ``top_k`` long. ``elapsed`` is the time spent embedding, scoring and
        sorting, formatted in milliseconds (e.g. ``"12.34 ms"``).
    """
    limit = max(int(top_k), 0)

    # perf_counter is monotonic, so the elapsed time cannot go negative if
    # the wall clock is adjusted mid-request.
    start_time = time.perf_counter()

    prompt_embedding = model.encode(prompt)
    scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()

    # Sort by decreasing score; the sort is stable, so ties keep corpus order.
    ranked = sorted(
        zip(scores, original_corpus),
        key=lambda pair: pair[0],
        reverse=True,
    )

    end_time = time.perf_counter()

    # Honour the requested result count instead of returning the whole corpus.
    results = ranked[:limit]

    return results, f"{(end_time - start_time) * 1000:.2f} ms"
|
| 155 |
|
|
|
|
| 160 |
gr.Slider(1, 20, value=5, step=1, label="Top K")
|
| 161 |
],
|
| 162 |
outputs=[
|
| 163 |
+
gr.Dataframe(headers=[ "Score", "Response"]),
|
| 164 |
gr.Textbox(label="Time Taken")
|
| 165 |
],
|
| 166 |
title="RSFT Alice Embeddings (Transformers)",
|