philtoms committed
Commit 520f5a2 · verified · 1 Parent(s): d698881

Upload app.py

Files changed (1):
  app.py (+103 -14)
app.py CHANGED
@@ -4,6 +4,7 @@ import os
 import json
 import torch
 from transformers import AutoTokenizer, AutoModel
+from sentence_transformers import SentenceTransformer, util

 # --- Path Configuration ---
 # Get the absolute path of the directory containing this script
@@ -25,8 +26,14 @@ else:

 # --- Model and Tokenizer Loading ---
 try:
-    tokenizer = AutoTokenizer.from_pretrained(model_path)
-    model = AutoModel.from_pretrained(model_path)
+    # model_path = "sentence-transformers/all-MiniLM-L6-v2"
+    model_path = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
+    # model_path = "Qwen/Qwen3-Embedding-0.6B"
+
+    # tokenizer = AutoTokenizer.from_pretrained(model_path)
+    # model = AutoModel.from_pretrained(model_path)
+    model = SentenceTransformer(model_path)
+
 except Exception as e:
     raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}")
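This hunk swaps the hand-rolled tokenize-and-mean-pool pipeline for sentence-transformers: encode() takes care of tokenization, the forward pass, pooling and, for the *-cos-v1 checkpoints, L2 normalization, so plain dot products behave as cosine similarities. A minimal sketch of the new loading path in isolation, assuming only that sentence-transformers is installed (the sample text is illustrative):

# Minimal sketch of the loading path selected above; sample text is illustrative.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/multi-qa-mpnet-base-cos-v1")

# encode() tokenizes, runs the transformer, mean-pools and L2-normalizes,
# returning a numpy array of shape (n_texts, 768) for this checkpoint.
embeddings = model.encode(["Alice was beginning to get very tired."])
print(embeddings.shape)  # (1, 768)

Because the embeddings come back unit-length, util.dot_score and cosine similarity give the same ranking, which is why the torch cosine_similarity call can be dropped later in this diff.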
@@ -39,28 +46,110 @@ with open(data_path, "r") as f:
     for line in f:
         dataset.append(json.loads(line))

-corpus = [item["positive"] for item in dataset]
+# Pre-compute corpus embeddings
+import re
+
+def split_into_sentences(text):
+    """Splits a paragraph into sentences based on capitalization and punctuation."""
+    # This regex looks for a capital letter, followed by anything that's not a period,
+    # exclamation mark, or question mark, and then ends with one of those punctuation marks.
+    sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
+    return sentences
+
+def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
+    chunked_corpus = []
+    for doc_idx, doc_text in enumerate(corpus_documents):
+        sentences = split_into_sentences(doc_text)
+        if not sentences:
+            continue
+
+        # If there are fewer sentences than chunk_size, just use the whole document as one chunk
+        if len(sentences) < chunk_size:
+            chunked_corpus.append({
+                "text": doc_text,
+                "original_doc_idx": doc_idx,
+                "start_sentence_idx": 0,
+                "end_sentence_idx": len(sentences) - 1
+            })
+            continue
+
+        for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
+            chunk_sentences = sentences[i : i + chunk_size]
+            chunk_text = " ".join(chunk_sentences)
+            chunked_corpus.append({
+                "text": chunk_text,
+                "original_doc_idx": doc_idx,
+                "start_sentence_idx": i,
+                "end_sentence_idx": i + chunk_size - 1
+            })
+    return chunked_corpus
+
+def process_documents_for_chunking(documents):
+    chunked_corpus_data = create_overlapped_chunks(documents)
+    flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
+    return chunked_corpus_data, flat_corpus_chunks

 # Pre-compute corpus embeddings
-with torch.no_grad():
-    encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
-    corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)
+original_corpus = [item["positive"] for item in dataset]
+# chunked_corpus_data, flat_corpus_chunks = process_documents_for_chunking(original_corpus)
+# corpus_embeddings = model.encode(flat_corpus_chunks)
+corpus_embeddings = model.encode(original_corpus)
+
+# def find_similar(prompt, top_k):
+#     start_time = time.time()
+
+#     prompt_embedding = model.encode(prompt)
+#     scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
+
+#     # Pair scores with the chunked corpus data
+#     scored_chunks = []
+#     for i, score in enumerate(scores):
+#         scored_chunks.append({
+#             "score": score,
+#             "text": chunked_corpus_data[i]["text"],
+#             "original_doc_idx": chunked_corpus_data[i]["original_doc_idx"]
+#         })
+
+#     # Sort by decreasing score
+#     scored_chunks = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)
+
+#     results = []
+#     for item in scored_chunks[:top_k]:
+#         # Return the original document text, not just the chunk
+#         original_doc_text = original_corpus[item["original_doc_idx"]]
+#         results.append((item["score"], original_doc_text))
+
+#     end_time = time.time()
+
+#     return results, f"{(end_time - start_time) * 1000:.2f} ms"
+
+# with torch.no_grad():
+#     encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
+#     corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)

 def find_similar(prompt, top_k):
     start_time = time.time()

-    with torch.no_grad():
-        encoded_prompt = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt')
-        prompt_embedding = model(**encoded_prompt).last_hidden_state.mean(dim=1)
+    prompt_embedding = model.encode(prompt)
+    scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()
+    doc_score_pairs = list(zip(original_corpus, scores))
+
+    # Sort by decreasing score
+    doc_score_pairs = sorted(doc_score_pairs, key=lambda x: x[1], reverse=True)
+
+    # with torch.no_grad():
+    #     encoded_prompt = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt')
+    #     prompt_embedding = model(**encoded_prompt).last_hidden_state.mean(dim=1)

-    cos_scores = torch.nn.functional.cosine_similarity(prompt_embedding, corpus_embeddings, dim=1)
-    top_results = torch.topk(cos_scores, k=int(top_k))
+    # cos_scores = torch.nn.functional.cosine_similarity(prompt_embedding, corpus_embeddings, dim=1)
+    # top_results = torch.topk(cos_scores, k=int(top_k))

     end_time = time.time()

     results = []
-    for score, idx in zip(top_results.values, top_results.indices):
-        results.append((corpus[idx], score.item()))
+    # for doc, score in doc_score_pairs[:top_k]:
+    for doc, score in doc_score_pairs:
+        results.append((score, doc))

     return results, f"{(end_time - start_time) * 1000:.2f} ms"

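The chunking helpers in this hunk are added but not yet wired in: the process_documents_for_chunking call and the chunk-based find_similar both remain commented out. Note also that the regex splitter only captures text that starts at a capital letter and runs to the next '.', '!' or '?', so sentence openings such as quotes or digits are trimmed or skipped. A quick illustrative run of the helpers, on invented sample text:

# Illustrative run of the helpers defined in the diff above (sample text invented).
text = "Alice was tired. She peeped into the book. It had no pictures."
print(split_into_sentences(text))
# ['Alice was tired.', 'She peeped into the book.', 'It had no pictures.']

# chunk_size=2, overlap=1 -> two-sentence windows sliding one sentence at a time.
for chunk in create_overlapped_chunks([text]):
    print(chunk["start_sentence_idx"], "|", chunk["text"])
# 0 | Alice was tired. She peeped into the book.
# 1 | She peeped into the book. It had no pictures.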
@@ -71,7 +160,7 @@ iface = gr.Interface(
         gr.Slider(1, 20, value=5, step=1, label="Top K")
     ],
     outputs=[
-        gr.Dataframe(headers=["Response", "Score"]),
+        gr.Dataframe(headers=["Score", "Response"]),
         gr.Textbox(label="Time Taken")
     ],
     title="RSFT Alice Embeddings (Transformers)",