philtoms committed on
Commit
42f4105
·
verified ·
1 Parent(s): 450c5c6

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +46 -29
  2. requirements.txt +3 -1
app.py CHANGED
@@ -1,28 +1,38 @@
1
-
2
  import gradio as gr
3
  import time
4
- from sentence_transformers import SentenceTransformer, util
5
  import os
6
  import json
 
 
 
 
 
 
 
 
 
7
 
8
- print(os.environ)
9
- # Determine model path based on environment
10
- if "SPACE_ID" in os.environ:
11
- # Running on Hugging Face Spaces
12
- # Assumes the model is in a repository with the same name as the space
13
- model_path ='https://huggingface.co/philtoms/minilm-alice-base-rsft-v1'
14
- print(f"Running on HF Spaces. Using model: {model_path}")
15
  else:
16
- # Running locally
17
- model_path = "../models/minilm-alice-base-rsft-v1/final"
18
- print(f"Running locally. Using model: {model_path}")
 
19
 
20
- # Load the model
21
- model = SentenceTransformer(model_path)
 
 
 
 
22
 
23
- # Load the dataset
24
- # Adjust the data path for local vs. HF environment
25
- data_path = "alice_pairs.jsonl" if "SPACE_ID" in os.environ else "../data/alice_pairs.jsonl"
26
 
27
  dataset = []
28
  with open(data_path, "r") as f:
@@ -30,21 +40,28 @@ with open(data_path, "r") as f:
30
  dataset.append(json.loads(line))
31
 
32
  corpus = [item["passage"] for item in dataset]
33
- corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
 
 
 
 
34
 
35
  def find_similar(prompt, top_k):
36
  start_time = time.time()
37
-
38
- prompt_embedding = model.encode(prompt, convert_to_tensor=True)
39
- cos_scores = util.cos_sim(prompt_embedding, corpus_embeddings)[0]
40
- top_results = cos_scores.topk(k=int(top_k))
41
-
 
 
 
42
  end_time = time.time()
43
-
44
  results = []
45
- for score, idx in zip(top_results[0], top_results[1]):
46
  results.append((corpus[idx], score.item()))
47
-
48
  return results, f"{(end_time - start_time) * 1000:.2f} ms"
49
 
50
  iface = gr.Interface(
@@ -57,9 +74,9 @@ iface = gr.Interface(
57
  gr.Dataframe(headers=["Response", "Score"]),
58
  gr.Textbox(label="Time Taken")
59
  ],
60
- title="RSFT Alice embeddings",
61
- description="Enter a prompt and get the most similar sentences from the corpus."
62
  )
63
 
64
  if __name__ == "__main__":
65
- iface.launch()
 
 
1
  import gradio as gr
2
  import time
 
3
  import os
4
  import json
5
+ import torch
6
+ from transformers import AutoTokenizer, AutoModel
7
+
# --- Path Configuration ---
# Directory holding this script; local model/data paths are resolved against it.
script_dir = os.path.dirname(os.path.abspath(__file__))

# The presence of SPACE_ID in the environment is used as the Spaces marker.
is_hf_space = "SPACE_ID" in os.environ

if not is_hf_space:
    # Local run: build paths relative to the script's own directory.
    model_path = os.path.join(script_dir, "..", "models", "minilm-alice-base-rsft-v1", "final")
    data_path = os.path.join(script_dir, "..", "data", "alice_pairs.jsonl")
    print(f"Running locally. Using local model at: {model_path}")
else:
    # Space run: model id comes from MODEL_REPO_ID (with a default), data
    # file is addressed relative to the working directory.
    model_path = os.environ.get("MODEL_REPO_ID", "philtoms/minilm-alice-base-rsft-v1")
    data_path = "alice_pairs.jsonl"
    print(f"Running on HF Spaces. Using model from Hub: {model_path}")

# --- Model and Tokenizer Loading ---
# Any loading failure is re-raised as gr.Error carrying the offending path.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModel.from_pretrained(model_path)
except Exception as e:
    raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}")
32
 
33
+ # --- Dataset Loading ---
34
+ if not os.path.exists(data_path):
35
+ raise gr.Error(f"Data file not found at '{data_path}'. Please ensure the file exists.")
36
 
37
  dataset = []
38
  with open(data_path, "r") as f:
 
40
  dataset.append(json.loads(line))
41
 
42
  corpus = [item["passage"] for item in dataset]
43
+
# Pre-compute corpus embeddings.
# The whole corpus is tokenized as one padded batch, so a plain mean over the
# sequence axis would average in the hidden states of padding tokens and make
# each passage's embedding depend on how much padding it received. Weighting by
# the attention mask restricts the mean to real tokens only.
with torch.no_grad():
    encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
    corpus_hidden = model(**encoded_corpus).last_hidden_state
    # Broadcast the mask over the hidden dimension: 1.0 for real tokens, 0.0 for padding.
    corpus_mask = encoded_corpus["attention_mask"].unsqueeze(-1).to(corpus_hidden.dtype)
    # clamp() guards against division by zero for a row with no real tokens.
    corpus_embeddings = (corpus_hidden * corpus_mask).sum(dim=1) / corpus_mask.sum(dim=1).clamp(min=1e-9)
48
 
def find_similar(prompt, top_k):
    """Return the corpus passages most similar to ``prompt``.

    Args:
        prompt: Query text to embed and compare against the corpus.
        top_k: Number of matches requested. It is coerced with ``int()`` and
            clamped to the range ``[0, number of corpus embeddings]`` so an
            oversized or negative request cannot make ``torch.topk`` raise.

    Returns:
        A ``(results, elapsed)`` tuple. ``results`` is a list of
        ``(passage, cosine_score)`` tuples in the order returned by
        ``torch.topk`` (highest score first); ``elapsed`` is the time spent
        embedding and searching, formatted as ``"<milliseconds> ms"``.
    """
    # perf_counter is monotonic, so the interval cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    with torch.no_grad():
        encoded_prompt = tokenizer(prompt, padding=True, truncation=True, return_tensors='pt')
        prompt_embedding = model(**encoded_prompt).last_hidden_state.mean(dim=1)

    cos_scores = torch.nn.functional.cosine_similarity(prompt_embedding, corpus_embeddings, dim=1)

    # torch.topk raises if k is negative or larger than the number of scores.
    k = max(0, min(int(top_k), cos_scores.shape[0]))
    top_results = torch.topk(cos_scores, k=k)

    end_time = time.perf_counter()

    # tolist() yields plain Python floats/ints, so the list is indexed with
    # ordinary ints rather than 0-dim tensors.
    results = [
        (corpus[idx], score)
        for score, idx in zip(top_results.values.tolist(), top_results.indices.tolist())
    ]

    return results, f"{(end_time - start_time) * 1000:.2f} ms"
66
 
67
  iface = gr.Interface(
 
74
  gr.Dataframe(headers=["Response", "Score"]),
75
  gr.Textbox(label="Time Taken")
76
  ],
77
+ title="RSFT Alice Embeddings (Transformers)",
78
+ description=f"Enter a prompt to find similar sentences from the corpus."
79
  )
80
 
81
  if __name__ == "__main__":
82
+ iface.launch()
requirements.txt CHANGED
@@ -1,2 +1,4 @@
1
  gradio
2
- sentence-transformers
 
 
 
1
  gradio
2
+ transformers
3
+ torch
4
+ huggingface_hub