Spaces:

philtoms
/

minilm-alice-base-rsft-v1

Sleeping

App Files Files Community

minilm-alice-base-rsft-v1 / app.py

philtoms

Upload app.py

380225c verified 5 months ago

raw

history blame contribute delete

6.67 kB

	import gradio as gr
	import time
	import os
	import json
	import torch
	from transformers import AutoTokenizer, AutoModel
	from sentence_transformers import SentenceTransformer, util

	# --- Path Configuration ---
	# Directory that contains this script; local paths below are resolved against it.
	script_dir = os.path.dirname(os.path.abspath(__file__))

	# The presence of the SPACE_ID environment variable is used as the marker
	# for running inside a Hugging Face Space.
	is_hf_space = "SPACE_ID" in os.environ

	if not is_hf_space:
	    # Local run: model and data live in sibling directories of the script's folder.
	    model_path = os.path.join(script_dir, "..", "models", "minilm-alice-base-rsft-v2", "final")
	    data_path = os.path.join(script_dir, "..", "data", "training_triplets.jsonl")
	    print(f"Running locally. Using local model at: {model_path}")
	else:
	    # Space run: the model id comes from MODEL_REPO_ID (with a default) and the
	    # data file is referenced by a path relative to the working directory.
	    model_path = os.getenv("MODEL_REPO_ID", "philtoms/minilm-alice-base-rsft-v2")
	    data_path = "training_triplets.jsonl"
	    print(f"Running on HF Spaces. Using model from Hub: {model_path}")

	# --- Model and Tokenizer Loading ---
	# NOTE(review): the assignment below replaces the model_path selected in the
	# path-configuration section, so MODEL_REPO_ID and the local checkpoint path
	# are currently ignored. Confirm whether this override is intentional.
	# Other candidate model ids kept for reference:
	#   "sentence-transformers/all-MiniLM-L6-v2"
	#   "Qwen/Qwen3-Embedding-0.6B"
	model_path = "sentence-transformers/multi-qa-mpnet-base-cos-v1"
	# The earlier startup message may name a different path, so log the one actually used.
	print(f"Loading embedding model: {model_path}")

	try:
	    # Only the load itself can fail, so it is the only statement guarded.
	    model = SentenceTransformer(model_path)
	except Exception as e:
	    # Chain explicitly so the underlying loader failure stays in the traceback.
	    raise gr.Error(f"Failed to load model from '{model_path}'. Error: {e}") from e

	# --- Dataset Loading ---
	# Fail fast with a readable message when the data file is missing.
	if not os.path.exists(data_path):
	    raise gr.Error(f"Data file not found at '{data_path}'. Please ensure the file exists.")

	# The file is read as JSON Lines: one JSON object per line.
	# - encoding is pinned to UTF-8 so decoding does not depend on the platform default.
	# - whitespace-only lines (e.g. a trailing blank line) are skipped instead of
	#   crashing json.loads.
	with open(data_path, "r", encoding="utf-8") as f:
	    dataset = [json.loads(line) for line in f if line.strip()]

	# Pre-compute corpus embeddings
	import re

	# def split_into_sentences(text):
	# """Splits a paragraph into sentences based on capitalization and punctuation."""
	# # This regex looks for a capital letter, followed by anything that's not a period,
	# # exclamation mark, or question mark, and then ends with one of those punctuation marks.
	# sentences = re.findall(r'([A-Z][^.!?]*[.!?])', text)
	# return sentences

	# def create_overlapped_chunks(corpus_documents, chunk_size=2, overlap=1):
	# chunked_corpus = []
	# for doc_idx, doc_text in enumerate(corpus_documents):
	# sentences = split_into_sentences(doc_text)
	# if not sentences:
	# continue

	# # If there are fewer sentences than chunk_size, just use the whole document as one chunk
	# if len(sentences) < chunk_size:
	# chunked_corpus.append({
	# "text": doc_text,
	# "original_doc_idx": doc_idx,
	# "start_sentence_idx": 0,
	# "end_sentence_idx": len(sentences) - 1
	# })
	# continue

	# for i in range(0, len(sentences) - chunk_size + 1, chunk_size - overlap):
	# chunk_sentences = sentences[i : i + chunk_size]
	# chunk_text = " ".join(chunk_sentences)
	# chunked_corpus.append({
	# "text": chunk_text,
	# "original_doc_idx": doc_idx,
	# "start_sentence_idx": i,
	# "end_sentence_idx": i + chunk_size - 1
	# })
	# return chunked_corpus

	# def process_documents_for_chunking(documents):
	# chunked_corpus_data = create_overlapped_chunks(documents)
	# flat_corpus_chunks = [item["text"] for item in chunked_corpus_data]
	# return chunked_corpus_data, flat_corpus_chunks

	# Encode the "positive" text of every dataset record once, at module load,
	# so that each query only has to encode the prompt.
	original_corpus = [record["positive"] for record in dataset]
	corpus_embeddings = model.encode(original_corpus)

	# def find_similar(prompt, top_k):
	# start_time = time.time()

	# prompt_embedding = model.encode(prompt)
	# scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()

	# # Pair scores with the chunked corpus data
	# scored_chunks = []
	# for i, score in enumerate(scores):
	# scored_chunks.append({
	# "score": score,
	# "text": chunked_corpus_data[i]["text"],
	# "original_doc_idx": chunked_corpus_data[i]["original_doc_idx"]
	# })

	# # Sort by decreasing score
	# scored_chunks = sorted(scored_chunks, key=lambda x: x["score"], reverse=True)

	# results = []
	# for item in scored_chunks[:top_k]:
	# # Return the original document text, not just the chunk
	# original_doc_text = original_corpus[item["original_doc_idx"]]
	# results.append((item["score"], original_doc_text))

	# end_time = time.time()

	# return results, f"{(end_time - start_time) * 1000:.2f} ms"

	# with torch.no_grad():
	# encoded_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors='pt')
	# corpus_embeddings = model(**encoded_corpus).last_hidden_state.mean(dim=1)

	def find_similar(prompt, top_k):
	    """Rank the corpus passages by dot-product similarity to *prompt*.

	    Args:
	        prompt: Query text (a preset dropdown choice or a custom value).
	        top_k: Maximum number of results to return. Coerced to ``int``
	            because a UI slider may deliver the value as a float.

	    Returns:
	        A tuple ``(results, elapsed)``: ``results`` is a list of
	        ``(score, passage)`` tuples sorted by decreasing score, and
	        ``elapsed`` is the encode-and-rank time formatted like ``"12.34 ms"``.

	    Raises:
	        gr.Error: If no prompt was selected or the prompt is blank.
	    """
	    # An empty dropdown would otherwise be passed straight to model.encode.
	    if prompt is None or not str(prompt).strip():
	        raise gr.Error("Please select or enter a prompt.")

	    # Slice bounds must be integers.
	    top_k = int(top_k)

	    # perf_counter is monotonic, unlike wall-clock time.time().
	    started = time.perf_counter()

	    prompt_embedding = model.encode(prompt)
	    scores = util.dot_score(prompt_embedding, corpus_embeddings)[0].cpu().tolist()

	    # Pair each score with its passage and order by decreasing score.
	    # sorted() is stable, so ties keep corpus order.
	    ranked = sorted(
	        zip(scores, original_corpus),
	        key=lambda pair: pair[0],
	        reverse=True,
	    )

	    elapsed_ms = (time.perf_counter() - started) * 1000

	    return ranked[:top_k], f"{elapsed_ms:.2f} ms"

	# Gradio UI: a prompt (preset or free text) and a result count go in;
	# a score/response table and the elapsed time come out.
	iface = gr.Interface(
	    fn=find_similar,
	    inputs=[
	        gr.Dropdown(
	            ["Alice sees White rabbit for the first time", "Alice meets caterpillar", "sad turtle story"],
	            label="Select a prompt or type your own",
	            allow_custom_value=True,  # lets the user type a prompt not in the list
	        ),
	        gr.Slider(1, 20, value=5, step=1, label="Top K"),
	    ],
	    outputs=[
	        gr.Dataframe(headers=["Score", "Response"]),
	        gr.Textbox(label="Time Taken"),
	    ],
	    title="RSFT Alice Embeddings (Transformers)",
	    # Plain string: the text has no placeholders, so no f-prefix is needed.
	    description="Enter a prompt to find similar sentences from the corpus.",
	)

	# Start the Gradio server only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()