# Hugging Face Space app by stevenbucaille — commit c56dc56 ("fix")
import gradio as gr
import spaces
from sentence_transformers import SentenceTransformer
import pandas as pd
from datasets import Dataset, load_dataset
import os
import json
import torch
# Global model cache to avoid reloading same model repeatedly
MODELS = {}
# Hub dataset repo that holds the rows to embed and receives checkpoint pushes
REPO_ID = "stevenbucaille/semantic-transformers"
def get_model(model_name):
    """Return a cached SentenceTransformer for *model_name*, loading it on first use."""
    model = MODELS.get(model_name)
    if model is None:
        print(f"Loading model: {model_name}")
        model = SentenceTransformer(model_name, trust_remote_code=True)
        MODELS[model_name] = model
    return model
@spaces.GPU(size="xlarge", duration=120)
def encode_batch_gpu(texts, model_name):
    """
    GPU-accelerated function to encode a list of texts.

    Args:
        texts: list of strings to embed.
        model_name: identifier of the SentenceTransformer model to use.

    Returns:
        numpy array of embeddings, one row per input string.
    """
    print(f"Encoding batch of {len(texts)} items with {model_name}...")
    model = get_model(model_name)
    # Large internal batches only pay off on CUDA; stay tiny on CPU.
    per_call_batch = 512 if model.device.type == "cuda" else 4
    return model.encode(
        texts,
        batch_size=per_call_batch,
        show_progress_bar=True,
        convert_to_numpy=True,
    )
def process_dataset(model_name, progress=gr.Progress()):
    """Embed every unprocessed row of the Hub dataset with *model_name*.

    Loads the dataset at REPO_ID, finds rows whose "embedding" column is
    null, encodes them in fixed-size chunks via the GPU helper, and
    checkpoints after every chunk (local parquet + push to the Hub) so an
    interrupted/timed-out run can be resumed by clicking Generate again.

    Args:
        model_name: SentenceTransformer model identifier used for encoding.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        Tuple of (output_file, status_message). ``output_file`` is the path
        to the locally saved parquet (or None on a fatal error);
        ``status_message`` is a human-readable log for the status textbox.
    """
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        return None, "Error: HF_TOKEN environment variable is not set."
    try:
        # 1. Load Data
        progress(0.1, desc="Loading Dataset from Hub...")
        print(f"Loading dataset {REPO_ID}...")
        try:
            ds = load_dataset(REPO_ID, split="train")
            df = ds.to_pandas()
        except Exception as e:
            return None, f"Error loading dataset: {e}. Make sure it exists."
        print(f"Loaded {len(df)} rows.")

        # 2. Check/Init Embeddings
        if "embedding" not in df.columns:
            print("Initializing embedding column...")
            df["embedding"] = None
            df["embedding_model"] = None
        # Ensure embedding column allows objects (arrays) or None
        if df["embedding"].dtype != "object":
            df["embedding"] = df["embedding"].astype("object")

        # 3. Find Unprocessed — rows where embedding is still null
        unprocessed_mask = df["embedding"].isnull()
        unprocessed_indices = df[unprocessed_mask].index.tolist()
        total_unprocessed = len(unprocessed_indices)
        print(f"Total unprocessed rows: {total_unprocessed}")
        if total_unprocessed == 0:
            return None, "Dataset is already fully embedded!"

        # 4. Processing Loop
        # Chunked so a GPU timeout loses at most one chunk; 5k chosen to
        # stay comfortably within the 120s @spaces.GPU limit.
        CHUNK_SIZE = 5000
        processed_count = 0
        error_occurred = False
        progress(0.2, desc=f"Starting processing of {total_unprocessed} rows...")
        for i in range(0, total_unprocessed, CHUNK_SIZE):
            batch_indices = unprocessed_indices[i : i + CHUNK_SIZE]
            batch_texts = df.loc[batch_indices, "content"].tolist()
            current_progress = 0.2 + 0.7 * (i / total_unprocessed)
            progress(
                current_progress, desc=f"Encoding batch {i}/{total_unprocessed}..."
            )
            try:
                # Call GPU function — protected by the @spaces.GPU timeout
                embeddings = encode_batch_gpu(batch_texts, model_name)
                # Assign row by row: slice-assigning a list of arrays raises
                # "Must have equal len keys and value" in pandas.
                for idx, emb in zip(batch_indices, embeddings):
                    df.at[idx, "embedding"] = emb
                    df.at[idx, "embedding_model"] = model_name
                processed_count += len(batch_indices)

                # --- Checkpoint Saving ---
                print(
                    f"Batch completed. Saving checkpoint for {processed_count} processed rows..."
                )
                # Save locally first (fast)
                df.to_parquet("embeddings_checkpoint.parquet")
                # Push to Hub (slower but persistent across machines)
                if hf_token and REPO_ID:
                    try:
                        # Creating a new dataset every time applies memory
                        # pressure but keeps the Hub copy up to date.
                        temp_ds = Dataset.from_pandas(df)
                        temp_ds.push_to_hub(REPO_ID, token=hf_token)
                        print("Checkpoint pushed to Hub.")
                        del temp_ds
                    except Exception as hub_err:
                        print(f"Warning: Failed to push checkpoint to Hub: {hub_err}")
            except Exception as e:
                print(f"Error during GPU encoding batch {i}: {e}")
                error_occurred = True
                # Stop processing but proceed to save what we have
                break

        # 5. Save & Push
        progress(0.95, desc="Saving progress to Hub...")
        output_msg = f"Processed {processed_count} rows out of {total_unprocessed}.\n"
        if error_occurred:
            output_msg += "⚠️ Run interrupted (timeout/error). Saving progress...\n"
            output_msg += "Please click 'Generate' again to continue."
        else:
            output_msg += "✅ All batches completed successfully."
        try:
            # Convert back to Dataset
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(REPO_ID, token=hf_token)
            output_msg += f"\nDataset saved to {REPO_ID}"
        except Exception as e:
            output_msg += f"\n❌ Error saving to Hub: {e}"

        # Optional: Save parquet locally too. Best-effort by design, but a
        # bare `except:` also swallowed KeyboardInterrupt/SystemExit —
        # narrow to Exception and log instead of hiding the failure.
        output_file = "embeddings_partial.parquet"
        try:
            df.to_parquet(output_file)
        except Exception as e:
            print(f"Warning: could not save local parquet {output_file}: {e}")
        return output_file, output_msg
    except Exception as e:
        import traceback

        traceback.print_exc()
        return None, f"Global Error: {str(e)}"
# UI — single-page app: pick a model, kick off (or resume) embedding,
# download the resulting parquet and watch the status log.
with gr.Blocks(title="Code Embedding Generator") as demo:
    gr.Markdown("# 🚀 ZeroGPU Code Embedding Generator")
    gr.Markdown(
        f"Generates embeddings for **{REPO_ID}**. <br>"
        # Fixed typo: "successfull" -> "successful"
        "If the process times out, successful batches are saved. **Run again to resume.**"
    )
    with gr.Row():
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=[
                    "Snowflake/snowflake-arctic-embed-m",
                    "BAAI/bge-m3",
                    "sentence-transformers/all-MiniLM-L6-v2",
                ],
                value="Snowflake/snowflake-arctic-embed-m",
                label="Embedding Model",
            )
            submit_btn = gr.Button("Generate Embeddings (Resume)", variant="primary")
        with gr.Column():
            output_file = gr.File(label="Download Parquet (Partial/Full)")
            status_output = gr.Textbox(label="Status Log", lines=10)
    submit_btn.click(
        fn=process_dataset,
        inputs=[model_selector],
        outputs=[output_file, status_output],
    )

if __name__ == "__main__":
    demo.launch()