Spaces:

Sensei13k
/

chatbot

Sleeping

App Files Files Community

chatbot / generate_embeddings.py

Sensei13k

Upload 18 files

02acac5 verified 8 months ago

raw

history blame contribute delete

7.12 kB

	import os
	import json
	import google.generativeai as genai
	from typing import List, Dict, Any
	import time

	# 1. Configure Gemini SDK
	# Read the key from the environment and strip surrounding whitespace: a key
	# pasted into a secrets store with a trailing newline would otherwise be
	# handed to the SDK verbatim and only fail later, on the first request.
	API_KEY = (os.getenv("GOOGLE_API_KEY") or "").strip()
	if not API_KEY:
	    # Fail fast at import time instead of on the first embedding call.
	    raise ValueError("Set GOOGLE_API_KEY in env before running.")

	genai.configure(api_key=API_KEY)  # type: ignore

	# 2. File paths
	# Every input and output file lives directly inside DATA_DIR.
	DATA_DIR = "data"

	# JSONL inputs read by the embedding step.
	PROFILE_IN, JOB_IN = (
	    os.path.join(DATA_DIR, filename)
	    for filename in ("onboarding_profiles.jsonl", "job_listings.jsonl")
	)

	# JSONL outputs written by the embedding step.
	PROFILE_OUT, JOB_OUT = (
	    os.path.join(DATA_DIR, filename)
	    for filename in ("embeddings_profiles.jsonl", "embeddings_jobs.jsonl")
	)

	def get_embedding(text: str, model: str = "models/text-embedding-004", task_type: str = "retrieval_document") -> List[float]:
	    """
	    Get embedding for a single text using Gemini API.

	    The request is attempted at most twice: if the first attempt raises, the
	    error is printed, the function sleeps for one second, and the identical
	    request is sent once more.

	    Args:
	        text: Text to embed
	        model: Embedding model to use
	        task_type: Task type for the embedding

	    Returns:
	        List of floats representing the embedding vector

	    Raises:
	        Exception: Whatever the second attempt raised, re-raised unchanged
	            after being printed.
	    """
	    def _request() -> List[float]:
	        # Single place that builds the request, so the first attempt and the
	        # retry cannot drift apart in the arguments they send.
	        # Use the updated API for google-generativeai >= 0.8.0
	        response = genai.embed_content(  # type: ignore
	            model=model,
	            content=text,
	            task_type=task_type,
	        )
	        return response['embedding']

	    try:
	        return _request()
	    except Exception as e:
	        print(f"Error getting embedding: {e}")
	        # Retry once after a short delay
	        time.sleep(1)

	    try:
	        return _request()
	    except Exception as e2:
	        print(f"Retry failed: {e2}")
	        # Bare raise keeps the original traceback intact.
	        raise

	def create_profile_text(record: Dict[str, Any]) -> str:
	    """Create a comprehensive text representation of a profile.

	    Fields are emitted in a fixed order (name, role, skills, experience,
	    location) as ``"Label: value"`` fragments joined by ``". "`` and closed
	    with a final period. Missing or falsy fields are skipped.

	    Args:
	        record: Profile record; only the keys listed above are read.

	    Returns:
	        The combined text. A record with none of the fields yields ``"."``.
	    """
	    labelled_keys = (
	        ("Name", "name"),
	        ("Role", "role"),
	        ("Skills", "skills"),
	        ("Experience", "experience"),
	        ("Location", "location"),
	    )

	    text_parts = []
	    for label, key in labelled_keys:
	        value = record.get(key)
	        if not value:
	            continue
	        if key == "skills" and isinstance(value, list):
	            # Stringify each item: str.join raises TypeError on non-string
	            # entries (e.g. numbers or nested objects in the skills list).
	            value = ", ".join(str(skill) for skill in value)
	        text_parts.append(f"{label}: {value}")

	    return ". ".join(text_parts) + "."

	def create_job_text(record: Dict[str, Any]) -> str:
	    """Create a comprehensive text representation of a job listing.

	    Fields are emitted in a fixed order (title, company, type, skills,
	    description, location) as ``"Label: value"`` fragments joined by ``". "``
	    and closed with a final period. Missing or falsy fields are skipped.

	    Args:
	        record: Job listing record; only the keys listed above are read.

	    Returns:
	        The combined text. A record with none of the fields yields ``"."``.
	    """
	    labelled_keys = (
	        ("Title", "title"),
	        ("Company", "company"),
	        ("Type", "type"),
	        ("Required Skills", "skills"),
	        ("Description", "description"),
	        ("Location", "location"),
	    )

	    text_parts = []
	    for label, key in labelled_keys:
	        value = record.get(key)
	        if not value:
	            continue
	        if key == "skills" and isinstance(value, list):
	            # Stringify each item: str.join raises TypeError on non-string
	            # entries (e.g. numbers or nested objects in the skills list).
	            value = ", ".join(str(skill) for skill in value)
	        text_parts.append(f"{label}: {value}")

	    return ". ".join(text_parts) + "."

	def embed_and_write(in_path: str, out_path: str, is_profile: bool = True,
	                    embed_model: str = "models/text-embedding-004") -> None:
	    """
	    Read JSONL from in_path, generate embeddings, and write to out_path.

	    Each non-blank input line is parsed as JSON, turned into text with
	    create_profile_text or create_job_text, embedded via get_embedding, and
	    written as one JSON object per line with the keys "id", "text",
	    "embedding" and "original_data". A line that fails to parse or embed is
	    reported and counted as an error; processing continues with the next
	    line. A summary of processed and failed lines is printed at the end.

	    If in_path does not exist, a message is printed and nothing is written.
	    Otherwise out_path is opened in write mode, replacing existing content.

	    Args:
	        in_path: Input JSONL file path
	        out_path: Output JSONL file path
	        is_profile: Whether processing profiles (True) or jobs (False)
	        embed_model: Embedding model to use
	    """
	    if not os.path.exists(in_path):
	        print(f"Input file not found: {in_path}")
	        return

	    # Profiles and jobs are both embedded with the same task type.
	    task_type = "retrieval_document"
	    kind = "profile" if is_profile else "job"

	    processed_count = 0
	    error_count = 0

	    with open(in_path, "r", encoding="utf-8") as f_in, \
	            open(out_path, "w", encoding="utf-8") as f_out:

	        for line_num, line in enumerate(f_in, 1):
	            line = line.strip()
	            if not line:
	                # Blank lines (e.g. a trailing newline at end of file) are
	                # not records; skip them instead of reporting a JSON error.
	                continue

	            try:
	                record = json.loads(line)
	                doc_id = record.get("id", f"unknown_{line_num}")

	                # Create text representation
	                if is_profile:
	                    text = create_profile_text(record)
	                else:
	                    text = create_job_text(record)

	                # Get embedding
	                embedding = get_embedding(text, embed_model, task_type)

	                # Create output object
	                out_obj = {
	                    "id": doc_id,
	                    "text": text,
	                    "embedding": embedding,
	                    "original_data": record,  # Keep original data for reference
	                }

	                # Write to output file
	                f_out.write(json.dumps(out_obj) + "\n")
	                processed_count += 1

	                print(f"✓ Embedded {kind} {doc_id} "
	                      f"(line {line_num})")

	                # Small delay to avoid rate limiting
	                time.sleep(0.1)

	            except json.JSONDecodeError as e:
	                error_count += 1
	                print(f"✗ JSON decode error on line {line_num}: {e}")
	            except Exception as e:
	                # Best-effort batch job: one bad record (failed embedding,
	                # unexpected record shape) must not abort the whole run.
	                error_count += 1
	                print(f"✗ Error processing line {line_num}: {e}")

	    print(f"\nProcessed: {processed_count}, Errors: {error_count}")

	def main():
	    """Generate embeddings for the profile file and then the job file.

	    Ensures DATA_DIR exists, then runs embed_and_write for each input file
	    that is present. An absent input file produces a warning and is skipped.
	    """
	    # Make sure the data directory is there before anything is read or written.
	    os.makedirs(DATA_DIR, exist_ok=True)

	    print("Starting embedding generation...")
	    print("Using embedding model: models/text-embedding-004")

	    # Profiles first.
	    if not os.path.exists(PROFILE_IN):
	        print(f"⚠️ Profile file not found: {PROFILE_IN}")
	    else:
	        print(f"\n📋 Processing profiles from {PROFILE_IN}")
	        embed_and_write(PROFILE_IN, PROFILE_OUT, is_profile=True)
	        print(f"✓ Profile embeddings saved to {PROFILE_OUT}")

	    # Then job listings.
	    if not os.path.exists(JOB_IN):
	        print(f"⚠️ Job file not found: {JOB_IN}")
	    else:
	        print(f"\n💼 Processing jobs from {JOB_IN}")
	        embed_and_write(JOB_IN, JOB_OUT, is_profile=False)
	        print(f"✓ Job embeddings saved to {JOB_OUT}")

	    print("\n🎉 Embedding generation completed!")

	# Run the pipeline only when executed as a script, never on import.
	if __name__ == "__main__":
	    main()