Spaces:

shravanijadhav264
/

JobShield-AI

Sleeping

App Files Files Community

JobShield-AI / scripts /create_doc2vec_data.py

shravanijadhav264

Initial clean commit

984c70c 16 days ago

raw

history blame contribute delete

2.33 kB

	import os
	import random
	from utils.text_processing import clean_text

	def create_doc2vec_documents(output_path: str, num_docs: int = 500):
	    """Create synthetic job-profile sentences for Doc2Vec training.

	    Each document is produced by filling a randomly chosen sentence template
	    with randomly chosen roles, skills, tools, industries, domains and years
	    of experience, then passing the result through ``clean_text``. The
	    documents are written to ``output_path``, one per line, overwriting any
	    existing file.

	    Args:
	        output_path: Destination text file. Its parent directory is created
	            if it does not exist. A bare file name (no directory part) is
	            written to the current working directory.
	        num_docs: Number of documents to generate.

	    Returns:
	        The list of generated documents, in the order they were written.
	    """
	    # os.path.dirname() returns "" for a bare file name, and os.makedirs("")
	    # raises FileNotFoundError, so only create a directory when one is named.
	    parent_dir = os.path.dirname(output_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    # Sentence templates; unused keyword arguments are ignored by str.format.
	    templates = [
	        "Experienced {role} with {experience} years in {skills} seeking position in {industry}",
	        "Skilled {role} specializing in {skills} looking for opportunities in {domain}",
	        "{role} with expertise in {skills} and experience with {tools}",
	        "Professional {role} with background in {domain} and proficiency in {skills}",
	        "Certified {role} with {experience}+ years experience in {industry} using {tools}",
	    ]

	    # Value pools for the template placeholders.
	    roles = ["software engineer", "data scientist", "devops engineer",
	             "web developer", "system administrator", "network engineer"]
	    skills = ["Python", "Java", "JavaScript", "React", "AWS", "Docker",
	              "Kubernetes", "SQL", "machine learning", "data analysis"]
	    tools = ["Git", "Jenkins", "Docker", "Kubernetes", "AWS", "Azure",
	             "TensorFlow", "PyTorch", "React", "Angular"]
	    industries = ["tech", "finance", "healthcare", "e-commerce", "education"]
	    domains = ["cloud computing", "AI development", "web applications",
	               "data engineering", "cybersecurity"]
	    experiences = ["3", "5", "7", "10", "15"]

	    # Generate documents. The order of the random calls is kept stable so a
	    # seeded run (random.seed) yields the same corpus as before.
	    documents = []
	    for _ in range(num_docs):
	        template = random.choice(templates)
	        doc = template.format(
	            role=random.choice(roles),
	            skills=", ".join(random.sample(skills, random.randint(2, 4))),
	            tools=", ".join(random.sample(tools, random.randint(1, 3))),
	            industry=random.choice(industries),
	            domain=random.choice(domains),
	            experience=random.choice(experiences),
	        )
	        documents.append(clean_text(doc))

	    # Save to file, one document per line.
	    with open(output_path, 'w', encoding='utf-8') as f:
	        f.writelines(doc + "\n" for doc in documents)

	    # Report what was actually written rather than what was requested.
	    print(f"Created Doc2Vec documents at {output_path} with {len(documents)} documents")
	    return documents

	if __name__ == "__main__":
	    # Script entry point: build the default-sized corpus at the default
	    # location. The path is relative, so it resolves against the current
	    # working directory of the process.
	    default_output = "../data/documents/documents.txt"
	    create_doc2vec_documents(default_output)