JobShield-AI / scripts /create_doc2vec_data.py
shravanijadhav264's picture
Initial clean commit
984c70c
import os
import random
from utils.text_processing import clean_text
def create_doc2vec_documents(output_path, num_docs=500):
    """Generate synthetic job-profile sentences and write them one per line.

    Each document is built by filling a randomly chosen sentence template
    with randomly chosen roles, skills, tools, industries, domains and
    years of experience, then passing the result through ``clean_text``.

    Args:
        output_path: Path of the UTF-8 text file to write. Missing parent
            directories are created. A bare filename (no directory part)
            is written relative to the current working directory.
        num_docs: Number of documents to generate.

    Returns:
        The list of cleaned documents, in the order they were written.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises FileNotFoundError, so only create a directory when one is named.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Sentence templates; each one uses only a subset of the placeholders.
    templates = [
        "Experienced {role} with {experience} years in {skills} seeking position in {industry}",
        "Skilled {role} specializing in {skills} looking for opportunities in {domain}",
        "{role} with expertise in {skills} and experience with {tools}",
        "Professional {role} with background in {domain} and proficiency in {skills}",
        "Certified {role} with {experience}+ years experience in {industry} using {tools}",
    ]

    # Candidate values for the template placeholders.
    roles = [
        "software engineer", "data scientist", "devops engineer",
        "web developer", "system administrator", "network engineer",
    ]
    skills = [
        "Python", "Java", "JavaScript", "React", "AWS", "Docker",
        "Kubernetes", "SQL", "machine learning", "data analysis",
    ]
    tools = [
        "Git", "Jenkins", "Docker", "Kubernetes", "AWS", "Azure",
        "TensorFlow", "PyTorch", "React", "Angular",
    ]
    industries = ["tech", "finance", "healthcare", "e-commerce", "education"]
    domains = [
        "cloud computing", "AI development", "web applications",
        "data engineering", "cybersecurity",
    ]
    experiences = ["3", "5", "7", "10", "15"]

    documents = []
    for _ in range(num_docs):
        template = random.choice(templates)
        # Every placeholder value is supplied on each iteration; str.format
        # ignores keyword arguments the chosen template does not reference.
        doc = template.format(
            role=random.choice(roles),
            skills=", ".join(random.sample(skills, random.randint(2, 4))),
            tools=", ".join(random.sample(tools, random.randint(1, 3))),
            industry=random.choice(industries),
            domain=random.choice(domains),
            experience=random.choice(experiences),
        )
        documents.append(clean_text(doc))

    # One document per line; a single writelines call avoids many tiny writes.
    with open(output_path, 'w', encoding='utf-8') as f:
        f.writelines(doc + "\n" for doc in documents)

    # Report the number of documents actually written.
    print(f"Created Doc2Vec documents at {output_path} with {len(documents)} documents")
    return documents
if __name__ == "__main__":
    # Script entry point: write the default 500-document corpus. The path is
    # relative, so it resolves against the current working directory.
    default_output = "../data/documents/documents.txt"
    create_doc2vec_documents(default_output)