Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| from utils.text_processing import clean_text | |
| def create_doc2vec_documents(output_path, num_docs=500): | |
| """Create sample documents for Doc2Vec training""" | |
| os.makedirs(os.path.dirname(output_path), exist_ok=True) | |
| # Sample job descriptions | |
| templates = [ | |
| "Experienced {role} with {experience} years in {skills} seeking position in {industry}", | |
| "Skilled {role} specializing in {skills} looking for opportunities in {domain}", | |
| "{role} with expertise in {skills} and experience with {tools}", | |
| "Professional {role} with background in {domain} and proficiency in {skills}", | |
| "Certified {role} with {experience}+ years experience in {industry} using {tools}" | |
| ] | |
| # Sample data for placeholders | |
| roles = ["software engineer", "data scientist", "devops engineer", | |
| "web developer", "system administrator", "network engineer"] | |
| skills = ["Python", "Java", "JavaScript", "React", "AWS", "Docker", | |
| "Kubernetes", "SQL", "machine learning", "data analysis"] | |
| tools = ["Git", "Jenkins", "Docker", "Kubernetes", "AWS", "Azure", | |
| "TensorFlow", "PyTorch", "React", "Angular"] | |
| industries = ["tech", "finance", "healthcare", "e-commerce", "education"] | |
| domains = ["cloud computing", "AI development", "web applications", | |
| "data engineering", "cybersecurity"] | |
| experiences = ["3", "5", "7", "10", "15"] | |
| # Generate documents | |
| documents = [] | |
| for _ in range(num_docs): | |
| template = random.choice(templates) | |
| doc = template.format( | |
| role=random.choice(roles), | |
| skills=", ".join(random.sample(skills, random.randint(2, 4))), | |
| tools=", ".join(random.sample(tools, random.randint(1, 3))), | |
| industry=random.choice(industries), | |
| domain=random.choice(domains), | |
| experience=random.choice(experiences) | |
| ) | |
| documents.append(clean_text(doc)) | |
| # Save to file | |
| with open(output_path, 'w', encoding='utf-8') as f: | |
| for doc in documents: | |
| f.write(doc + "\n") | |
| print(f"Created Doc2Vec documents at {output_path} with {num_docs} documents") | |
| return documents | |
| if __name__ == "__main__": | |
| create_doc2vec_documents("../data/documents/documents.txt") |