Spaces:
Sleeping
Sleeping
| import os | |
| import random | |
| from utils.text_processing import clean_text | |
def create_glove_corpus(output_path, num_sentences=10000):
    """Create sample corpus data for GloVe training.

    Writes ``num_sentences`` lines to ``output_path``. Each line is built by
    joining 3-8 distinct phrases, drawn at random from a fixed list of
    job-related phrases, with single spaces and passing the result through
    ``clean_text`` before it is written.

    Args:
        output_path: Destination file path. Its parent directory is created
            when it does not exist; an existing file is overwritten.
        num_sentences: Number of lines to write (default 10000).
    """
    # A bare filename (e.g. "corpus.txt") has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # directory when the path actually has one.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Sample job-related phrases
    phrases = [
        "software engineer", "web development", "python programming",
        "machine learning", "data analysis", "cloud computing",
        "devops practices", "system administration", "network security",
        "frontend development", "backend systems", "database management",
        "artificial intelligence", "deep learning", "natural language processing",
        "cloud infrastructure", "containerization", "continuous integration",
        "agile methodology", "scrum framework", "version control systems",
        "restful apis", "microservices architecture", "test driven development"
    ]

    # Generate sample sentences
    with open(output_path, 'w', encoding='utf-8') as f:
        for _ in range(num_sentences):
            # Create a sentence with 3-8 random phrases; random.sample draws
            # without replacement, so no phrase repeats within one sentence.
            sentence = ' '.join(random.sample(phrases, random.randint(3, 8)))
            f.write(clean_text(sentence) + "\n")

    print(f"Created GloVe corpus at {output_path} with {num_sentences} sentences")
if __name__ == "__main__":
    # Script entry point: write the sample corpus to the default location,
    # which is resolved relative to the current working directory.
    default_corpus_path = "../data/corpus/corpus.txt"
    create_glove_corpus(default_corpus_path)