| """ |
| Turkmen Word2Vec Model |
| |
| This script preprocesses Turkmen text data and trains a Word2Vec model. |
| It's designed for open-source use and easy adaptation to other projects. |
| |
| Requirements: |
| - Python 3.6+ |
| - Dependencies: nltk, gensim, tqdm |
| |
| Usage: |
| 1. Prepare your Turkmen text data in a file (one sentence per line). |
| 2. Update the CONFIG dictionary with your desired parameters. |
| 3. Run the script: python turkmen_word2vec.py |
| |
| The script will preprocess the data, train the model, and save it for future use. |
| """ |
|
|
import logging
import multiprocessing
import re
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, List, Tuple

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
|
|
| |
# ---------------------------------------------------------------------------
# Configuration — edit these values for your corpus / training run.
CONFIG = {
    "input_file": "path/to/your/input/file.txt",  # UTF-8, one sentence per line
    "output_dir": "path/to/output/directory",
    "model_name": "turkmen_word2vec",
    "vector_size": 300,   # dimensionality of the word vectors
    "window": 5,          # max distance between current and context word
    "min_count": 15,      # ignore words with total frequency below this
    "sg": 1,              # 1 = skip-gram, 0 = CBOW
    "epochs": 10,
    "negative": 15,       # number of negative samples per positive example
    "sample": 1e-5,       # downsampling threshold for very frequent words
}


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Fetch NLTK resources used by word_tokenize / stopwords (no-op when cached).
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)


# ASCII transliteration map for Turkmen-specific letters.
# FIX: added 'ž'/'Ž' — a Turkmen letter the original map omitted; without an
# entry it was silently stripped by the [^a-zA-Z ] cleanup in preprocessing.
REPLACEMENTS = {
    'ä': 'a', 'ç': 'ch', 'ö': 'o', 'ü': 'u', 'ň': 'n', 'ý': 'y', 'ğ': 'g', 'ş': 's',
    'ž': 'zh',
    'Ç': 'Ch', 'Ö': 'O', 'Ü': 'U', 'Ä': 'A', 'Ň': 'N', 'Ş': 'S', 'Ý': 'Y', 'Ğ': 'G',
    'Ž': 'Zh',
}


# NLTK ships no Turkmen stopword list; Turkish is used here as the closest
# available proxy — TODO confirm this is the intended trade-off.
# FIX: the original stored stopwords only in their accented form, but tokens
# are transliterated and lowercased BEFORE the stopword filter runs, so any
# stopword containing a Turkish-specific letter could never match. We keep the
# raw forms (backward compatible) and add forms normalized the same way the
# preprocessing pipeline normalizes tokens.
_raw_stopwords = stopwords.words('turkish')


def _normalize_stopword(word: str) -> str:
    """Apply the same transliteration + lowercasing used on corpus tokens."""
    for _orig, _rep in REPLACEMENTS.items():
        word = word.replace(_orig, _rep)
    # Drop any leftover non-ASCII letters (mirrors the [^a-zA-Z ] cleanup).
    return re.sub(r'[^a-z]', '', word.lower())


STOP_WORDS = set(_raw_stopwords) | {_normalize_stopword(w) for w in _raw_stopwords}
STOP_WORDS.discard('')  # stopwords made entirely of stripped characters normalize to ''
|
|
# Precomputed helpers, hoisted so they are built once per process instead of
# on every call: a one-pass transliteration table and a compiled cleanup regex.
_TRANSLIT_TABLE = str.maketrans(REPLACEMENTS)
_NON_LATIN = re.compile(r'[^a-zA-Z ]')


def preprocess_sentence(sentence: str) -> List[str]:
    """
    Preprocess a single sentence.

    Steps: transliterate Turkmen-specific letters to ASCII, replace any
    remaining non-Latin characters with spaces, lowercase, tokenize, then
    drop stopwords and tokens shorter than 3 characters.

    Args:
        sentence (str): Input sentence.

    Returns:
        List[str]: List of preprocessed tokens.
    """
    # str.translate applies the whole replacement map in a single pass,
    # replacing the original per-entry .replace() loop.
    sentence = sentence.translate(_TRANSLIT_TABLE)
    sentence = _NON_LATIN.sub(' ', sentence)
    sentence = sentence.lower()

    tokens = word_tokenize(sentence)
    return [word for word in tokens if word not in STOP_WORDS and len(word) > 2]
|
|
def process_chunk(chunk: List[str]) -> List[List[str]]:
    """
    Preprocess every sentence in a chunk (worker for parallel execution).

    Args:
        chunk (List[str]): Sentences to process.

    Returns:
        List[List[str]]: One token list per input sentence.
    """
    return list(map(preprocess_sentence, chunk))
|
|
def load_and_preprocess(file_path: str) -> List[List[str]]:
    """
    Load the input file and preprocess it in parallel.

    Args:
        file_path (str): Path to the input file (UTF-8, one sentence per line).

    Returns:
        List[List[str]]: List of preprocessed sentences (as token lists).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    # Nothing to do for an empty file (also avoids zero-size chunks below).
    if not sentences:
        return []

    # FIX: the original computed len(sentences) // cpu_count() directly, which
    # is 0 when the file has fewer lines than CPUs and makes range(0, n, 0)
    # raise ValueError. Clamp to at least 1.
    chunk_size = max(1, len(sentences) // multiprocessing.cpu_count())
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    processed_sentences: List[List[str]] = []
    with ProcessPoolExecutor() as executor:
        # executor.map preserves input order, so output order matches the file.
        for chunk_result in tqdm(executor.map(process_chunk, chunks),
                                 total=len(chunks), desc="Preprocessing"):
            processed_sentences.extend(chunk_result)

    return processed_sentences
|
|
def train_word2vec(sentences: List[List[str]], params: Dict) -> Word2Vec:
    """
    Train a Word2Vec model on the preprocessed corpus.

    Args:
        sentences (List[List[str]]): Preprocessed sentences.
        params (Dict): Model hyperparameters (see CONFIG).

    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    # Collect hyperparameters first so the constructor call stays readable.
    hyperparams = {
        'vector_size': params['vector_size'],
        'window': params['window'],
        'min_count': params['min_count'],
        'sg': params['sg'],
        'epochs': params['epochs'],
        'negative': params['negative'],
        'sample': params['sample'],
    }
    return Word2Vec(sentences=sentences,
                    workers=multiprocessing.cpu_count(),
                    compute_loss=True,
                    **hyperparams)
|
|
def save_model(model: Word2Vec, output_dir: str, model_name: str) -> None:
    """
    Persist the trained model and a small metadata summary next to it.

    Args:
        model (Word2Vec): Trained Word2Vec model.
        output_dir (str): Directory to save the model into (created if missing).
        model_name (str): Base name for the saved files.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    model_path = output_path / f"{model_name}.model"
    model.save(str(model_path))
    logging.info(f"Model saved to {model_path}")

    # Human-readable metadata alongside the binary model file.
    metadata_lines = [
        f"Model: {model_name}",
        f"Vocabulary size: {len(model.wv.key_to_index)}",
        f"Vector size: {model.vector_size}",
        f"Window size: {model.window}",
        f"Min count: {model.min_count}",
        f"Training epochs: {model.epochs}",
        f"Final training loss: {model.get_latest_training_loss()}",
    ]
    metadata_path = output_path / f"{model_name}_metadata.txt"
    metadata_path.write_text("\n".join(metadata_lines) + "\n", encoding='utf-8')
    logging.info(f"Model metadata saved to {metadata_path}")
|
|
| def main(): |
| """Main execution function.""" |
| logging.info("Starting Turkmen Word2Vec model training") |
|
|
| |
| processed_sentences = load_and_preprocess(CONFIG['input_file']) |
| logging.info(f"Preprocessed {len(processed_sentences)} sentences") |
|
|
| |
| model = train_word2vec(processed_sentences, CONFIG) |
| logging.info("Model training completed") |
|
|
| |
| save_model(model, CONFIG['output_dir'], CONFIG['model_name']) |
|
|
| logging.info("Process completed successfully") |
|
|
| if __name__ == "__main__": |
| main() |