# Source: turkmen-word2vec / train_turkmen_word2vec.py (author: mamed0v, commit 2aa8b90)
"""
Turkmen Word2Vec Model
This script preprocesses Turkmen text data and trains a Word2Vec model.
It's designed for open-source use and easy adaptation to other projects.
Requirements:
- Python 3.6+
- Dependencies: nltk, gensim, tqdm
Usage:
1. Prepare your Turkmen text data in a file (one sentence per line).
2. Update the CONFIG dictionary with your desired parameters.
3. Run the script: python train_turkmen_word2vec.py
The script will preprocess the data, train the model, and save it for future use.
"""
import re
import nltk
import logging
import multiprocessing
from pathlib import Path
from typing import List, Dict, Tuple
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor
from gensim.models import Word2Vec
# Configuration — edit these paths and hyper-parameters before running.
CONFIG = {
    "input_file": "path/to/your/input/file.txt",
    "output_dir": "path/to/output/directory",
    "model_name": "turkmen_word2vec",
    "vector_size": 300,   # dimensionality of the word vectors
    "window": 5,          # max distance between current and context word
    "min_count": 15,      # ignore words with total frequency below this
    "sg": 1,              # 1 = skip-gram, 0 = CBOW
    "epochs": 10,
    "negative": 15,       # number of negative samples per positive example
    "sample": 1e-5,       # downsampling threshold for frequent words
}

# Setup logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Ensure required NLTK data is available
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# Newer NLTK releases ship the Punkt sentence/word tokenizer data as
# 'punkt_tab' and word_tokenize fails without it there; on older NLTK
# versions this download is a harmless no-op (returns False quietly).
nltk.download('punkt_tab', quiet=True)

# Load stop words (using Turkish as a close approximation to Turkmen —
# NLTK ships no Turkmen stop-word list)
STOP_WORDS = set(stopwords.words('turkish'))

# Transliteration map for Turkmen-specific letters; applied BEFORE the
# [^a-zA-Z ] cleanup in preprocess_sentence so these letters survive as
# ASCII instead of being stripped entirely.
REPLACEMENTS = {
    'ä': 'a', 'ç': 'ch', 'ö': 'o', 'ü': 'u', 'ň': 'n', 'ý': 'y', 'ğ': 'g', 'ş': 's',
    'Ç': 'Ch', 'Ö': 'O', 'Ü': 'U', 'Ä': 'A', 'Ň': 'N', 'Ş': 'S', 'Ý': 'Y', 'Ğ': 'G'
}
# Hoisted out of preprocess_sentence so they are built once per process
# instead of on every call (this function runs once per corpus line).
_NON_LATIN_RE = re.compile(r'[^a-zA-Z ]')
_TURKMEN_TABLE = str.maketrans(REPLACEMENTS)


def preprocess_sentence(sentence: str) -> List[str]:
    """
    Preprocess a single sentence.

    Transliterates Turkmen-specific letters to ASCII, strips everything
    except Latin letters and spaces, lowercases, tokenizes, and drops
    stop words and tokens of length <= 2.

    Args:
        sentence (str): Input sentence.

    Returns:
        List[str]: List of preprocessed tokens.
    """
    # Single C-level pass over the string instead of one .replace() pass
    # per entry of REPLACEMENTS; output is identical.
    sentence = sentence.translate(_TURKMEN_TABLE)
    sentence = _NON_LATIN_RE.sub(' ', sentence)
    sentence = sentence.lower()
    tokens = word_tokenize(sentence)
    return [word for word in tokens if word not in STOP_WORDS and len(word) > 2]
def process_chunk(chunk: List[str]) -> List[List[str]]:
    """
    Preprocess one worker's share of the corpus.

    Args:
        chunk (List[str]): Sentences to preprocess.

    Returns:
        List[List[str]]: One token list per input sentence, in order.
    """
    return list(map(preprocess_sentence, chunk))
def load_and_preprocess(file_path: str) -> List[List[str]]:
    """
    Load and preprocess the input file in parallel.

    Args:
        file_path (str): Path to a UTF-8 text file, one sentence per line.

    Returns:
        List[List[str]]: List of preprocessed sentences (as token lists).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()
    # Clamp to at least 1: with fewer lines than CPU cores the integer
    # division yields 0 and range(0, n, 0) raises
    # "ValueError: range() arg 3 must not be zero".
    chunk_size = max(1, len(sentences) // multiprocessing.cpu_count())
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]
    processed_sentences = []
    # One chunk per worker process; executor.map preserves chunk order.
    with ProcessPoolExecutor() as executor:
        for chunk_result in tqdm(executor.map(process_chunk, chunks),
                                 total=len(chunks), desc="Preprocessing"):
            processed_sentences.extend(chunk_result)
    return processed_sentences
def train_word2vec(sentences: List[List[str]], params: Dict) -> Word2Vec:
    """
    Train the Word2Vec model.

    Args:
        sentences (List[List[str]]): Preprocessed sentences.
        params (Dict): Model hyper-parameters (vector_size, window,
            min_count, sg, epochs, negative, sample).

    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    # One worker per CPU core; compute_loss=True so save_model can report
    # the final training loss in the metadata file.
    return Word2Vec(
        sentences=sentences,
        vector_size=params['vector_size'],
        window=params['window'],
        min_count=params['min_count'],
        sg=params['sg'],
        epochs=params['epochs'],
        negative=params['negative'],
        sample=params['sample'],
        workers=multiprocessing.cpu_count(),
        compute_loss=True,
    )
def save_model(model: Word2Vec, output_dir: str, model_name: str) -> None:
    """
    Save the trained model and its metadata.

    Args:
        model (Word2Vec): Trained Word2Vec model.
        output_dir (str): Destination directory (created if missing).
        model_name (str): Base name for the saved files.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    model_file = target_dir / f"{model_name}.model"
    model.save(str(model_file))
    logging.info(f"Model saved to {model_file}")

    # Human-readable summary of the model next to the binary file.
    metadata_lines = [
        f"Model: {model_name}\n",
        f"Vocabulary size: {len(model.wv.key_to_index)}\n",
        f"Vector size: {model.vector_size}\n",
        f"Window size: {model.window}\n",
        f"Min count: {model.min_count}\n",
        f"Training epochs: {model.epochs}\n",
        f"Final training loss: {model.get_latest_training_loss()}\n",
    ]
    metadata_file = target_dir / f"{model_name}_metadata.txt"
    with open(metadata_file, 'w', encoding='utf-8') as f:
        f.writelines(metadata_lines)
    logging.info(f"Model metadata saved to {metadata_file}")
def main():
    """Run the full pipeline: preprocess the corpus, train, save."""
    logging.info("Starting Turkmen Word2Vec model training")

    corpus = load_and_preprocess(CONFIG['input_file'])
    logging.info(f"Preprocessed {len(corpus)} sentences")

    model = train_word2vec(corpus, CONFIG)
    logging.info("Model training completed")

    save_model(model, CONFIG['output_dir'], CONFIG['model_name'])
    logging.info("Process completed successfully")


if __name__ == "__main__":
    main()