| """ |
| Turkmen Word2Vec Model |
| |
| This script preprocesses Turkmen text data and trains a Word2Vec model. |
| It's designed for open-source use and easy adaptation to other projects. |
| |
| Requirements: |
| - Python 3.6+ |
| - Dependencies: nltk, gensim, tqdm |
| |
| Usage: |
| 1. Prepare your Turkmen text data in a file (one sentence per line). |
| 2. Update the CONFIG dictionary with your desired parameters. |
| 3. Run the script: python turkmen_word2vec.py |
| |
| The script will preprocess the data, train the model, and save it for future use. |
| """ |
|
|
import logging
import multiprocessing
import re
from concurrent.futures import ProcessPoolExecutor
from pathlib import Path
from typing import Dict, List, Tuple

import nltk
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm import tqdm
|
|
| |
# ---------------------------------------------------------------------------
# Configuration — edit these values for your corpus / training run.
CONFIG = {
    "input_file": "path/to/your/input/file.txt",  # UTF-8, one sentence per line
    "output_dir": "path/to/output/directory",
    "model_name": "turkmen_word2vec",
    "vector_size": 300,   # dimensionality of the word vectors
    "window": 5,          # max distance between current and context word
    "min_count": 15,      # ignore words with total frequency below this
    "sg": 1,              # 1 = skip-gram, 0 = CBOW
    "epochs": 10,
    "negative": 15,       # number of negative samples per positive example
    "sample": 1e-5,       # downsampling threshold for very frequent words
}


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


# Fetch NLTK resources used by word_tokenize / stopwords (no-op when cached).
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)


# ASCII transliteration map for Turkmen-specific letters.
# FIX: added 'ž'/'Ž' — a Turkmen letter the original map omitted; without an
# entry it was silently stripped by the [^a-zA-Z ] cleanup in preprocessing.
REPLACEMENTS = {
    'ä': 'a', 'ç': 'ch', 'ö': 'o', 'ü': 'u', 'ň': 'n', 'ý': 'y', 'ğ': 'g', 'ş': 's',
    'ž': 'zh',
    'Ç': 'Ch', 'Ö': 'O', 'Ü': 'U', 'Ä': 'A', 'Ň': 'N', 'Ş': 'S', 'Ý': 'Y', 'Ğ': 'G',
    'Ž': 'Zh',
}


# NLTK ships no Turkmen stopword list; Turkish is used here as the closest
# available proxy — TODO confirm this is the intended trade-off.
# FIX: the original stored stopwords only in their accented form, but tokens
# are transliterated and lowercased BEFORE the stopword filter runs, so any
# stopword containing a Turkish-specific letter could never match. We keep the
# raw forms (backward compatible) and add forms normalized the same way the
# preprocessing pipeline normalizes tokens.
_raw_stopwords = stopwords.words('turkish')


def _normalize_stopword(word: str) -> str:
    """Apply the same transliteration + lowercasing used on corpus tokens."""
    for _orig, _rep in REPLACEMENTS.items():
        word = word.replace(_orig, _rep)
    # Drop any leftover non-ASCII letters (mirrors the [^a-zA-Z ] cleanup).
    return re.sub(r'[^a-z]', '', word.lower())


STOP_WORDS = set(_raw_stopwords) | {_normalize_stopword(w) for w in _raw_stopwords}
STOP_WORDS.discard('')  # stopwords made entirely of stripped characters normalize to ''
|
|
# Precomputed helpers, hoisted so they are built once per process instead of
# on every call: a one-pass transliteration table and a compiled cleanup regex.
_TRANSLIT_TABLE = str.maketrans(REPLACEMENTS)
_NON_LATIN = re.compile(r'[^a-zA-Z ]')


def preprocess_sentence(sentence: str) -> List[str]:
    """
    Preprocess a single sentence.

    Steps: transliterate Turkmen-specific letters to ASCII, replace any
    remaining non-Latin characters with spaces, lowercase, tokenize, then
    drop stopwords and tokens shorter than 3 characters.

    Args:
        sentence (str): Input sentence.

    Returns:
        List[str]: List of preprocessed tokens.
    """
    # str.translate applies the whole replacement map in a single pass,
    # replacing the original per-entry .replace() loop.
    sentence = sentence.translate(_TRANSLIT_TABLE)
    sentence = _NON_LATIN.sub(' ', sentence)
    sentence = sentence.lower()

    tokens = word_tokenize(sentence)
    return [word for word in tokens if word not in STOP_WORDS and len(word) > 2]
|
|
def process_chunk(chunk: List[str]) -> List[List[str]]:
    """
    Preprocess every sentence in a chunk (worker for parallel execution).

    Args:
        chunk (List[str]): Sentences to process.

    Returns:
        List[List[str]]: One token list per input sentence.
    """
    return list(map(preprocess_sentence, chunk))
|
|
def load_and_preprocess(file_path: str) -> List[List[str]]:
    """
    Load the input file and preprocess it in parallel.

    Args:
        file_path (str): Path to the input file (UTF-8, one sentence per line).

    Returns:
        List[List[str]]: List of preprocessed sentences (as token lists).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        sentences = f.readlines()

    # Nothing to do for an empty file (also avoids zero-size chunks below).
    if not sentences:
        return []

    # FIX: the original computed len(sentences) // cpu_count() directly, which
    # is 0 when the file has fewer lines than CPUs and makes range(0, n, 0)
    # raise ValueError. Clamp to at least 1.
    chunk_size = max(1, len(sentences) // multiprocessing.cpu_count())
    chunks = [sentences[i:i + chunk_size] for i in range(0, len(sentences), chunk_size)]

    processed_sentences: List[List[str]] = []
    with ProcessPoolExecutor() as executor:
        # executor.map preserves input order, so output order matches the file.
        for chunk_result in tqdm(executor.map(process_chunk, chunks),
                                 total=len(chunks), desc="Preprocessing"):
            processed_sentences.extend(chunk_result)

    return processed_sentences
|
|
def train_word2vec(sentences: List[List[str]], params: Dict) -> Word2Vec:
    """
    Train a Word2Vec model on the preprocessed corpus.

    Args:
        sentences (List[List[str]]): Preprocessed sentences.
        params (Dict): Model hyperparameters (see CONFIG).

    Returns:
        Word2Vec: Trained Word2Vec model.
    """
    # Collect hyperparameters first so the constructor call stays readable.
    hyperparams = {
        'vector_size': params['vector_size'],
        'window': params['window'],
        'min_count': params['min_count'],
        'sg': params['sg'],
        'epochs': params['epochs'],
        'negative': params['negative'],
        'sample': params['sample'],
    }
    return Word2Vec(sentences=sentences,
                    workers=multiprocessing.cpu_count(),
                    compute_loss=True,
                    **hyperparams)
|
|
def save_model(model: Word2Vec, output_dir: str, model_name: str) -> None:
    """
    Persist the trained model and a small metadata summary next to it.

    Args:
        model (Word2Vec): Trained Word2Vec model.
        output_dir (str): Directory to save the model into (created if missing).
        model_name (str): Base name for the saved files.
    """
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    model_path = output_path / f"{model_name}.model"
    model.save(str(model_path))
    logging.info(f"Model saved to {model_path}")

    # Human-readable metadata alongside the binary model file.
    metadata_lines = [
        f"Model: {model_name}",
        f"Vocabulary size: {len(model.wv.key_to_index)}",
        f"Vector size: {model.vector_size}",
        f"Window size: {model.window}",
        f"Min count: {model.min_count}",
        f"Training epochs: {model.epochs}",
        f"Final training loss: {model.get_latest_training_loss()}",
    ]
    metadata_path = output_path / f"{model_name}_metadata.txt"
    metadata_path.write_text("\n".join(metadata_lines) + "\n", encoding='utf-8')
    logging.info(f"Model metadata saved to {metadata_path}")
|
|
| def main(): |
| """Main execution function.""" |
| logging.info("Starting Turkmen Word2Vec model training") |
|
|
| |
| processed_sentences = load_and_preprocess(CONFIG['input_file']) |
| logging.info(f"Preprocessed {len(processed_sentences)} sentences") |
|
|
| |
| model = train_word2vec(processed_sentences, CONFIG) |
| logging.info("Model training completed") |
|
|
| |
| save_model(model, CONFIG['output_dir'], CONFIG['model_name']) |
|
|
| logging.info("Process completed successfully") |
|
|
| if __name__ == "__main__": |
| main() |