"""
PyTorch Usage Example - Indonesian Embedding Model
Demonstrates how to use the PyTorch version of the model
"""

import time
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def load_model():
    """Load the Indonesian embedding model"""
    print("Loading Indonesian embedding model (PyTorch)...")
    model = SentenceTransformer('../pytorch')
    print("✅ Model loaded successfully!")
    return model
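

# Optional variant of the loader above: a minimal sketch that loads the model
# onto an explicit device via SentenceTransformer's `device` argument. The
# helper name and the 'cuda' choice are illustrative; it assumes the same
# local '../pytorch' checkpoint.
def load_model_on_device(device="cpu"):
    """Load the Indonesian embedding model onto a specific device ('cpu' or 'cuda')."""
    return SentenceTransformer('../pytorch', device=device)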


def basic_usage_example(model):
    """Basic usage example"""
    print("\n" + "="*60)
    print("BASIC USAGE EXAMPLE")
    print("="*60)

    sentences = [
        "Teknologi artificial intelligence berkembang pesat",
        "AI dan machine learning sangat canggih",
        "Jakarta adalah ibu kota Indonesia",
        "Saya suka makan nasi goreng"
    ]

    print("Input sentences:")
    for i, sentence in enumerate(sentences, 1):
        print(f"  {i}. {sentence}")

    print("\nEncoding sentences...")
    start_time = time.time()
    embeddings = model.encode(sentences, show_progress_bar=False)
    encoding_time = (time.time() - start_time) * 1000

    print(f"✅ Encoded {len(sentences)} sentences in {encoding_time:.1f}ms")
    print(f"Embedding shape: {embeddings.shape}")
    print(f"Embedding dimension: {embeddings.shape[1]}")


def similarity_example(model):
    """Semantic similarity example"""
    print("\n" + "="*60)
    print("SEMANTIC SIMILARITY EXAMPLE")
    print("="*60)

    test_pairs = [
        ("AI akan mengubah dunia teknologi", "Kecerdasan buatan akan mengubah dunia", "High"),
        ("Jakarta adalah ibu kota Indonesia", "Kota besar dengan banyak penduduk", "Medium"),
        ("Mahasiswa belajar di universitas", "Siswa kuliah di kampus", "High"),
        ("Teknologi sangat canggih", "Kucing suka makan ikan", "Low")
    ]

    print("Testing semantic similarity on Indonesian text pairs:\n")

    for i, (text1, text2, expected) in enumerate(test_pairs, 1):
        embeddings = model.encode([text1, text2])

        similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

        # Bucket the score into a coarse similarity category
        if similarity >= 0.7:
            category = "High"
            status = "🟢"
        elif similarity >= 0.3:
            category = "Medium"
            status = "🟡"
        else:
            category = "Low"
            status = "🔴"

        correct = "✅" if category == expected else "❌"

        print(f"{correct} Pair {i} ({status} {category}): {similarity:.3f}")
        print(f"   Text 1: '{text1}'")
        print(f"   Text 2: '{text2}'")
        print(f"   Expected: {expected} | Predicted: {category}\n")


def clustering_example(model):
    """Text clustering example"""
    print("\n" + "="*60)
    print("TEXT CLUSTERING EXAMPLE")
    print("="*60)

    documents = [
        # Technology
        "Artificial intelligence mengubah cara kita bekerja",
        "Machine learning membantu prediksi data",
        "Software development membutuhkan keahlian programming",

        # Education
        "Mahasiswa belajar di universitas negeri",
        "Pendidikan tinggi sangat penting untuk masa depan",
        "Dosen mengajar dengan metode yang inovatif",

        # Food
        "Nasi goreng adalah makanan favorit Indonesia",
        "Rendang merupakan masakan tradisional Sumatra",
        "Gado-gado menggunakan bumbu kacang yang lezat"
    ]

    print("Documents to cluster:")
    for i, doc in enumerate(documents, 1):
        print(f"  {i}. {doc}")

    print("\nEncoding documents...")
    embeddings = model.encode(documents, show_progress_bar=False)

    from sklearn.cluster import KMeans

    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(embeddings)

    print("\nClustering results (3 clusters):")
    for cluster_id in range(3):
        docs_in_cluster = [documents[i] for i, c in enumerate(clusters) if c == cluster_id]
        print(f"\n🏷️ Cluster {cluster_id + 1}:")
        for doc in docs_in_cluster:
            print(f"   - {doc}")


def search_example(model):
    """Semantic search example"""
    print("\n" + "="*60)
    print("SEMANTIC SEARCH EXAMPLE")
    print("="*60)

    corpus = [
        "Indonesia adalah negara kepulauan terbesar di dunia",
        "Jakarta merupakan ibu kota dan pusat bisnis Indonesia",
        "Bali terkenal sebagai destinasi wisata yang indah",
        "Artificial intelligence mengubah industri teknologi",
        "Machine learning membantu analisis data besar",
        "Robotika masa depan akan sangat canggih",
        "Nasi padang adalah makanan khas Sumatra Barat",
        "Rendang dinobatkan sebagai makanan terlezat dunia",
        "Kuliner Indonesia sangat beragam dan kaya rasa"
    ]

    print("Document corpus:")
    for i, doc in enumerate(corpus, 1):
        print(f"  {i}. {doc}")

    print("\nEncoding corpus...")
    corpus_embeddings = model.encode(corpus, show_progress_bar=False)

    queries = [
        "teknologi AI dan machine learning",
        "makanan tradisional Indonesia",
        "ibu kota Indonesia"
    ]

    for query in queries:
        print(f"\nQuery: '{query}'")

        query_embedding = model.encode([query])

        similarities = cosine_similarity(query_embedding, corpus_embeddings)[0]

        top_indices = np.argsort(similarities)[::-1][:3]

        print("Top 3 most relevant documents:")
        for rank, idx in enumerate(top_indices, 1):
            print(f"  {rank}. (Score: {similarities[idx]:.3f}) {corpus[idx]}")


def performance_benchmark(model):
    """Performance benchmark"""
    print("\n" + "="*60)
    print("PERFORMANCE BENCHMARK")
    print("="*60)

    test_sentences = [
        "Ini adalah kalimat percobaan untuk mengukur performa",
        "Teknologi artificial intelligence sangat membantu",
        "Indonesia memiliki budaya yang sangat beragam"
    ] * 10

    batch_sizes = [1, 5, 10, 30]

    print("Testing encoding performance with different batch sizes:\n")

    for batch_size in batch_sizes:
        sentences_batch = test_sentences[:batch_size]

        # Warm-up run so one-time setup cost does not skew the timings
        model.encode(sentences_batch[:1], show_progress_bar=False)

        # Average over three timed runs
        times = []
        for _ in range(3):
            start_time = time.time()
            model.encode(sentences_batch, show_progress_bar=False)
            end_time = time.time()
            times.append((end_time - start_time) * 1000)

        avg_time = np.mean(times)
        throughput = batch_size / (avg_time / 1000)

        print(f"Batch size {batch_size:2d}: {avg_time:6.1f}ms | {throughput:5.1f} sentences/sec")


def main():
    """Main example function"""
    print("Indonesian Embedding Model - PyTorch Examples")
    print("This script demonstrates various use cases of the model\n")

    model = load_model()

    basic_usage_example(model)
    similarity_example(model)
    clustering_example(model)
    search_example(model)
    performance_benchmark(model)

    print("\n" + "="*60)
    print("✅ ALL EXAMPLES COMPLETED SUCCESSFULLY!")
    print("="*60)
    print("💡 Tips:")
    print("  - Use the ONNX version for production (7.8x faster)")
    print("  - The model works best with formal Indonesian text")
    print("  - Maximum input length: 384 tokens")
    print("  - For large batches, consider using a GPU if available")


if __name__ == "__main__":
    main()