File size: 8,356 Bytes
4b80424
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
#!/usr/bin/env python3
"""
PyTorch Usage Example - Indonesian Embedding Model
Demonstrates how to use the PyTorch version of the model
"""

import time
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

def load_model():
    """Load the Indonesian embedding model from the local PyTorch export.

    Returns:
        SentenceTransformer: model ready for ``encode()`` calls.
    """
    print("Loading Indonesian embedding model (PyTorch)...")
    # Path is relative to this script's location; the exported weights are
    # expected one directory up in 'pytorch' -- TODO confirm layout.
    model = SentenceTransformer('../pytorch')
    # Fixed: original used an f-string with no placeholders (ruff F541).
    print("βœ… Model loaded successfully!")
    return model

def basic_usage_example(model):
    """Encode a handful of Indonesian sentences and report timing and shape."""
    print("\n" + "=" * 60)
    print("πŸ“ BASIC USAGE EXAMPLE")
    print("=" * 60)

    # Mix of related (tech) and unrelated (geography, food) sentences.
    sentences = [
        "Teknologi artificial intelligence berkembang pesat",
        "AI dan machine learning sangat canggih",
        "Jakarta adalah ibu kota Indonesia",
        "Saya suka makan nasi goreng",
    ]

    print("Input sentences:")
    for idx, text in enumerate(sentences, 1):
        print(f"  {idx}. {text}")

    print("\nEncoding sentences...")
    t0 = time.time()
    embeddings = model.encode(sentences, show_progress_bar=False)
    elapsed_ms = (time.time() - t0) * 1000

    print(f"βœ… Encoded {len(sentences)} sentences in {elapsed_ms:.1f}ms")
    print(f"πŸ“Š Embedding shape: {embeddings.shape}")
    print(f"πŸ“Š Embedding dimension: {embeddings.shape[1]}")

def similarity_example(model):
    """Score Indonesian sentence pairs and compare against expected buckets."""
    print("\n" + "=" * 60)
    print("🎯 SEMANTIC SIMILARITY EXAMPLE")
    print("=" * 60)

    # (text1, text2, expected similarity bucket)
    test_pairs = [
        ("AI akan mengubah dunia teknologi", "Kecerdasan buatan akan mengubah dunia", "High"),
        ("Jakarta adalah ibu kota Indonesia", "Kota besar dengan banyak penduduk", "Medium"),
        ("Mahasiswa belajar di universitas", "Siswa kuliah di kampus", "High"),
        ("Teknologi sangat canggih", "Kucing suka makan ikan", "Low"),
    ]

    print("Testing semantic similarity on Indonesian text pairs:\n")

    for pair_no, (text1, text2, expected) in enumerate(test_pairs, 1):
        vecs = model.encode([text1, text2])
        similarity = cosine_similarity([vecs[0]], [vecs[1]])[0][0]

        # Bucket the score: >= 0.7 High, >= 0.3 Medium, otherwise Low.
        if similarity >= 0.7:
            category, status = "High", "🟒"
        elif similarity >= 0.3:
            category, status = "Medium", "🟑"
        else:
            category, status = "Low", "πŸ”΄"

        correct = "βœ…" if category == expected else "❌"

        print(f"{correct} Pair {pair_no} ({status} {category}): {similarity:.3f}")
        print(f"   Text 1: '{text1}'")
        print(f"   Text 2: '{text2}'")
        print(f"   Expected: {expected} | Predicted: {category}\n")

def clustering_example(model):
    """Cluster nine Indonesian documents (3 domains) into 3 KMeans groups."""
    print("\n" + "=" * 60)
    print("πŸ—‚οΈ TEXT CLUSTERING EXAMPLE")
    print("=" * 60)

    # Three documents each from technology, education, and food domains.
    documents = [
        # Technology
        "Artificial intelligence mengubah cara kita bekerja",
        "Machine learning membantu prediksi data",
        "Software development membutuhkan keahlian programming",
        # Education
        "Mahasiswa belajar di universitas negeri",
        "Pendidikan tinggi sangat penting untuk masa depan",
        "Dosen mengajar dengan metode yang inovatif",
        # Food
        "Nasi goreng adalah makanan favorit Indonesia",
        "Rendang merupakan masakan tradisional Sumatra",
        "Gado-gado menggunakan bumbu kacang yang lezat",
    ]

    print("Documents to cluster:")
    for doc_no, doc in enumerate(documents, 1):
        print(f"  {doc_no}. {doc}")

    print("\nEncoding documents...")
    embeddings = model.encode(documents, show_progress_bar=False)

    # Imported locally to keep the hard sklearn.cluster dependency scoped
    # to this example only.
    from sklearn.cluster import KMeans

    # Fixed seed + explicit n_init for reproducible cluster assignments.
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(embeddings)

    print(f"\nπŸ“Š Clustering results (3 clusters):")
    for cluster_id in range(3):
        members = [doc for doc, label in zip(documents, clusters) if label == cluster_id]
        print(f"\n🏷️ Cluster {cluster_id + 1}:")
        for doc in members:
            print(f"   - {doc}")

def search_example(model):
    """Rank a small corpus against Indonesian queries by cosine similarity."""
    print("\n" + "=" * 60)
    print("πŸ” SEMANTIC SEARCH EXAMPLE")
    print("=" * 60)

    # Small fixed corpus spanning geography, technology, and food.
    corpus = [
        "Indonesia adalah negara kepulauan terbesar di dunia",
        "Jakarta merupakan ibu kota dan pusat bisnis Indonesia",
        "Bali terkenal sebagai destinasi wisata yang indah",
        "Artificial intelligence mengubah industri teknologi",
        "Machine learning membantu analisis data besar",
        "Robotika masa depan akan sangat canggih",
        "Nasi padang adalah makanan khas Sumatra Barat",
        "Rendang dinobatkan sebagai makanan terlezat dunia",
        "Kuliner Indonesia sangat beragam dan kaya rasa",
    ]

    print("Document corpus:")
    for doc_no, doc in enumerate(corpus, 1):
        print(f"  {doc_no}. {doc}")

    # Corpus embeddings are computed once and reused for every query.
    print("\nEncoding corpus...")
    corpus_embeddings = model.encode(corpus, show_progress_bar=False)

    queries = [
        "teknologi AI dan machine learning",
        "makanan tradisional Indonesia",
        "ibu kota Indonesia",
    ]

    for query in queries:
        print(f"\nπŸ” Query: '{query}'")

        query_embedding = model.encode([query])
        scores = cosine_similarity(query_embedding, corpus_embeddings)[0]

        # Highest-scoring three documents, best first.
        top_indices = np.argsort(scores)[::-1][:3]

        print("πŸ“‹ Top 3 most relevant documents:")
        for rank, idx in enumerate(top_indices, 1):
            print(f"  {rank}. (Score: {scores[idx]:.3f}) {corpus[idx]}")

def performance_benchmark(model):
    """Benchmark encode() latency and throughput across batch sizes.

    Runs 3 timed repetitions per batch size (after a 1-sentence warm-up)
    and prints the mean wall-clock latency plus derived sentences/sec.
    """
    print("\n" + "=" * 60)
    print("⚑ PERFORMANCE BENCHMARK")
    print("=" * 60)

    # 3 distinct sentences repeated 10x -> 30 sentences total, enough to
    # slice out every batch size below.
    test_sentences = [
        "Ini adalah kalimat percobaan untuk mengukur performa",
        "Teknologi artificial intelligence sangat membantu",
        "Indonesia memiliki budaya yang sangat beragam",
    ] * 10  # 30 sentences

    batch_sizes = [1, 5, 10, 30]

    print("Testing encoding performance with different batch sizes:\n")

    for batch_size in batch_sizes:
        sentences_batch = test_sentences[:batch_size]

        # Warm up (first call may pay one-time setup costs).
        model.encode(sentences_batch[:1], show_progress_bar=False)

        # Time 3 runs; fixed: the original bound the encode() result to an
        # unused local -- the embeddings are discarded here on purpose.
        times = []
        for _ in range(3):
            start_time = time.time()
            model.encode(sentences_batch, show_progress_bar=False)
            times.append((time.time() - start_time) * 1000)

        avg_time = np.mean(times)
        throughput = batch_size / (avg_time / 1000)

        print(f"πŸ“Š Batch size {batch_size:2d}: {avg_time:6.1f}ms | {throughput:5.1f} sentences/sec")

def main():
    """Run every example in sequence against a freshly loaded model."""
    print("πŸš€ Indonesian Embedding Model - PyTorch Examples")
    print("This script demonstrates various use cases of the model\n")

    model = load_model()

    # Each example is independent; run them in demo order.
    examples = (
        basic_usage_example,
        similarity_example,
        clustering_example,
        search_example,
        performance_benchmark,
    )
    for example in examples:
        example(model)

    print("\n" + "=" * 60)
    print("βœ… ALL EXAMPLES COMPLETED SUCCESSFULLY!")
    print("=" * 60)
    print("πŸ’‘ Tips:")
    print("   - Use ONNX version for production (7.8x faster)")
    print("   - Model works best with formal Indonesian text")
    print("   - Maximum input length: 384 tokens")
    print("   - For large batches, consider using GPU if available")

if __name__ == "__main__":
    main()