# Indonesian Text Embedding Usage Examples
## 🔍 **Search & Retrieval**
```python
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the Indonesian embedding model; trust_remote_code lets the custom
# code shipped with the model repository run.
model = SentenceTransformer("asmud/nomic-embed-indonesian", trust_remote_code=True)

# Indonesian search example — queries and documents carry task prefixes.
query = "search_query: Bagaimana cara memasak rendang?"
documents = [
    "search_document: Rendang adalah masakan Minangkabau yang dimasak dengan santan dan rempah-rempah",
    "search_document: Nasi goreng adalah makanan yang dibuat dari nasi yang digoreng dengan bumbu",
    "search_document: Sate adalah makanan yang terdiri dari daging yang ditusuk dan dibakar",
]

# Embed both sides, score each document against the query, and report the top hit.
query_embedding = model.encode([query])
doc_embeddings = model.encode(documents)
similarities = cosine_similarity(query_embedding, doc_embeddings)[0]
best_match = np.argmax(similarities)
print(f"Best match: {documents[best_match]}")
print(f"Similarity score: {similarities[best_match]:.3f}")
```
## 📊 **Text Classification**
```python
# Sentiment analysis — texts use the "classification:" task prefix.
texts = [
    "classification: Produk ini sangat berkualitas dan sesuai dengan harapan saya",
    "classification: Saya sangat kecewa dengan pelayanan yang diberikan",
    "classification: Lumayan bagus, ada beberapa kekurangan tapi overall oke"
]
embeddings = model.encode(texts)

# The embeddings can now be used with any classifier
from sklearn.cluster import KMeans

# random_state makes this example reproducible across runs; n_init=10 pins
# the number of centroid restarts (the scikit-learn default changed to "auto").
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)  # Positive vs Negative
labels = kmeans.fit_predict(embeddings)
```
## 🎯 **Clustering Indonesian Content**
```python
# Group similar content — texts use the "clustering:" task prefix.
indonesian_texts = [
    "clustering: teknologi kecerdasan buatan dan machine learning",
    "clustering: perkembangan teknologi digital di Indonesia",
    "clustering: makanan tradisional Jawa seperti gudeg dan tahu gimbal",
    "clustering: kuliner khas Sumatera termasuk rendang dan gulai",
    "clustering: politik dan pemerintahan Indonesia",
    "clustering: kebijakan publik dan reformasi birokrasi",
]
embeddings = model.encode(indonesian_texts)

from sklearn.cluster import AgglomerativeClustering

clustering = AgglomerativeClustering(n_clusters=3)
labels = clustering.fit_predict(embeddings)

# Walk each cluster id and print its member texts.
for cluster_id in set(labels):
    print(f"\nCluster {cluster_id}:")
    for label, text in zip(labels, indonesian_texts):
        if label == cluster_id:
            print(f" - {text}")
```
## 🔗 **Semantic Similarity**
```python
# Find similar Indonesian sentences
sentences = [
    "Jakarta adalah ibukota Indonesia",
    "Ibukota negara Indonesia adalah Jakarta",
    "Saya suka makan nasi goreng",
    "Cuaca hari ini sangat panas",
    "Hari ini udaranya sangat panas",
]
embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(embeddings)

# Report every unordered pair exactly once (upper triangle of the matrix).
print("Similarity Matrix:")
n_sentences = len(sentences)
for i in range(n_sentences):
    for j in range(i + 1, n_sentences):
        sim = similarity_matrix[i][j]
        print(f"{sim:.3f}: '{sentences[i]}' <-> '{sentences[j]}'")
```
## 🏢 **Business Applications**
### Customer Support Ticket Routing
```python
# Route customer complaints to appropriate departments
support_tickets = [
    "search_query: Masalah pembayaran dengan kartu kredit tidak bisa diproses",
    "search_query: Aplikasi sering crash dan tidak bisa dibuka",
    "search_query: Pesanan belum sampai padahal sudah lewat estimasi",
]
departments = [
    "search_document: Tim finance menangani masalah pembayaran, refund, dan billing",
    "search_document: Tim technical support menangani bug aplikasi dan masalah teknis",
    "search_document: Tim logistics menangani pengiriman, tracking, dan fulfillment",
]

ticket_embeddings = model.encode(support_tickets)
dept_embeddings = model.encode(departments)

# Score each ticket against every department description and route it to
# the closest match.
for ticket, ticket_embedding in zip(support_tickets, ticket_embeddings):
    similarities = cosine_similarity([ticket_embedding], dept_embeddings)[0]
    best_dept = np.argmax(similarities)
    print(f"Ticket: {ticket}")
    print(f"Route to: {departments[best_dept]}")
    print(f"Confidence: {similarities[best_dept]:.3f}\n")
```
### Content Recommendation
```python
# Recommend similar articles
user_interest = "search_query: Teknologi AI untuk pendidikan"
articles = [
    "search_document: Penerapan machine learning dalam sistem pembelajaran adaptif di sekolah",
    "search_document: Resep masakan tradisional Indonesia yang mudah dibuat di rumah",
    "search_document: Startup EdTech Indonesia menggunakan AI untuk personalisasi belajar",
    "search_document: Tips kesehatan untuk menjaga imunitas tubuh di musim hujan",
]

interest_embedding = model.encode([user_interest])
article_embeddings = model.encode(articles)
similarities = cosine_similarity(interest_embedding, article_embeddings)[0]

# Order articles from most to least similar (stable sort keeps the original
# order for equal scores).
scored = list(zip(articles, similarities))
scored.sort(key=lambda pair: pair[1], reverse=True)

print("Recommended articles:")
for article, score in scored:
    print(f"{score:.3f}: {article}")
```
## 📈 **Performance Tips**
1. **Batch Processing**: Encode multiple texts at once for better performance
```python
# Good: hand the whole list to one encode() call — the model batches internally.
texts = ["text1", "text2", "text3", ...]
embeddings = model.encode(texts)  # Process all at once

# Avoid: calling encode() once per text — much slower.
embeddings = [model.encode([text]) for text in texts]
```
2. **Caching**: Cache embeddings for repeated use
```python
import pickle

# Compute the embeddings once for the whole corpus...
embeddings = model.encode(large_text_corpus)

# ...persist them to disk...
with open('embeddings.pkl', 'wb') as fh:
    pickle.dump(embeddings, fh)

# ...and load them back whenever they are needed again.
with open('embeddings.pkl', 'rb') as fh:
    cached_embeddings = pickle.load(fh)
```
3. **GPU Acceleration**: Use GPU for faster inference (if available)
```python
import torch

# Prefer the GPU when one is present; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = SentenceTransformer("asmud/nomic-embed-indonesian", device=device)
```