nomic-embed-indonesian / USAGE_EXAMPLES.md
asmud's picture
Upload folder using huggingface_hub
57e0da1 verified

Indonesian Text Embedding Usage Examples

🔍 Search & Retrieval

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

model = SentenceTransformer("asmud/nomic-embed-indonesian", trust_remote_code=True)

# Indonesian search example: every input carries the task prefix the model was trained with
query = "search_query: Bagaimana cara memasak rendang?"
documents = [
    "search_document: Rendang adalah masakan Minangkabau yang dimasak dengan santan dan rempah-rempah",
    "search_document: Nasi goreng adalah makanan yang dibuat dari nasi yang digoreng dengan bumbu",
    "search_document: Sate adalah makanan yang terdiri dari daging yang ditusuk dan dibakar"
]

# Embed the query and candidates, then rank the candidates by cosine similarity.
query_vec = model.encode([query])
doc_vecs = model.encode(documents)

scores = cosine_similarity(query_vec, doc_vecs)[0]
best_match = int(np.argmax(scores))

print(f"Best match: {documents[best_match]}")
print(f"Similarity score: {scores[best_match]:.3f}")

📊 Text Classification

# Sentiment analysis: the "classification: " prefix tells the model these
# embeddings will be used as classifier features.
texts = [
    "classification: Produk ini sangat berkualitas dan sesuai dengan harapan saya",
    "classification: Saya sangat kecewa dengan pelayanan yang diberikan",
    "classification: Lumayan bagus, ada beberapa kekurangan tapi overall oke"
]

embeddings = model.encode(texts)

# The embeddings can now be used with any classifier.
# random_state makes the example reproducible; n_init is pinned explicitly
# because its default changed between scikit-learn versions (10 -> "auto").
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2, random_state=42, n_init=10)  # Positive vs Negative
labels = kmeans.fit_predict(embeddings)

🎯 Clustering Indonesian Content

# Group similar content by topic using the "clustering: " task prefix.
indonesian_texts = [
    "clustering: teknologi kecerdasan buatan dan machine learning",
    "clustering: perkembangan teknologi digital di Indonesia", 
    "clustering: makanan tradisional Jawa seperti gudeg dan tahu gimbal",
    "clustering: kuliner khas Sumatera termasuk rendang dan gulai",
    "clustering: politik dan pemerintahan Indonesia",
    "clustering: kebijakan publik dan reformasi birokrasi"
]

embeddings = model.encode(indonesian_texts)

from sklearn.cluster import AgglomerativeClustering
clustering = AgglomerativeClustering(n_clusters=3)
labels = clustering.fit_predict(embeddings)

# Print each cluster together with the texts assigned to it.
for cluster_id in set(labels):
    print(f"\nCluster {cluster_id}:")
    for text, assigned in zip(indonesian_texts, labels):
        if assigned == cluster_id:
            print(f"  - {text}")

🔗 Semantic Similarity

# Find similar Indonesian sentences by comparing every pair.
sentences = [
    "Jakarta adalah ibukota Indonesia",
    "Ibukota negara Indonesia adalah Jakarta", 
    "Saya suka makan nasi goreng",
    "Cuaca hari ini sangat panas",
    "Hari ini udaranya sangat panas"
]

embeddings = model.encode(sentences)
similarity_matrix = cosine_similarity(embeddings)

print("Similarity Matrix:")
# Walk the strict upper triangle so each pair is reported exactly once.
count = len(sentences)
for i in range(count):
    for j in range(i + 1, count):
        sim = similarity_matrix[i][j]
        print(f"{sim:.3f}: '{sentences[i]}' <-> '{sentences[j]}'")

🏢 Business Applications

Customer Support Ticket Routing

# Route customer complaints to appropriate departments.
# Tickets are queries, department descriptions are the searchable documents.
support_tickets = [
    "search_query: Masalah pembayaran dengan kartu kredit tidak bisa diproses",
    "search_query: Aplikasi sering crash dan tidak bisa dibuka", 
    "search_query: Pesanan belum sampai padahal sudah lewat estimasi"
]

departments = [
    "search_document: Tim finance menangani masalah pembayaran, refund, dan billing",
    "search_document: Tim technical support menangani bug aplikasi dan masalah teknis",
    "search_document: Tim logistics menangani pengiriman, tracking, dan fulfillment"
]

ticket_embeddings = model.encode(support_tickets)
dept_embeddings = model.encode(departments)

# Assign each ticket to the department whose description it matches best.
for ticket, ticket_vec in zip(support_tickets, ticket_embeddings):
    scores = cosine_similarity([ticket_vec], dept_embeddings)[0]
    best_dept = np.argmax(scores)
    print(f"Ticket: {ticket}")
    print(f"Route to: {departments[best_dept]}")
    print(f"Confidence: {scores[best_dept]:.3f}\n")

Content Recommendation

# Recommend articles most similar to the user's stated interest.
user_interest = "search_query: Teknologi AI untuk pendidikan"

articles = [
    "search_document: Penerapan machine learning dalam sistem pembelajaran adaptif di sekolah",
    "search_document: Resep masakan tradisional Indonesia yang mudah dibuat di rumah",
    "search_document: Startup EdTech Indonesia menggunakan AI untuk personalisasi belajar",
    "search_document: Tips kesehatan untuk menjaga imunitas tubuh di musim hujan"
]

interest_vec = model.encode([user_interest])
article_vecs = model.encode(articles)

scores = cosine_similarity(interest_vec, article_vecs)[0]

# Pair each article with its score and sort best-first (stable sort).
ranked = list(zip(articles, scores))
ranked.sort(key=lambda pair: pair[1], reverse=True)

print("Recommended articles:")
for article, score in ranked:
    print(f"{score:.3f}: {article}")

📈 Performance Tips

  1. Batch Processing: Encode multiple texts at once for better performance
# Good: Batch processing — one encode() call amortizes tokenization and
# model-forward overhead across all texts
texts = ["text1", "text2", "text3", ...]
embeddings = model.encode(texts)  # Process all at once

# Avoid: One by one processing — each call pays the full per-call overhead
embeddings = [model.encode([text]) for text in texts]  # Slower
  2. Caching: Cache embeddings for repeated use
import pickle

# Compute once
embeddings = model.encode(large_text_corpus)

# Save for reuse
with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)

# Load when needed
# SECURITY NOTE: pickle.load executes arbitrary code from the file —
# only load pickle files you created yourself, never untrusted downloads.
with open('embeddings.pkl', 'rb') as f:
    cached_embeddings = pickle.load(f)
  3. GPU Acceleration: Use GPU for faster inference (if available)
import torch

# Prefer the GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# trust_remote_code=True is required for this model (same as the loading
# example earlier in this document); without it the load fails.
model = SentenceTransformer("asmud/nomic-embed-indonesian", trust_remote_code=True, device=device)