surazbhandari committed on
Commit
adc0ea3
·
verified ·
1 Parent(s): 3da5f3c

Restore complete repository state (revert docs-only push)

Browse files
.DS_Store ADDED
Binary file (10.2 kB). View file
 
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ models/mini/model.pt filter=lfs diff=lfs merge=lfs -text
2
+ models/mini/model.safetensors filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2024
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
MODEL_CARD.md CHANGED
@@ -55,16 +55,11 @@ from src.inference import EmbeddingInference
55
  # Load -- just like sentence-transformers!
56
  model = EmbeddingInference.from_pretrained("surazbhandari/miniembed")
57
 
58
- # 1. Similarity
59
  score = model.similarity("Machine learning is great", "AI is wonderful")
60
  print(f"Similarity: {score:.4f}") # 0.4287
61
 
62
- # 2. Normal Embeddings
63
- embeddings = model.encode(["Machine learning is great", "AI is wonderful"])
64
- import numpy as np
65
- manual_score = np.dot(embeddings[0], embeddings[1]) # Dot product = Cosine Similarity
66
-
67
- # 3. Semantic Search
68
  docs = ["Python is great for AI", "I love pizza", "Neural networks learn patterns"]
69
  results = model.search("deep learning frameworks", docs, top_k=2)
70
  for r in results:
@@ -72,7 +67,7 @@ for r in results:
72
  # [0.498] Neural networks learn patterns
73
  # [0.413] Python is great for AI
74
 
75
- # 4. Clustering
76
  result = model.cluster_texts(["ML is cool", "Pizza is food", "AI rocks"], n_clusters=2)
77
  # Cluster 1: ['Pizza is food']
78
  # Cluster 2: ['ML is cool', 'AI rocks']
 
55
  # Load -- just like sentence-transformers!
56
  model = EmbeddingInference.from_pretrained("surazbhandari/miniembed")
57
 
58
+ # Similarity
59
  score = model.similarity("Machine learning is great", "AI is wonderful")
60
  print(f"Similarity: {score:.4f}") # 0.4287
61
 
62
+ # Semantic Search
 
 
 
 
 
63
  docs = ["Python is great for AI", "I love pizza", "Neural networks learn patterns"]
64
  results = model.search("deep learning frameworks", docs, top_k=2)
65
  for r in results:
 
67
  # [0.498] Neural networks learn patterns
68
  # [0.413] Python is great for AI
69
 
70
+ # Clustering
71
  result = model.cluster_texts(["ML is cool", "Pizza is food", "AI rocks"], n_clusters=2)
72
  # Cluster 1: ['Pizza is food']
73
  # Cluster 2: ['ML is cool', 'AI rocks']
README.md CHANGED
@@ -82,25 +82,15 @@ from src.inference import EmbeddingInference
82
 
83
  model = EmbeddingInference.from_pretrained("models/mini")
84
 
85
- # 1. Similarity
86
  score = model.similarity("Machine learning is great", "AI is wonderful")
87
  print(f"Similarity: {score:.4f}") # 0.4287
88
 
89
- # 2. Normal Embeddings
90
- embeddings = model.encode(["Machine learning is great", "AI is wonderful"])
91
- import numpy as np
92
- manual_score = np.dot(embeddings[0], embeddings[1]) # Dot product = Cosine Similarity
93
-
94
- # 3. Semantic Search
95
  docs = ["Python is great for AI", "I love pizza", "Neural networks learn patterns"]
96
  results = model.search("deep learning frameworks", docs, top_k=2)
97
  for r in results:
98
  print(f" [{r['score']:.3f}] {r['text']}")
99
-
100
- # 4. Clustering
101
- result = model.cluster_texts(["ML is cool", "Pizza is food", "AI rocks"], n_clusters=2)
102
- # Cluster 1: ['Pizza is food']
103
- # Cluster 2: ['ML is cool', 'AI rocks']
104
  ```
105
 
106
  For full Hugging Face integration, ensure you have `huggingface_hub` installed:
 
82
 
83
  model = EmbeddingInference.from_pretrained("models/mini")
84
 
85
+ # Similarity
86
  score = model.similarity("Machine learning is great", "AI is wonderful")
87
  print(f"Similarity: {score:.4f}") # 0.4287
88
 
89
+ # Semantic Search
 
 
 
 
 
90
  docs = ["Python is great for AI", "I love pizza", "Neural networks learn patterns"]
91
  results = model.search("deep learning frameworks", docs, top_k=2)
92
  for r in results:
93
  print(f" [{r['score']:.3f}] {r['text']}")
 
 
 
 
 
94
  ```
95
 
96
  For full Hugging Face integration, ensure you have `huggingface_hub` installed:
data/sample_data.jsonl ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {"query": "how to train an embedding model", "passage": "Training an embedding model involves using contrastive learning on query-passage pairs.", "source": "sample"}
2
+ {"query": "what is a transformer", "passage": "The Transformer is a deep learning model that uses self-attention mechanisms to process sequence data.", "source": "sample"}
3
+ {"query": "nike air max 90", "passage": "Men's Nike Air Max 90 Casual Shoes in Black and White.", "source": "sample"}
4
+ {"query": "samsung galaxy s21", "passage": "Samsung Galaxy S21 5G 128GB Unlocked Smartphone - Phantom Gray.", "source": "sample"}
5
+ {"query": "best winter coats", "passage": "The North Face Gotham Jacket III is one of the warmest winter parkas for heavy snow.", "source": "sample"}
6
+ {"query": "python programming for beginners", "passage": "Learn Python with this comprehensive guide covering variables, loops, and functions.", "source": "sample"}
7
+ {"query": "benefits of meditation", "passage": "Meditation can reduce stress, improve concentration, and increase happiness.", "source": "sample"}
8
+ {"query": "how to bake chocolate cake", "passage": "Whisk eggs and sugar, then fold in flour and melted chocolate for a perfect moist cake.", "source": "sample"}
9
+ {"query": "what is machine learning", "passage": "Machine learning is a field of AI that allows systems to learn patterns from data without explicit programming.", "source": "sample"}
10
+ {"query": "running shoes for flat feet", "passage": "Brooks Adrenaline GTS 22 provides excellent stability and support for runners with low arches.", "source": "sample"}
demo.py ADDED
@@ -0,0 +1,510 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MiniEmbed - Interactive Demo
3
+ ================================
4
+ Explore the embedding model's capabilities through a Streamlit dashboard.
5
+
6
+ Features:
7
+ - Pairwise text similarity (cosine distance)
8
+ - Semantic document search with ranked results
9
+ - Unsupervised text clustering via K-Means
10
+ - Raw embedding vector inspection and visualization
11
+ - Bulk CSV-to-CSV record matching
12
+
13
+ Run: streamlit run demo.py
14
+ """
15
+
16
+ import streamlit as st
17
+ import numpy as np
18
+ import pandas as pd
19
+ import os
20
+ import sys
21
+ import io
22
+
23
+ # Add src to path
24
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
25
+
26
+ from src.inference import EmbeddingInference, EmbeddingModelManager
27
+
28
+ # ============================================================================
29
+ # PAGE CONFIG
30
+ # ============================================================================
31
+
32
+ st.set_page_config(
33
+ page_title="MiniEmbed Demo",
34
+ page_icon="M",
35
+ layout="wide"
36
+ )
37
+
38
+ # Custom CSS
39
+ st.markdown("""
40
+ <style>
41
+ .main-header {
42
+ font-size: 2.5rem;
43
+ font-weight: 700;
44
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
45
+ -webkit-background-clip: text;
46
+ -webkit-text-fill-color: transparent;
47
+ text-align: center;
48
+ margin-bottom: 1rem;
49
+ }
50
+ .sub-header {
51
+ text-align: center;
52
+ color: #888;
53
+ margin-bottom: 2rem;
54
+ }
55
+ .result-box {
56
+ background: rgba(100, 100, 100, 0.1);
57
+ border-radius: 10px;
58
+ padding: 1rem;
59
+ margin: 0.5rem 0;
60
+ color: inherit;
61
+ }
62
+ .high-score { border-left: 4px solid #28a745; background: rgba(40, 167, 69, 0.1); }
63
+ .medium-score { border-left: 4px solid #ffc107; background: rgba(255, 193, 7, 0.1); }
64
+ .low-score { border-left: 4px solid #dc3545; background: rgba(220, 53, 69, 0.1); }
65
+ .score-text { font-weight: bold; }
66
+ </style>
67
+ """, unsafe_allow_html=True)
68
+
69
+ # ============================================================================
70
+ # LOAD MODEL
71
+ # ============================================================================
72
+
73
@st.cache_resource
def load_model(model_name):
    """Load and cache the embedding model selected in the sidebar.

    Args:
        model_name: Directory name under ``models/`` (e.g. ``"mini"``), or the
            special label ``"Legacy (model/)"`` which maps to the old flat
            ``model/`` directory.

    Returns:
        An ``EmbeddingInference`` instance, or ``None`` if loading fails.

    NOTE(review): the script below already guards ``if model is None:`` — but
    ``from_pretrained`` raises on a missing/corrupt model dir, so that guard
    was previously unreachable and users saw a raw traceback. Returning None
    on failure makes the existing error UI actually fire.
    """
    model_dir = f"models/{model_name}"
    if model_name == "Legacy (model/)":
        model_dir = "model"
    try:
        return EmbeddingInference.from_pretrained(model_dir)
    except Exception:
        # Deliberately broad: any load failure (missing file, bad weights)
        # should surface as the friendly "Model not found" message, not crash.
        return None
80
+
81
+
82
+ # Header
83
+ st.markdown('<h1 class="main-header">MiniEmbed Demo</h1>', unsafe_allow_html=True)
84
+ st.markdown('<p class="sub-header">Explore semantic similarity, search, clustering, and bulk matching</p>', unsafe_allow_html=True)
85
+
86
+ # -----------------------------------------------------------------------------
87
+ # Model Selection
88
+ # -----------------------------------------------------------------------------
89
+ available_models = EmbeddingModelManager.list_models()
90
+ if os.path.exists("model/model.pt"):
91
+ available_models.append("Legacy (model/)")
92
+
93
+ if not available_models:
94
+ st.error("No models found. Train a model first or place weights in models/mini/model.pt.")
95
+ st.info("Models should be located in the `models/` directory (e.g., `models/mini/`).")
96
+ st.stop()
97
+
98
+ selected_model_name = st.sidebar.selectbox(
99
+ "Select Model",
100
+ available_models,
101
+ index=0,
102
+ help="Select which trained model to load for inference."
103
+ )
104
+
105
+ model = load_model(selected_model_name)
106
+
107
+ if model is None:
108
+ st.error("Model not found. Please train the model first.")
109
+ st.stop()
110
+
111
+ # Model info
112
+ with st.expander("Model Info", expanded=False):
113
+ st.markdown("""
114
+ This panel shows the architecture of the currently loaded model.
115
+ - **Embedding Dim**: The size of each output vector (higher = more expressive).
116
+ - **Layers**: Number of Transformer encoder layers stacked in the model.
117
+ - **Vocab Size**: Total number of unique tokens the model can recognize.
118
+ """)
119
+ col1, col2, col3 = st.columns(3)
120
+ with col1:
121
+ st.metric("Embedding Dim", model.model.d_model)
122
+ with col2:
123
+ st.metric("Layers", len(model.model.layers))
124
+ with col3:
125
+ st.metric("Vocab Size", len(model.tokenizer.word_to_id))
126
+
127
+ # ============================================================================
128
+ # TABS
129
+ # ============================================================================
130
+
131
+ tab1, tab2, tab3, tab4, tab5 = st.tabs([
132
+ "Similarity",
133
+ "Semantic Search",
134
+ "Clustering",
135
+ "Encode Text",
136
+ "CSV Matcher"
137
+ ])
138
+
139
+ # ============================================================================
140
+ # TAB 1: SIMILARITY
141
+ # ============================================================================
142
+
143
+ with tab1:
144
+ st.markdown("### Pairwise Text Similarity")
145
+ st.markdown("""
146
+ Enter two texts to compute their **cosine similarity** (range: 0 to 1).
147
+ The model encodes each text into a 256-dimensional vector and measures
148
+ the angular distance between them. A score close to 1.0 means the texts
149
+ are semantically equivalent; a score near 0.0 means they are unrelated.
150
+ """)
151
+
152
+ col1, col2 = st.columns(2)
153
+
154
+ with col1:
155
+ text1 = st.text_area(
156
+ "Text 1",
157
+ "Machine learning is a branch of artificial intelligence",
158
+ height=100,
159
+ key="sim_text1"
160
+ )
161
+
162
+ with col2:
163
+ text2 = st.text_area(
164
+ "Text 2",
165
+ "AI systems can learn patterns from data",
166
+ height=100,
167
+ key="sim_text2"
168
+ )
169
+
170
+ if st.button("Compute Similarity", type="primary", key="sim_btn"):
171
+ if text1 and text2:
172
+ with st.spinner("Computing..."):
173
+ similarity = model.similarity(text1, text2)
174
+
175
+ if similarity > 0.7:
176
+ color = "#28a745"
177
+ label = "Very Similar"
178
+ elif similarity > 0.4:
179
+ color = "#ffc107"
180
+ label = "Somewhat Similar"
181
+ else:
182
+ color = "#dc3545"
183
+ label = "Not Similar"
184
+
185
+ st.markdown(f"""
186
+ <div style="text-align: center; padding: 2rem;">
187
+ <div style="font-size: 4rem; font-weight: bold; color: {color};">
188
+ {similarity:.3f}
189
+ </div>
190
+ <div style="font-size: 1.2rem; color: {color};">
191
+ {label}
192
+ </div>
193
+ </div>
194
+ """, unsafe_allow_html=True)
195
+
196
+ # Example pairs
197
+ st.markdown("---")
198
+ st.markdown("#### Example Pairs")
199
+ st.markdown("These pairs demonstrate how the model distinguishes related from unrelated content:")
200
+
201
+ examples = [
202
+ ("Python is a programming language", "Java is used for software development"),
203
+ ("The cat sat on the mat", "A feline rested on the rug"),
204
+ ("Machine learning is fascinating", "I love eating pizza"),
205
+ ]
206
+
207
+ for t1, t2 in examples:
208
+ similarity = model.similarity(t1, t2)
209
+
210
+ if similarity > 0.5:
211
+ css_class = "high-score"
212
+ elif similarity > 0.3:
213
+ css_class = "medium-score"
214
+ else:
215
+ css_class = "low-score"
216
+
217
+ st.markdown(f"""
218
+ <div class="result-box {css_class}">
219
+ <strong>{similarity:.3f}</strong> | "{t1}" vs "{t2}"
220
+ </div>
221
+ """, unsafe_allow_html=True)
222
+
223
+ # ============================================================================
224
+ # TAB 2: SEMANTIC SEARCH
225
+ # ============================================================================
226
+
227
+ with tab2:
228
+ st.markdown("### Semantic Document Search")
229
+ st.markdown("""
230
+ Enter a natural-language query. The model encodes your query and all
231
+ documents into the same vector space, then ranks documents by cosine
232
+ similarity. This finds **meaning-based** matches, not just keyword overlap.
233
+ """)
234
+
235
+ default_docs = """Python is a high-level programming language
236
+ Machine learning algorithms learn patterns from data
237
+ The weather today is sunny and warm
238
+ Neural networks are inspired by the human brain
239
+ JavaScript is used for web development
240
+ Deep learning has transformed computer vision
241
+ Cats are popular pets around the world
242
+ TensorFlow and PyTorch are ML frameworks
243
+ The stock market had a volatile day
244
+ Natural language processing understands text"""
245
+
246
+ query = st.text_input(
247
+ "Search Query",
248
+ "How do AI systems learn from examples?",
249
+ key="search_query"
250
+ )
251
+
252
+ documents_text = st.text_area(
253
+ "Documents (one per line)",
254
+ default_docs,
255
+ height=200,
256
+ key="search_docs"
257
+ )
258
+
259
+ top_k = st.slider("Number of results", 1, 10, 5, key="search_topk")
260
+
261
+ if st.button("Search", type="primary", key="search_btn"):
262
+ documents = [d.strip() for d in documents_text.split('\n') if d.strip()]
263
+
264
+ if query and documents:
265
+ with st.spinner("Searching..."):
266
+ results = model.search(query, documents, top_k=top_k)
267
+
268
+ st.markdown("### Results")
269
+ st.markdown("Documents ranked by semantic relevance to your query:")
270
+
271
+ for r in results:
272
+ score = r['score']
273
+ if score > 0.6:
274
+ indicator = "[HIGH]"
275
+ css_class = "high-score"
276
+ elif score > 0.4:
277
+ indicator = "[MED]"
278
+ css_class = "medium-score"
279
+ else:
280
+ indicator = "[LOW]"
281
+ css_class = "low-score"
282
+
283
+ st.markdown(f"""
284
+ <div class="result-box {css_class}">
285
+ <strong>{indicator} #{r['rank']}</strong> (score: {score:.4f})<br>
286
+ {r['text']}
287
+ </div>
288
+ """, unsafe_allow_html=True)
289
+
290
+ # ============================================================================
291
+ # TAB 3: CLUSTERING
292
+ # ============================================================================
293
+
294
+ with tab3:
295
+ st.markdown("### Unsupervised Text Clustering")
296
+ st.markdown("""
297
+ The model encodes each text into a dense vector. K-Means clustering
298
+ then groups these vectors by proximity in the embedding space.
299
+ Texts that are semantically similar end up in the same cluster,
300
+ even if they share no common words.
301
+ """)
302
+
303
+ default_cluster_texts = """Python programming language
304
+ Machine learning algorithms
305
+ Deep learning neural networks
306
+ JavaScript web development
307
+ Cats and dogs as pets
308
+ Pizza and pasta Italian food
309
+ Sunny weather today
310
+ Rainy day forecast
311
+ Stock market trends
312
+ Financial news update"""
313
+
314
+ cluster_texts = st.text_area(
315
+ "Texts to cluster (one per line)",
316
+ default_cluster_texts,
317
+ height=200,
318
+ key="cluster_texts"
319
+ )
320
+
321
+ n_clusters = st.slider("Number of clusters", 2, 10, 3, key="n_clusters")
322
+
323
+ if st.button("Run Clustering", type="primary", key="cluster_btn"):
324
+ texts = [t.strip() for t in cluster_texts.split('\n') if t.strip()]
325
+
326
+ if len(texts) >= n_clusters:
327
+ with st.spinner("Clustering..."):
328
+ result = model.cluster_texts(texts, n_clusters=n_clusters)
329
+
330
+ st.markdown("### Cluster Assignments")
331
+ st.markdown("Each group contains texts that the model considers semantically related:")
332
+
333
+ colors = ["#667eea", "#28a745", "#ffc107", "#dc3545", "#17a2b8",
334
+ "#6f42c1", "#fd7e14", "#20c997", "#e83e8c", "#6c757d"]
335
+
336
+ for cluster_id in sorted(result['texts_by_cluster'].keys()):
337
+ cluster_texts_list = result['texts_by_cluster'][cluster_id]
338
+ color = colors[cluster_id % len(colors)]
339
+
340
+ st.markdown(f"""
341
+ <div style="background: {color}15; border-left: 4px solid {color};
342
+ padding: 1rem; border-radius: 5px; margin: 0.5rem 0;">
343
+ <strong style="color: {color};">Cluster {cluster_id + 1}</strong>
344
+ ({len(cluster_texts_list)} texts)
345
+ </div>
346
+ """, unsafe_allow_html=True)
347
+
348
+ for text in cluster_texts_list:
349
+ st.markdown(f" - {text}")
350
+ else:
351
+ st.warning(f"Need at least {n_clusters} texts to create {n_clusters} clusters.")
352
+
353
+ # ============================================================================
354
+ # TAB 4: ENCODE TEXT
355
+ # ============================================================================
356
+
357
+ with tab4:
358
+ st.markdown("### Raw Embedding Inspector")
359
+ st.markdown("""
360
+ Convert any text into its dense vector representation. The output is a
361
+ 256-dimensional float vector that is **L2-normalized** (unit length = 1.0).
362
+ This is the same representation used internally for similarity and search.
363
+ """)
364
+
365
+ encode_text = st.text_area(
366
+ "Text to encode",
367
+ "Machine learning is a fascinating field of study.",
368
+ height=100,
369
+ key="encode_text"
370
+ )
371
+
372
+ if st.button("Encode", type="primary", key="encode_btn"):
373
+ if encode_text:
374
+ with st.spinner("Encoding..."):
375
+ embedding = model.encode(encode_text)
376
+
377
+ st.markdown("### Embedding Vector")
378
+
379
+ col1, col2, col3 = st.columns(3)
380
+ with col1:
381
+ st.metric("Dimensions", embedding.shape[1])
382
+ with col2:
383
+ st.metric("L2 Norm", f"{np.linalg.norm(embedding[0]):.4f}")
384
+ with col3:
385
+ st.metric("Mean Value", f"{embedding[0].mean():.4f}")
386
+
387
+ st.markdown("#### First 20 values:")
388
+ st.code(str(embedding[0][:20].round(4).tolist()))
389
+
390
+ st.markdown("#### Value Distribution")
391
+ st.markdown("A well-trained model produces a roughly Gaussian distribution centered near zero:")
392
+ import plotly.express as px
393
+ fig = px.histogram(
394
+ x=embedding[0],
395
+ nbins=50,
396
+ title="Embedding Value Distribution",
397
+ labels={'x': 'Value', 'y': 'Count'}
398
+ )
399
+ fig.update_layout(showlegend=False)
400
+ st.plotly_chart(fig, width="stretch")
401
+
402
+ # ============================================================================
403
+ # TAB 5: CSV MATCHER
404
+ # ============================================================================
405
+
406
+ with tab5:
407
+ st.markdown("### Bulk CSV Record Matcher")
408
+ st.markdown("""
409
+ Upload two CSV files and match rows across them using semantic similarity.
410
+ This is useful for:
411
+ - **Product deduplication** across e-commerce platforms
412
+ - **Record linkage** between databases with inconsistent naming
413
+ - **Cross-platform mapping** (e.g., matching supplier catalogs to your inventory)
414
+
415
+ The model encodes the selected text column from each CSV, then ranks
416
+ every row in CSV 2 against each row in CSV 1 by cosine similarity.
417
+ """)
418
+
419
+ col1, col2 = st.columns(2)
420
+
421
+ with col1:
422
+ st.markdown("#### Upload CSV 1 (Queries)")
423
+ file1 = st.file_uploader("Upload primary CSV", type=['csv'], key="csv_file_1")
424
+
425
+ with col2:
426
+ st.markdown("#### Upload CSV 2 (Knowledge Base)")
427
+ file2 = st.file_uploader("Upload secondary CSV", type=['csv'], key="csv_file_2")
428
+
429
+ if file1 and file2:
430
+ df1 = pd.read_csv(file1)
431
+ df2 = pd.read_csv(file2)
432
+
433
+ st.markdown("---")
434
+ col_m1, col_m2 = st.columns(2)
435
+
436
+ with col_m1:
437
+ col1_name = st.selectbox("Select column to match from CSV 1", df1.columns, key="col1_sel")
438
+
439
+ with col_m2:
440
+ col2_name = st.selectbox("Select column to search in CSV 2", df2.columns, key="col2_sel")
441
+
442
+ col_p1, col_p2 = st.columns(2)
443
+ with col_p1:
444
+ top_n_candidates = st.slider("Step 1: Top candidates to fetch", 1, 50, 10, help="Initial semantic search depth")
445
+ with col_p2:
446
+ top_m_final = st.slider("Step 2: Top matches to keep", 1, 10, 3, help="Final number of matches per row")
447
+
448
+ if st.button("Start Bulk Matching", type="primary"):
449
+ progress_bar = st.progress(0)
450
+ status_text = st.empty()
451
+
452
+ queries = df1[col1_name].fillna("").astype(str).tolist()
453
+ corpus = df2[col2_name].fillna("").astype(str).tolist()
454
+
455
+ status_text.text("Encoding search corpus (CSV 2)...")
456
+ corpus_embs = model.encode(corpus, batch_size=128)
457
+ progress_bar.progress(20)
458
+
459
+ status_text.text("Encoding queries (CSV 1)...")
460
+ query_embs = model.encode(queries, batch_size=128)
461
+ progress_bar.progress(50)
462
+
463
+ status_text.text("Computing similarities and mapping...")
464
+ similarities = np.dot(query_embs, corpus_embs.T)
465
+ progress_bar.progress(80)
466
+
467
+ all_results = []
468
+ for i in range(len(queries)):
469
+ row_scores = similarities[i]
470
+ top_indices = np.argsort(row_scores)[::-1][:top_m_final]
471
+
472
+ res_row = df1.iloc[i].to_dict()
473
+ for rank, idx in enumerate(top_indices, 1):
474
+ res_row[f'Match_{rank}_{col2_name}'] = corpus[idx]
475
+ res_row[f'Match_{rank}_Score'] = round(float(row_scores[idx]), 4)
476
+ all_results.append(res_row)
477
+
478
+ res_df = pd.DataFrame(all_results)
479
+
480
+ progress_bar.progress(100)
481
+ status_text.text("Matching complete.")
482
+
483
+ st.markdown("### Results Preview")
484
+ st.dataframe(res_df.head(50), width="stretch")
485
+
486
+ output = io.StringIO()
487
+ res_df.to_csv(output, index=False)
488
+ csv_string = output.getvalue()
489
+
490
+ st.download_button(
491
+ label="Download Full Results CSV",
492
+ data=csv_string,
493
+ file_name="semantic_matching_results.csv",
494
+ mime="text/csv",
495
+ )
496
+ else:
497
+ st.info("Upload both CSV files to begin matching.")
498
+
499
+
500
+ # ============================================================================
501
+ # FOOTER
502
+ # ============================================================================
503
+
504
+ st.markdown("---")
505
+ st.markdown("""
506
+ <div style="text-align: center; color: #666; padding: 1rem;">
507
+ <strong>MiniEmbed</strong> | Lightweight Text Embeddings |
508
+ <a href="https://github.com/bhandarisuraz/miniembed">GitHub</a>
509
+ </div>
510
+ """, unsafe_allow_html=True)
examples/basic_usage.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Basic Usage Example
3
+ ===================
4
+ Demonstrates encoding texts and computing similarity using MiniEmbed.
5
+
6
+ This script shows the three core operations:
7
+ 1. Encoding raw text into dense vectors
8
+ 2. Computing pairwise similarity between two texts
9
+ 3. Building a full similarity matrix across sets of texts
10
+ """
11
+
12
+ import sys
13
+ sys.path.insert(0, '..')
14
+
15
+ from src.inference import EmbeddingInference
16
+
17
+
18
def main():
    """Run the three basic MiniEmbed demos: encode, similarity, matrix.

    Expects to be launched from the ``examples/`` directory (the model is
    loaded via the relative path ``../models/mini``).
    """
    print("=" * 60)
    print("MiniEmbed - Basic Usage Example")
    print("=" * 60)

    # Load the model (relative path — assumes cwd is examples/; TODO confirm)
    print("\nLoading model...")
    model = EmbeddingInference.from_pretrained("../models/mini")
    print("Model loaded.\n")

    # -------------------------------------------------------------------------
    # Example 1: Encode texts
    # -------------------------------------------------------------------------
    print("-" * 40)
    print("Example 1: Encoding Texts")
    print("-" * 40)

    texts = [
        "Machine learning is a branch of artificial intelligence",
        "Deep learning uses neural networks with many layers",
        "I love eating pizza on weekends"
    ]

    embeddings = model.encode(texts)
    print(f"Input: {len(texts)} texts")
    print(f"Output: {embeddings.shape}")  # (3, 256)

    # -------------------------------------------------------------------------
    # Example 2: Compute similarity
    # -------------------------------------------------------------------------
    print("\n" + "-" * 40)
    print("Example 2: Computing Similarity")
    print("-" * 40)

    pairs = [
        ("Machine learning is great", "AI is wonderful"),
        ("Machine learning is great", "I love pizza"),
        ("The cat sat on the mat", "A feline rested on the rug"),
    ]

    for text1, text2 in pairs:
        similarity = model.similarity(text1, text2)
        # " LOW" carries a leading space so both tags print at equal width.
        tag = "MATCH" if similarity > 0.5 else " LOW"
        print(f" [{tag}] {similarity:.4f} | '{text1}' vs '{text2}'")

    # -------------------------------------------------------------------------
    # Example 3: Pairwise similarity matrix
    # -------------------------------------------------------------------------
    print("\n" + "-" * 40)
    print("Example 3: Pairwise Similarity Matrix")
    print("-" * 40)

    texts_a = ["Machine learning", "Deep learning", "Natural language"]
    texts_b = ["AI models", "Neural networks", "Text processing"]

    # Matrix is indexed [i, j] below, so presumably shape (len(texts_a),
    # len(texts_b)) — verify against EmbeddingInference.pairwise_similarity.
    similarity_matrix = model.pairwise_similarity(texts_a, texts_b)

    print("\nSimilarity Matrix:")
    # Column headers truncated to 10 chars and right-aligned to match rows.
    print(" ", " ".join(f"{t[:10]:>10}" for t in texts_b))
    for i, text in enumerate(texts_a):
        row = " ".join(f"{similarity_matrix[i, j]:>10.4f}" for j in range(len(texts_b)))
        print(f"{text[:12]:>12}: {row}")

    print("\nDone.")


if __name__ == "__main__":
    main()
examples/clustering.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text Clustering Example
3
+ =======================
4
+ Demonstrates how to cluster texts by semantic similarity using MiniEmbed.
5
+
6
+ The model encodes each text into a dense vector. K-Means clustering then
7
+ groups these vectors by proximity in the embedding space, even if the texts
8
+ share no common words.
9
+ """
10
+
11
+ import sys
12
+ sys.path.insert(0, '..')
13
+
14
+ from src.inference import EmbeddingInference
15
+
16
+
17
def main():
    """Cluster a mixed-topic text collection with MiniEmbed and print results.

    The 20 texts are authored as 4 topics x 5 texts (Technology, Food,
    Sports, Nature), and that fixed ordering is relied on later when the
    expected topic is recovered with ``i // 5``.
    """
    print("=" * 60)
    print("MiniEmbed - Text Clustering Example")
    print("=" * 60)

    # Load the model (relative path — assumes cwd is examples/; TODO confirm)
    print("\nLoading model...")
    model = EmbeddingInference.from_pretrained("../models/mini")
    print("Model loaded.\n")

    # -------------------------------------------------------------------------
    # Text collection (mixed topics)
    # -------------------------------------------------------------------------
    texts = [
        # Technology
        "Python is a versatile programming language",
        "Machine learning models learn from data",
        "JavaScript is used for web development",
        "Neural networks process information like the brain",
        "Software engineering involves designing systems",

        # Food
        "Pizza is my favorite Italian dish",
        "Sushi is a traditional Japanese cuisine",
        "Tacos are delicious Mexican street food",
        "Pasta with marinara sauce is comforting",
        "Ramen noodles are popular in Japan",

        # Sports
        "Football is the most popular sport worldwide",
        "Basketball requires teamwork and skill",
        "Tennis is an exciting individual sport",
        "Swimming is great for cardiovascular health",
        "Soccer World Cup attracts billions of viewers",

        # Nature
        "Mountains offer breathtaking scenic views",
        "Oceans cover most of the Earth's surface",
        "Forests are home to diverse wildlife",
        "Rivers provide fresh water to ecosystems",
        "Deserts have extreme temperature variations",
    ]

    print(f"Text Collection: {len(texts)} texts (4 topics)")

    # -------------------------------------------------------------------------
    # Cluster texts
    # -------------------------------------------------------------------------
    print("\nClustering texts into 4 groups...")

    # result is used below as a dict with 'texts_by_cluster' (cluster_id ->
    # list of texts) and 'labels' (per-text cluster id) — verify against
    # EmbeddingInference.cluster_texts.
    result = model.cluster_texts(texts, n_clusters=4)

    # -------------------------------------------------------------------------
    # Display results
    # -------------------------------------------------------------------------
    print("\n" + "=" * 60)
    print("Clustering Results")
    print("=" * 60)

    for cluster_id in sorted(result['texts_by_cluster'].keys()):
        cluster_texts = result['texts_by_cluster'][cluster_id]

        # Cluster ids are 0-based internally; displayed 1-based.
        print(f"\n Cluster {cluster_id + 1} ({len(cluster_texts)} texts)")
        print("-" * 40)

        for text in cluster_texts:
            print(f" - {text}")

    # -------------------------------------------------------------------------
    # Evaluate clustering (simple check)
    # -------------------------------------------------------------------------
    print("\n" + "=" * 60)
    print("Clustering Analysis")
    print("=" * 60)

    # Expected groupings (approximate) — slices rely on the authored ordering
    # of `texts` above (5 consecutive texts per topic).
    expected = {
        "Technology": texts[0:5],
        "Food": texts[5:10],
        "Sports": texts[10:15],
        "Nature": texts[15:20],
    }

    print("\nLabels assigned to each text:")
    for i, (text, label) in enumerate(zip(texts, result['labels'])):
        # i // 5 maps each text back to its intended topic name.
        topic = list(expected.keys())[i // 5]
        print(f" [{label}] ({topic}) {text[:50]}...")

    print("\nDone.")


if __name__ == "__main__":
    main()
examples/semantic_search.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Semantic Search Example
3
+ =======================
4
+ Demonstrates how to use MiniEmbed for document retrieval.
5
+
6
+ The model encodes a query and a corpus of documents into the same vector space,
7
+ then ranks documents by cosine similarity to the query. This finds results based
8
+ on meaning, not keyword overlap.
9
+ """
10
+
11
+ import sys
12
+ sys.path.insert(0, '..')
13
+
14
+ from src.inference import EmbeddingInference
15
+
16
+
17
+ def main():
18
+ print("=" * 60)
19
+ print("MiniEmbed - Semantic Search Example")
20
+ print("=" * 60)
21
+
22
+ # Load the model
23
+ print("\nLoading model...")
24
+ model = EmbeddingInference.from_pretrained("../models/mini")
25
+ print("Model loaded.\n")
26
+
27
+ # -------------------------------------------------------------------------
28
+ # Document collection
29
+ # -------------------------------------------------------------------------
30
+ documents = [
31
+ "Python is a high-level programming language known for its simplicity",
32
+ "Machine learning algorithms can learn patterns from data",
33
+ "The weather today is sunny with a high of 75 degrees",
34
+ "Neural networks are computational models inspired by the brain",
35
+ "JavaScript is widely used for web development",
36
+ "Deep learning has revolutionized computer vision and NLP",
37
+ "Cats are popular pets known for their independence",
38
+ "TensorFlow and PyTorch are popular deep learning frameworks",
39
+ "The stock market showed strong gains today",
40
+ "Natural language processing helps computers understand text"
41
+ ]
42
+
43
+ print(f"Document Collection: {len(documents)} documents")
44
+ for i, doc in enumerate(documents, 1):
45
+ print(f" {i}. {doc[:60]}...")
46
+
47
+ # -------------------------------------------------------------------------
48
+ # Search queries
49
+ # -------------------------------------------------------------------------
50
+ queries = [
51
+ "How do AI systems learn from examples?",
52
+ "What programming language is good for beginners?",
53
+ "Tell me about artificial neural networks",
54
+ ]
55
+
56
+ print("\n" + "=" * 60)
57
+ print("Search Results")
58
+ print("=" * 60)
59
+
60
+ for query in queries:
61
+ print(f"\n Query: \"{query}\"")
62
+ print("-" * 50)
63
+
64
+ results = model.search(query, documents, top_k=3)
65
+
66
+ for r in results:
67
+ score = r['score']
68
+ if score > 0.6:
69
+ tag = "[HIGH]"
70
+ elif score > 0.4:
71
+ tag = "[ MED]"
72
+ else:
73
+ tag = "[ LOW]"
74
+
75
+ print(f" {tag} #{r['rank']} (score: {score:.4f})")
76
+ print(f" {r['text']}")
77
+
78
+ # -------------------------------------------------------------------------
79
+ # Interactive search (optional)
80
+ # -------------------------------------------------------------------------
81
+ print("\n" + "=" * 60)
82
+ print("Interactive Search")
83
+ print("=" * 60)
84
+ print("Enter your own queries (type 'quit' to exit):\n")
85
+
86
+ while True:
87
+ try:
88
+ query = input(" Query: ").strip()
89
+ if query.lower() in ['quit', 'exit', 'q']:
90
+ break
91
+ if not query:
92
+ continue
93
+
94
+ results = model.search(query, documents, top_k=3)
95
+
96
+ print("\n Results:")
97
+ for r in results:
98
+ print(f" - [{r['score']:.3f}] {r['text'][:70]}...")
99
+ print()
100
+
101
+ except (KeyboardInterrupt, EOFError):
102
+ break
103
+
104
+ print("\nDone.")
105
+
106
+
107
+ if __name__ == "__main__":
108
+ main()
models/large/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # MiniEmbed - Large
2
+
3
+ Full-scale variant for maximum accuracy on complex semantic tasks.
4
+
5
+ Coming soon...
models/medium/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # MiniEmbed - Medium
2
+
3
+ Balanced variant offering higher accuracy with moderate compute requirements.
4
+
5
+ Coming soon...
models/mini/config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "vocab_size": 30000,
3
+ "d_model": 256,
4
+ "num_heads": 4,
5
+ "num_layers": 4,
6
+ "d_ff": 1024,
7
+ "max_seq_len": 128,
8
+ "pad_token_id": 0,
9
+ "size_name": "mini"
10
+ }
models/mini/model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b039b35819583641fc877c8aa3ce28aadc098d0ef79757c7bb8683141d2cde21
3
+ size 43508675
models/mini/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f31f7aa0a6ea31a7cb1c0e68cf3a914b589866eef6b4379314de8ce64a139c8
3
+ size 43495744
models/mini/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/mini/training_info.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epochs": 10,
3
+ "total_steps": 149130,
4
+ "final_loss": 0.07483314797282219,
5
+ "training_samples": 3817707,
6
+ "batch_size": 256,
7
+ "learning_rate": 0.0002,
8
+ "date": "2026-02-13 22:53:59",
9
+ "training_time_minutes": 2940.0035917321843
10
+ }
models/product/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # MiniEmbed - Product
2
+
3
+ Fine-tuned variant of Mini, specialized for high-accuracy product matching.
4
+
5
+ Coming soon...
models/small/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # MiniEmbed - Small
2
+
3
+ A larger variant with increased capacity for general-purpose embeddings.
4
+
5
+ Coming soon...
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core
2
+ torch>=2.0.0
3
+ numpy>=1.21.0
4
+ tqdm>=4.64.0
5
+
6
+ # Demo UI
7
+ streamlit>=1.30.0
8
+ plotly>=5.0.0
9
+
10
+ # Optional (for clustering, CSV processing, & Benchmarking)
11
+ scikit-learn>=1.0.0
12
+ pandas>=2.0.0
13
+ psutil>=5.9.0
14
+ sentence-transformers>=2.2.0
src/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ MiniEmbed - Lightweight Text Embedding Model
3
+ """
4
+
5
+ from .model import MiniTransformerEmbedding
6
+ from .tokenizer import SimpleTokenizer
7
+ from .inference import EmbeddingInference, EmbeddingModelManager
8
+
9
+ __version__ = "1.0.0"
10
+ __all__ = [
11
+ "MiniTransformerEmbedding",
12
+ "SimpleTokenizer",
13
+ "EmbeddingInference",
14
+ "EmbeddingModelManager"
15
+ ]
src/inference.py ADDED
@@ -0,0 +1,353 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Model Saving & Inference Module
3
+ ===================================
4
+ Easy-to-use API for loading and running inference with the embedding model.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ import json
11
+ import os
12
+ from pathlib import Path
13
+ from typing import List, Dict, Union, Tuple
14
+
15
+ from .model import MiniTransformerEmbedding
16
+ from .tokenizer import SimpleTokenizer
17
+
18
+
19
+ class EmbeddingModelManager:
20
+ """
21
+ Handles saving and loading the embedding model.
22
+
23
+ Save structure:
24
+ model_dir/
25
+ ├── config.json # Model architecture config
26
+ ├── model.pt # Model weights
27
+ ├── tokenizer.json # Vocabulary
28
+ └── training_info.json # Training metadata (optional)
29
+ """
30
+
31
+ @staticmethod
32
+ def save_model(
33
+ model: MiniTransformerEmbedding,
34
+ tokenizer: SimpleTokenizer,
35
+ save_dir: str,
36
+ training_info: dict = None
37
+ ):
38
+ """
39
+ Save model, tokenizer, and config for later use.
40
+
41
+ Args:
42
+ model: Trained MiniTransformerEmbedding
43
+ tokenizer: SimpleTokenizer with vocabulary
44
+ save_dir: Directory to save model
45
+ training_info: Optional training metadata
46
+ """
47
+ save_dir = Path(save_dir)
48
+ save_dir.mkdir(parents=True, exist_ok=True)
49
+
50
+ # 1. Save model config
51
+ config = {
52
+ 'vocab_size': len(tokenizer.word_to_id),
53
+ 'd_model': model.d_model,
54
+ 'num_heads': model.layers[0].attention.num_heads,
55
+ 'num_layers': len(model.layers),
56
+ 'd_ff': model.layers[0].feed_forward.linear1.out_features,
57
+ 'max_seq_len': model.positional_encoding.pe.size(1),
58
+ 'pad_token_id': model.pad_token_id,
59
+ 'size_name': save_dir.name # Use folder name as size name
60
+ }
61
+
62
+ with open(save_dir / 'config.json', 'w') as f:
63
+ json.dump(config, f, indent=2)
64
+
65
+ # 2. Save model weights
66
+ torch.save(model.state_dict(), save_dir / 'model.pt')
67
+
68
+ # 3. Save tokenizer vocabulary
69
+ tokenizer.save(str(save_dir / 'tokenizer.json'))
70
+
71
+ # 4. Save training info (optional)
72
+ if training_info:
73
+ with open(save_dir / 'training_info.json', 'w') as f:
74
+ json.dump(training_info, f, indent=2)
75
+
76
+ print(f"Model saved to: {save_dir}")
77
+
78
+ @staticmethod
79
+ def load_model(model_dir: str, device: str = None) -> Tuple[MiniTransformerEmbedding, SimpleTokenizer]:
80
+ """
81
+ Load model and tokenizer from a local directory or HuggingFace repo.
82
+
83
+ Args:
84
+ model_dir: Local directory path OR HuggingFace repo ID
85
+ (e.g., "surazbhandari/miniembed")
86
+ device: Device to load model on ('cpu', 'cuda', 'mps')
87
+
88
+ Returns:
89
+ (model, tokenizer) tuple
90
+ """
91
+ # Auto-detect HuggingFace repo ID (contains "/" but is not a local path)
92
+ if '/' in model_dir and not os.path.exists(model_dir):
93
+ model_dir = EmbeddingModelManager._download_from_hub(model_dir)
94
+
95
+ model_dir = Path(model_dir)
96
+
97
+ if device is None:
98
+ if torch.cuda.is_available():
99
+ device = 'cuda'
100
+ elif torch.backends.mps.is_available():
101
+ device = 'mps'
102
+ else:
103
+ device = 'cpu'
104
+
105
+ # 1. Load config
106
+ config_path = model_dir / 'config.json'
107
+
108
+ with open(config_path, 'r') as f:
109
+ config = json.load(f)
110
+
111
+ # 2. Load tokenizer
112
+ tokenizer_path = model_dir / 'tokenizer.json'
113
+
114
+ tokenizer = SimpleTokenizer(vocab_size=config['vocab_size'])
115
+ tokenizer.load(str(tokenizer_path))
116
+
117
+ # 3. Create and load model
118
+ model = MiniTransformerEmbedding(
119
+ vocab_size=config['vocab_size'],
120
+ d_model=config['d_model'],
121
+ num_heads=config['num_heads'],
122
+ num_layers=config['num_layers'],
123
+ d_ff=config['d_ff'],
124
+ max_seq_len=config['max_seq_len'],
125
+ pad_token_id=config['pad_token_id']
126
+ )
127
+
128
+ # Load weights (prefer safetensors)
129
+ st_path = model_dir / 'model.safetensors'
130
+ pt_path = model_dir / 'model.pt'
131
+
132
+ if st_path.exists():
133
+ from safetensors.torch import load_file
134
+ state_dict = load_file(str(st_path), device=device)
135
+ elif pt_path.exists():
136
+ state_dict = torch.load(pt_path, map_location=device, weights_only=True)
137
+ else:
138
+ raise FileNotFoundError(f"Neither model.safetensors nor model.pt found in {model_dir}")
139
+
140
+ model.load_state_dict(state_dict)
141
+ model = model.to(device)
142
+ model.eval()
143
+
144
+ return model, tokenizer
145
+
146
+ @staticmethod
147
+ def _download_from_hub(repo_id: str) -> str:
148
+ """
149
+ Download model files from a HuggingFace repository.
150
+
151
+ Args:
152
+ repo_id: HuggingFace repo ID (e.g., "surazbhandari/miniembed")
153
+
154
+ Returns:
155
+ Local directory path containing the downloaded files.
156
+ """
157
+ try:
158
+ from huggingface_hub import hf_hub_download, snapshot_download
159
+ except ImportError:
160
+ raise ImportError(
161
+ "huggingface_hub is required to download models from HuggingFace. "
162
+ "Install it with: pip install huggingface_hub"
163
+ )
164
+
165
+ # Download the full model snapshot
166
+ local_dir = snapshot_download(
167
+ repo_id=repo_id,
168
+ allow_patterns=["config.json", "model.safetensors", "model.pt", "tokenizer.json", "training_info.json"],
169
+ )
170
+
171
+ return local_dir
172
+
173
+ @staticmethod
174
+ def list_models(base_dir: str = "models") -> List[str]:
175
+ """
176
+ List available model names in the base directory.
177
+
178
+ Returns:
179
+ List of directory names containing valid models
180
+ """
181
+ path = Path(base_dir)
182
+ if not path.exists():
183
+ return []
184
+ return sorted([d.name for d in path.iterdir() if d.is_dir() and (d / "model.pt").exists()])
185
+
186
+ class EmbeddingInference:
187
+ """
188
+ High-level inference API for the embedding model.
189
+
190
+ Usage:
191
+ # From local directory
192
+ model = EmbeddingInference.from_pretrained("./models/mini")
193
+
194
+ # From HuggingFace
195
+ model = EmbeddingInference.from_pretrained("surazbhandari/miniembed")
196
+
197
+ # Encode texts
198
+ embeddings = model.encode(["Hello world", "Machine learning"])
199
+
200
+ # Compute similarity
201
+ score = model.similarity("query", "document")
202
+
203
+ # Semantic search
204
+ results = model.search("python programming", documents)
205
+ """
206
+
207
+ def __init__(
208
+ self,
209
+ model: MiniTransformerEmbedding,
210
+ tokenizer: SimpleTokenizer,
211
+ device: str = 'cpu',
212
+ max_length: int = 64
213
+ ):
214
+ self.model = model
215
+ self.tokenizer = tokenizer
216
+ self.device = device
217
+ self.max_length = max_length
218
+ self.model.eval()
219
+
220
+ @classmethod
221
+ def from_pretrained(cls, model_dir: str, device: str = None):
222
+ """
223
+ Load model from a local directory or HuggingFace repo ID.
224
+
225
+ Args:
226
+ model_dir: Local path (e.g., "models/mini") or
227
+ HuggingFace repo ID (e.g., "surazbhandari/miniembed")
228
+ device: Device to load on ('cpu', 'cuda', 'mps'). Auto-detected if None.
229
+ """
230
+ model, tokenizer = EmbeddingModelManager.load_model(model_dir, device)
231
+ if device is None:
232
+ device = next(model.parameters()).device.type
233
+ return cls(model, tokenizer, device)
234
+
235
+ def encode(
236
+ self,
237
+ texts: Union[str, List[str]],
238
+ batch_size: int = 32,
239
+ show_progress: bool = False
240
+ ) -> np.ndarray:
241
+ """
242
+ Encode texts to embeddings.
243
+
244
+ Args:
245
+ texts: Single text or list of texts
246
+ batch_size: Batch size for encoding
247
+ show_progress: Show progress bar
248
+
249
+ Returns:
250
+ numpy array of shape (n_texts, d_model)
251
+ """
252
+ if isinstance(texts, str):
253
+ texts = [texts]
254
+
255
+ all_embeddings = []
256
+
257
+ # Process in batches
258
+ for i in range(0, len(texts), batch_size):
259
+ batch_texts = texts[i:i + batch_size]
260
+
261
+ # Tokenize
262
+ encodings = [
263
+ self.tokenizer.encode(t, self.max_length)
264
+ for t in batch_texts
265
+ ]
266
+
267
+ input_ids = torch.stack([e['input_ids'] for e in encodings]).to(self.device)
268
+ attention_mask = torch.stack([e['attention_mask'] for e in encodings]).to(self.device)
269
+
270
+ # Encode
271
+ with torch.no_grad():
272
+ embeddings = self.model.encode(input_ids, attention_mask)
273
+
274
+ all_embeddings.append(embeddings.cpu().numpy())
275
+
276
+ return np.vstack(all_embeddings)
277
+
278
+ def similarity(self, text1: str, text2: str) -> float:
279
+ """Compute cosine similarity between two texts."""
280
+ emb1 = self.encode(text1)
281
+ emb2 = self.encode(text2)
282
+ return float(np.dot(emb1[0], emb2[0]))
283
+
284
+ def pairwise_similarity(self, texts1: List[str], texts2: List[str]) -> np.ndarray:
285
+ """
286
+ Compute pairwise similarity between two lists.
287
+
288
+ Returns:
289
+ Matrix of shape (len(texts1), len(texts2))
290
+ """
291
+ emb1 = self.encode(texts1)
292
+ emb2 = self.encode(texts2)
293
+ return np.dot(emb1, emb2.T)
294
+
295
+ def search(
296
+ self,
297
+ query: str,
298
+ documents: List[str],
299
+ top_k: int = 5
300
+ ) -> List[Dict]:
301
+ """
302
+ Semantic search: Find most similar documents to query.
303
+
304
+ Args:
305
+ query: Search query
306
+ documents: List of documents to search
307
+ top_k: Number of results to return
308
+
309
+ Returns:
310
+ List of dicts with 'text', 'score', 'rank'
311
+ """
312
+ query_emb = self.encode(query)
313
+ doc_embs = self.encode(documents)
314
+
315
+ # Compute similarities
316
+ scores = np.dot(doc_embs, query_emb.T).flatten()
317
+
318
+ # Get top-k indices
319
+ top_indices = np.argsort(scores)[::-1][:top_k]
320
+
321
+ results = []
322
+ for rank, idx in enumerate(top_indices, 1):
323
+ results.append({
324
+ 'rank': rank,
325
+ 'text': documents[idx],
326
+ 'score': float(scores[idx]),
327
+ 'index': int(idx)
328
+ })
329
+
330
+ return results
331
+
332
+ def cluster_texts(self, texts: List[str], n_clusters: int = 5) -> Dict:
333
+ """
334
+ Cluster texts by embedding similarity.
335
+
336
+ Returns:
337
+ Dict with 'labels' and 'texts_by_cluster'
338
+ """
339
+ from sklearn.cluster import KMeans
340
+
341
+ embeddings = self.encode(texts)
342
+
343
+ kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
344
+ labels = kmeans.fit_predict(embeddings)
345
+
346
+ return {
347
+ 'labels': labels.tolist(),
348
+ 'centroids': kmeans.cluster_centers_,
349
+ 'texts_by_cluster': {
350
+ i: [texts[j] for j in range(len(texts)) if labels[j] == i]
351
+ for i in range(n_clusters)
352
+ }
353
+ }
src/model.py ADDED
@@ -0,0 +1,359 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Mini-Transformer Embedding Model
3
+ ====================================
4
+ A lightweight transformer encoder for generating text embeddings.
5
+ Built from scratch using PyTorch.
6
+
7
+ Architecture:
8
+ - Token Embeddings + Sinusoidal Positional Encoding
9
+ - N Transformer Encoder Layers (Pre-LayerNorm)
10
+ - Multi-Head Self-Attention
11
+ - Position-wise Feed-Forward Networks
12
+ - Mean Pooling + L2 Normalization
13
+ """
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.nn.functional as F
18
+ import math
19
+ from typing import Optional
20
+
21
+
22
+ class SinusoidalPositionalEncoding(nn.Module):
23
+ """
24
+ Sinusoidal positional encoding from "Attention Is All You Need".
25
+
26
+ Adds position information to token embeddings using sin/cos functions
27
+ at different frequencies, allowing the model to understand token order.
28
+ """
29
+
30
+ def __init__(self, d_model: int, max_seq_len: int = 512, dropout: float = 0.1):
31
+ super().__init__()
32
+ self.dropout = nn.Dropout(p=dropout)
33
+
34
+ # Create positional encoding matrix [max_seq_len, d_model]
35
+ pe = torch.zeros(max_seq_len, d_model)
36
+ position = torch.arange(0, max_seq_len, dtype=torch.float).unsqueeze(1)
37
+
38
+ # Compute division term for frequencies
39
+ div_term = torch.exp(
40
+ torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
41
+ )
42
+
43
+ # Apply sin to even indices, cos to odd indices
44
+ pe[:, 0::2] = torch.sin(position * div_term)
45
+ pe[:, 1::2] = torch.cos(position * div_term)
46
+
47
+ # Add batch dimension and register as buffer (not a parameter)
48
+ pe = pe.unsqueeze(0) # [1, max_seq_len, d_model]
49
+ self.register_buffer('pe', pe)
50
+
51
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
52
+ """
53
+ Args:
54
+ x: Tensor of shape [batch_size, seq_len, d_model]
55
+ Returns:
56
+ Tensor with positional encoding added
57
+ """
58
+ x = x + self.pe[:, :x.size(1), :]
59
+ return self.dropout(x)
60
+
61
+
62
+ class MultiHeadSelfAttention(nn.Module):
63
+ """
64
+ Multi-Head Self-Attention mechanism.
65
+
66
+ Allows the model to jointly attend to information from different
67
+ representation subspaces at different positions.
68
+ """
69
+
70
+ def __init__(self, d_model: int, num_heads: int, dropout: float = 0.1):
71
+ super().__init__()
72
+ assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
73
+
74
+ self.d_model = d_model
75
+ self.num_heads = num_heads
76
+ self.d_k = d_model // num_heads # Dimension per head
77
+
78
+ # Linear projections for Q, K, V
79
+ self.W_q = nn.Linear(d_model, d_model)
80
+ self.W_k = nn.Linear(d_model, d_model)
81
+ self.W_v = nn.Linear(d_model, d_model)
82
+
83
+ # Output projection
84
+ self.W_o = nn.Linear(d_model, d_model)
85
+
86
+ self.dropout = nn.Dropout(dropout)
87
+ self.scale = math.sqrt(self.d_k)
88
+
89
+ def forward(
90
+ self,
91
+ x: torch.Tensor,
92
+ attention_mask: Optional[torch.Tensor] = None
93
+ ) -> torch.Tensor:
94
+ """
95
+ Args:
96
+ x: Input tensor [batch_size, seq_len, d_model]
97
+ attention_mask: Optional mask [batch_size, seq_len]
98
+ Returns:
99
+ Output tensor [batch_size, seq_len, d_model]
100
+ """
101
+ batch_size, seq_len, _ = x.size()
102
+
103
+ # Linear projections
104
+ Q = self.W_q(x) # [batch, seq, d_model]
105
+ K = self.W_k(x)
106
+ V = self.W_v(x)
107
+
108
+ # Reshape to [batch, num_heads, seq, d_k]
109
+ Q = Q.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
110
+ K = K.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
111
+ V = V.view(batch_size, seq_len, self.num_heads, self.d_k).transpose(1, 2)
112
+
113
+ # Scaled dot-product attention
114
+ scores = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
115
+ # scores: [batch, num_heads, seq, seq]
116
+
117
+ # Apply attention mask (for padding)
118
+ if attention_mask is not None:
119
+ # Expand mask: [batch, 1, 1, seq]
120
+ mask = attention_mask.unsqueeze(1).unsqueeze(2)
121
+ scores = scores.masked_fill(mask == 0, float('-inf'))
122
+
123
+ # Softmax and dropout
124
+ attn_weights = F.softmax(scores, dim=-1)
125
+ attn_weights = self.dropout(attn_weights)
126
+
127
+ # Apply attention to values
128
+ context = torch.matmul(attn_weights, V)
129
+ # context: [batch, num_heads, seq, d_k]
130
+
131
+ # Reshape back: [batch, seq, d_model]
132
+ context = context.transpose(1, 2).contiguous().view(batch_size, seq_len, self.d_model)
133
+
134
+ # Output projection
135
+ output = self.W_o(context)
136
+
137
+ return output
138
+
139
+
140
+ class PositionwiseFeedForward(nn.Module):
141
+ """
142
+ Position-wise Feed-Forward Network.
143
+
144
+ Two linear transformations with a GELU activation in between.
145
+ Applied to each position separately and identically.
146
+ """
147
+
148
+ def __init__(self, d_model: int, d_ff: int, dropout: float = 0.1):
149
+ super().__init__()
150
+ self.linear1 = nn.Linear(d_model, d_ff)
151
+ self.linear2 = nn.Linear(d_ff, d_model)
152
+ self.dropout = nn.Dropout(dropout)
153
+
154
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
155
+ """
156
+ Args:
157
+ x: Input tensor [batch_size, seq_len, d_model]
158
+ Returns:
159
+ Output tensor [batch_size, seq_len, d_model]
160
+ """
161
+ x = self.linear1(x)
162
+ x = F.gelu(x)
163
+ x = self.dropout(x)
164
+ x = self.linear2(x)
165
+ return x
166
+
167
+
168
+ class TransformerEncoderLayer(nn.Module):
169
+ """
170
+ Single Transformer Encoder Layer with Pre-LayerNorm.
171
+
172
+ Components:
173
+ 1. Multi-Head Self-Attention with residual connection
174
+ 2. Position-wise Feed-Forward with residual connection
175
+
176
+ Uses Pre-LayerNorm for better training stability.
177
+ """
178
+
179
+ def __init__(
180
+ self,
181
+ d_model: int,
182
+ num_heads: int,
183
+ d_ff: int,
184
+ dropout: float = 0.1
185
+ ):
186
+ super().__init__()
187
+
188
+ # Layer normalization
189
+ self.norm1 = nn.LayerNorm(d_model)
190
+ self.norm2 = nn.LayerNorm(d_model)
191
+
192
+ # Sub-layers
193
+ self.attention = MultiHeadSelfAttention(d_model, num_heads, dropout)
194
+ self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
195
+
196
+ # Dropout for residual connections
197
+ self.dropout = nn.Dropout(dropout)
198
+
199
+ def forward(
200
+ self,
201
+ x: torch.Tensor,
202
+ attention_mask: Optional[torch.Tensor] = None
203
+ ) -> torch.Tensor:
204
+ """
205
+ Args:
206
+ x: Input tensor [batch_size, seq_len, d_model]
207
+ attention_mask: Optional mask [batch_size, seq_len]
208
+ Returns:
209
+ Output tensor [batch_size, seq_len, d_model]
210
+ """
211
+ # Pre-norm attention block
212
+ normed = self.norm1(x)
213
+ attn_output = self.attention(normed, attention_mask)
214
+ x = x + self.dropout(attn_output) # Residual connection
215
+
216
+ # Pre-norm feed-forward block
217
+ normed = self.norm2(x)
218
+ ff_output = self.feed_forward(normed)
219
+ x = x + self.dropout(ff_output) # Residual connection
220
+
221
+ return x
222
+
223
+
224
+ class MiniTransformerEmbedding(nn.Module):
225
+ """
226
+ Mini-Transformer Embedding Model.
227
+
228
+ Converts variable-length text sequences into fixed-size dense vectors
229
+ suitable for semantic similarity, search, and clustering tasks.
230
+
231
+ Architecture:
232
+ 1. Token Embedding Layer (vocab → d_model)
233
+ 2. Sinusoidal Positional Encoding
234
+ 3. N Transformer Encoder Layers
235
+ 4. Mean Pooling (sequence → single vector)
236
+ 5. L2 Normalization (for cosine similarity)
237
+ """
238
+
239
+ def __init__(
240
+ self,
241
+ vocab_size: int = 30000,
242
+ d_model: int = 256,
243
+ num_heads: int = 4,
244
+ num_layers: int = 4,
245
+ d_ff: int = 1024,
246
+ max_seq_len: int = 128,
247
+ dropout: float = 0.1,
248
+ pad_token_id: int = 0
249
+ ):
250
+ super().__init__()
251
+
252
+ self.d_model = d_model
253
+ self.pad_token_id = pad_token_id
254
+
255
+ # Token embedding
256
+ self.token_embedding = nn.Embedding(
257
+ vocab_size, d_model, padding_idx=pad_token_id
258
+ )
259
+
260
+ # Positional encoding
261
+ self.positional_encoding = SinusoidalPositionalEncoding(
262
+ d_model, max_seq_len, dropout
263
+ )
264
+
265
+ # Transformer encoder layers
266
+ self.layers = nn.ModuleList([
267
+ TransformerEncoderLayer(d_model, num_heads, d_ff, dropout)
268
+ for _ in range(num_layers)
269
+ ])
270
+
271
+ # Final layer norm
272
+ self.final_norm = nn.LayerNorm(d_model)
273
+
274
+ # Initialize weights
275
+ self._init_weights()
276
+
277
+ def _init_weights(self):
278
+ """Initialize weights using Xavier/Glorot initialization."""
279
+ for module in self.modules():
280
+ if isinstance(module, nn.Linear):
281
+ nn.init.xavier_uniform_(module.weight)
282
+ if module.bias is not None:
283
+ nn.init.zeros_(module.bias)
284
+ elif isinstance(module, nn.Embedding):
285
+ nn.init.normal_(module.weight, mean=0, std=0.02)
286
+ if module.padding_idx is not None:
287
+ nn.init.zeros_(module.weight[module.padding_idx])
288
+
289
+ def forward(
290
+ self,
291
+ input_ids: torch.Tensor,
292
+ attention_mask: Optional[torch.Tensor] = None
293
+ ) -> torch.Tensor:
294
+ """
295
+ Forward pass through the encoder.
296
+
297
+ Args:
298
+ input_ids: Token IDs [batch_size, seq_len]
299
+ attention_mask: Mask for padding [batch_size, seq_len]
300
+
301
+ Returns:
302
+ Token-level representations [batch_size, seq_len, d_model]
303
+ """
304
+ # Token embeddings with scaling
305
+ x = self.token_embedding(input_ids) * math.sqrt(self.d_model)
306
+
307
+ # Add positional encoding
308
+ x = self.positional_encoding(x)
309
+
310
+ # Pass through transformer layers
311
+ for layer in self.layers:
312
+ x = layer(x, attention_mask)
313
+
314
+ # Final layer norm
315
+ x = self.final_norm(x)
316
+
317
+ return x
318
+
319
+ def encode(
320
+ self,
321
+ input_ids: torch.Tensor,
322
+ attention_mask: Optional[torch.Tensor] = None
323
+ ) -> torch.Tensor:
324
+ """
325
+ Encode input tokens to a single embedding vector per sequence.
326
+
327
+ Uses mean pooling over non-padded tokens, followed by L2 normalization.
328
+
329
+ Args:
330
+ input_ids: Token IDs [batch_size, seq_len]
331
+ attention_mask: Mask for padding [batch_size, seq_len]
332
+
333
+ Returns:
334
+ Normalized embeddings [batch_size, d_model]
335
+ """
336
+ # Get token-level representations
337
+ token_embeddings = self.forward(input_ids, attention_mask)
338
+
339
+ # Mean pooling
340
+ if attention_mask is not None:
341
+ # Expand mask for broadcasting: [batch, seq, 1]
342
+ mask_expanded = attention_mask.unsqueeze(-1).float()
343
+
344
+ # Sum of embeddings (masked)
345
+ sum_embeddings = torch.sum(token_embeddings * mask_expanded, dim=1)
346
+
347
+ # Count of non-padded tokens
348
+ sum_mask = torch.clamp(mask_expanded.sum(dim=1), min=1e-9)
349
+
350
+ # Mean
351
+ embeddings = sum_embeddings / sum_mask
352
+ else:
353
+ # Simple mean over all tokens
354
+ embeddings = torch.mean(token_embeddings, dim=1)
355
+
356
+ # L2 normalization for cosine similarity
357
+ embeddings = F.normalize(embeddings, p=2, dim=1)
358
+
359
+ return embeddings
src/tokenizer.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Simple Word-Level Tokenizer
3
+ ==============================
4
+ A basic tokenizer for demonstration purposes.
5
+ Converts text to token IDs with special tokens.
6
+ """
7
+
8
+ import re
9
+ import json
10
+ from typing import Dict, List, Optional
11
+ from collections import Counter
12
+ from tqdm import tqdm
13
+
14
+
15
class SimpleTokenizer:
    """
    A simple word-level tokenizer with special tokens.

    Special Tokens:
        - [PAD]: Padding token (id=0)
        - [UNK]: Unknown token (id=1)
        - [CLS]: Classification token (id=2)
        - [SEP]: Separator token (id=3)
    """

    def __init__(self, vocab_size: int = 30000):
        # Upper bound on vocabulary size (including the special tokens).
        self.vocab_size = vocab_size

        # Special tokens occupy the first four IDs.
        self.special_tokens = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[CLS]': 2,
            '[SEP]': 3,
        }

        # Bidirectional word <-> ID mappings, seeded with the specials.
        self.word_to_id: Dict[str, int] = dict(self.special_tokens)
        self.id_to_word: Dict[int, str] = {v: k for k, v in self.special_tokens.items()}

        # Convenience aliases for the special-token IDs.
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.cls_token_id = 2
        self.sep_token_id = 3

    def _tokenize(self, text: str) -> List[str]:
        """
        Split text into tokens (simple word-level tokenization).

        Args:
            text: Input text string

        Returns:
            List of tokens
        """
        # Lowercase and basic cleaning.
        text = text.lower().strip()

        # Word runs become single tokens; each punctuation character
        # becomes its own token.
        return re.findall(r'\b\w+\b|[^\w\s]', text)

    def build_vocab(self, texts: List[str], min_freq: int = 2):
        """
        Build vocabulary from a list of texts.

        Args:
            texts: List of text strings
            min_freq: Minimum frequency for a word to be included
        """
        # Count word frequencies across the corpus.
        word_counts = Counter()
        for text in tqdm(texts, desc="Building vocabulary"):
            word_counts.update(self._tokenize(text))

        # Reserve room for the special tokens already in the vocab.
        max_words = self.vocab_size - len(self.special_tokens)

        # Most frequent words first (ties keep first-seen order, same as
        # a stable reverse sort); words below min_freq are skipped.
        for word, count in word_counts.most_common(max_words):
            if count >= min_freq and word not in self.word_to_id:
                idx = len(self.word_to_id)
                self.word_to_id[word] = idx
                self.id_to_word[idx] = word

        print(f"Vocabulary size: {len(self.word_to_id)}")

    def encode(self, text: str, max_length: int = 128) -> Dict:
        """
        Encode text to token IDs with attention mask.

        Output is always exactly max_length long: [CLS] tokens... [SEP]
        followed by [PAD] (attention mask 0) as needed.

        Args:
            text: Input text string
            max_length: Maximum sequence length

        Returns:
            Dictionary with 'input_ids' and 'attention_mask' tensors
        """
        # Lazy import: keeps the tokenizer importable without torch.
        import torch

        tokens = self._tokenize(text)

        # Reserve two slots for [CLS] and [SEP]. The max() guards
        # max_length < 2, where a negative slice bound would otherwise
        # keep almost all tokens and overflow max_length.
        token_ids = [self.cls_token_id]
        for token in tokens[:max(max_length - 2, 0)]:
            token_ids.append(self.word_to_id.get(token, self.unk_token_id))
        token_ids.append(self.sep_token_id)

        # Hard truncate so the output never exceeds max_length
        # (only triggers when max_length < 2).
        token_ids = token_ids[:max_length]

        # 1 for real tokens, 0 for padding.
        attention_mask = [1] * len(token_ids)

        # Right-pad up to max_length.
        padding_length = max_length - len(token_ids)
        token_ids.extend([self.pad_token_id] * padding_length)
        attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to text.

        Args:
            token_ids: List of token IDs

        Returns:
            Decoded text string ([PAD]/[CLS]/[SEP] are dropped,
            unknown IDs render as '[UNK]')
        """
        skip = {self.pad_token_id, self.cls_token_id, self.sep_token_id}
        tokens = [
            self.id_to_word.get(idx, '[UNK]')
            for idx in token_ids
            if idx not in skip
        ]
        return ' '.join(tokens)

    def save(self, path: str):
        """Save tokenizer vocabulary to JSON file."""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_id': self.word_to_id,
        }
        # Explicit UTF-8 so the file is platform-independent.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2)

    def load(self, path: str):
        """Load tokenizer vocabulary from JSON file."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        self.vocab_size = data['vocab_size']
        self.word_to_id = data['word_to_id']
        # JSON object values round-trip as ints; rebuild the reverse map.
        self.id_to_word = {int(v): k for k, v in self.word_to_id.items()}

    def __len__(self) -> int:
        # Current vocabulary size (specials included).
        return len(self.word_to_id)