Initial upload of FreeChunk model with custom code
Browse files- README.md +15 -2
- demo_long_text.py +2 -2
- encoder.py +24 -4
README.md
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# FreeChunker-Nomic
|
| 2 |
|
| 3 |
FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
|
|
@@ -32,8 +45,8 @@ text_file = "text.md"
|
|
| 32 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 33 |
|
| 34 |
print(f"Loading model: {model_name}...")
|
| 35 |
-
# Initialize encoder (
|
| 36 |
-
encoder = UnifiedEncoder(
|
| 37 |
|
| 38 |
# Read text
|
| 39 |
if not os.path.exists(text_file):
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: apache-2.0
|
| 5 |
+
library_name: transformers
|
| 6 |
+
tags:
|
| 7 |
+
- embedding
|
| 8 |
+
- rag
|
| 9 |
+
- chunking
|
| 10 |
+
- sentence-transformers
|
| 11 |
+
base_model: nomic-ai/nomic-embed-text-v1.5
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
# FreeChunker-Nomic
|
| 15 |
|
| 16 |
FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
|
|
|
|
| 45 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 46 |
|
| 47 |
print(f"Loading model: {model_name}...")
|
| 48 |
+
# Initialize encoder (load from Hugging Face Hub)
|
| 49 |
+
encoder = UnifiedEncoder.from_pretrained("XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
|
| 50 |
|
| 51 |
# Read text
|
| 52 |
if not os.path.exists(text_file):
|
demo_long_text.py
CHANGED
|
@@ -9,8 +9,8 @@ text_file = "text.md"
|
|
| 9 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 10 |
|
| 11 |
print(f"Loading model: {model_name}...")
|
| 12 |
-
# Initialize encoder
|
| 13 |
-
encoder = UnifiedEncoder(model_name=model_name,
|
| 14 |
|
| 15 |
# Read text
|
| 16 |
if not os.path.exists(text_file):
|
|
|
|
| 9 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 10 |
|
| 11 |
print(f"Loading model: {model_name}...")
|
| 12 |
+
# Initialize encoder from HF Hub
|
| 13 |
+
encoder = UnifiedEncoder(model_name=model_name, model_name_or_path="XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
|
| 14 |
|
| 15 |
# Read text
|
| 16 |
if not os.path.exists(text_file):
|
encoder.py
CHANGED
|
@@ -18,13 +18,13 @@ class UnifiedEncoder:
|
|
| 18 |
Unified text encoder, supporting text sentence splitting and encoding for multiple models
|
| 19 |
"""
|
| 20 |
|
| 21 |
-
def __init__(self, model_name: str,
|
| 22 |
"""
|
| 23 |
Initialize unified text encoder
|
| 24 |
|
| 25 |
Args:
|
| 26 |
model_name (str): Model name
|
| 27 |
-
|
| 28 |
granularities (List[int], optional): Granularities for chunking
|
| 29 |
"""
|
| 30 |
self.model_name = model_name
|
|
@@ -35,10 +35,14 @@ class UnifiedEncoder:
|
|
| 35 |
self.aggregator = TextAggregator()
|
| 36 |
|
| 37 |
print(f"Initializing unified text encoder, model: {model_name}")
|
| 38 |
-
print(f"Using
|
| 39 |
print(f"Using device: {self.device}")
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
self.model.to(self.device)
|
| 43 |
self.model.eval()
|
| 44 |
|
|
@@ -60,6 +64,22 @@ class UnifiedEncoder:
|
|
| 60 |
|
| 61 |
print("Unified text encoder initialized!")
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
|
| 64 |
"""
|
| 65 |
Split text and encode, return results grouped by shift_matrix
|
|
|
|
| 18 |
Unified text encoder, supporting text sentence splitting and encoding for multiple models
|
| 19 |
"""
|
| 20 |
|
| 21 |
+
def __init__(self, model_name: str, model_name_or_path: str = None, granularities: List[int] = None, **kwargs):
|
| 22 |
"""
|
| 23 |
Initialize unified text encoder
|
| 24 |
|
| 25 |
Args:
|
| 26 |
model_name (str): Model name
|
| 27 |
+
model_name_or_path (str, optional): Model path or HF Hub ID
|
| 28 |
granularities (List[int], optional): Granularities for chunking
|
| 29 |
"""
|
| 30 |
self.model_name = model_name
|
|
|
|
| 35 |
self.aggregator = TextAggregator()
|
| 36 |
|
| 37 |
print(f"Initializing unified text encoder, model: {model_name}")
|
| 38 |
+
print(f"Using model path: {model_name_or_path}")
|
| 39 |
print(f"Using device: {self.device}")
|
| 40 |
|
| 41 |
+
# If model_name_or_path is not provided, try to use model_name if it looks like a path/ID
|
| 42 |
+
if model_name_or_path is None:
|
| 43 |
+
model_name_or_path = model_name
|
| 44 |
+
|
| 45 |
+
self.model = FreeChunkerModel.from_pretrained(model_name_or_path, **kwargs)
|
| 46 |
self.model.to(self.device)
|
| 47 |
self.model.eval()
|
| 48 |
|
|
|
|
| 64 |
|
| 65 |
print("Unified text encoder initialized!")
|
| 66 |
|
| 67 |
+
@classmethod
def from_pretrained(cls, model_name_or_path: str, model_name: str = None, **kwargs):
    """
    Load a UnifiedEncoder from a pretrained model on the HF Hub or a local path.

    Args:
        model_name_or_path (str): HF Hub ID or local path of the FreeChunker weights.
        model_name (str, optional): Backbone model name (e.g. 'nomic-embed-text-v1.5').
            If not provided, defaults to 'nomic-embed-text-v1.5', the backbone this
            repository was initialized from.
        **kwargs: Forwarded to the UnifiedEncoder constructor (e.g. granularities,
            trust_remote_code).

    Returns:
        UnifiedEncoder: A fully initialized encoder instance.
    """
    if model_name is None:
        # model_name identifies the backbone type, not the checkpoint location,
        # so we default to this repo's backbone rather than model_name_or_path.
        model_name = "nomic-embed-text-v1.5"

    return cls(model_name=model_name, model_name_or_path=model_name_or_path, **kwargs)
|
| 82 |
+
|
| 83 |
def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
|
| 84 |
"""
|
| 85 |
Split text and encode, return results grouped by shift_matrix
|