Initial upload of FreeChunk model with custom code
Browse files- README.md +15 -2
- demo_long_text.py +2 -2
- encoder.py +24 -4
README.md
CHANGED
|
@@ -1,3 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# FreeChunker-Nomic
|
| 2 |
|
| 3 |
FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
|
|
@@ -32,8 +45,8 @@ text_file = "text.md"
|
|
| 32 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 33 |
|
| 34 |
print(f"Loading model: {model_name}...")
|
| 35 |
-
# Initialize encoder (
|
| 36 |
-
encoder = UnifiedEncoder(
|
| 37 |
|
| 38 |
# Read text
|
| 39 |
if not os.path.exists(text_file):
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: apache-2.0
|
| 5 |
+
library_name: transformers
|
| 6 |
+
tags:
|
| 7 |
+
- embedding
|
| 8 |
+
- rag
|
| 9 |
+
- chunking
|
| 10 |
+
- sentence-transformers
|
| 11 |
+
base_model: nomic-ai/nomic-embed-text-v1.5
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
# FreeChunker-Nomic
|
| 15 |
|
| 16 |
FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
|
|
|
|
| 45 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 46 |
|
| 47 |
print(f"Loading model: {model_name}...")
|
| 48 |
+
# Initialize encoder (load from Hugging Face Hub)
|
| 49 |
+
encoder = UnifiedEncoder.from_pretrained("XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
|
| 50 |
|
| 51 |
# Read text
|
| 52 |
if not os.path.exists(text_file):
|
demo_long_text.py
CHANGED
|
@@ -9,8 +9,8 @@ text_file = "text.md"
|
|
| 9 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 10 |
|
| 11 |
print(f"Loading model: {model_name}...")
|
| 12 |
-
# Initialize encoder
|
| 13 |
-
encoder = UnifiedEncoder(model_name=model_name,
|
| 14 |
|
| 15 |
# Read text
|
| 16 |
if not os.path.exists(text_file):
|
|
|
|
| 9 |
query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
|
| 10 |
|
| 11 |
print(f"Loading model: {model_name}...")
|
| 12 |
+
# Initialize encoder from HF Hub
|
| 13 |
+
encoder = UnifiedEncoder(model_name=model_name, model_name_or_path="XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
|
| 14 |
|
| 15 |
# Read text
|
| 16 |
if not os.path.exists(text_file):
|
encoder.py
CHANGED
|
@@ -18,13 +18,13 @@ class UnifiedEncoder:
|
|
| 18 |
Unified text encoder, supporting text sentence splitting and encoding for multiple models
|
| 19 |
"""
|
| 20 |
|
| 21 |
-
def __init__(self, model_name: str,
|
| 22 |
"""
|
| 23 |
Initialize unified text encoder
|
| 24 |
|
| 25 |
Args:
|
| 26 |
model_name (str): Model name
|
| 27 |
-
|
| 28 |
granularities (List[int], optional): Granularities for chunking
|
| 29 |
"""
|
| 30 |
self.model_name = model_name
|
|
@@ -35,10 +35,14 @@ class UnifiedEncoder:
|
|
| 35 |
self.aggregator = TextAggregator()
|
| 36 |
|
| 37 |
print(f"Initializing unified text encoder, model: {model_name}")
|
| 38 |
-
print(f"Using
|
| 39 |
print(f"Using device: {self.device}")
|
| 40 |
|
| 41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
self.model.to(self.device)
|
| 43 |
self.model.eval()
|
| 44 |
|
|
@@ -60,6 +64,22 @@ class UnifiedEncoder:
|
|
| 60 |
|
| 61 |
print("Unified text encoder initialized!")
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
|
| 64 |
"""
|
| 65 |
Split text and encode, return results grouped by shift_matrix
|
|
|
|
| 18 |
Unified text encoder, supporting text sentence splitting and encoding for multiple models
|
| 19 |
"""
|
| 20 |
|
| 21 |
+
def __init__(self, model_name: str, model_name_or_path: str = None, granularities: List[int] = None, **kwargs):
|
| 22 |
"""
|
| 23 |
Initialize unified text encoder
|
| 24 |
|
| 25 |
Args:
|
| 26 |
model_name (str): Model name
|
| 27 |
+
model_name_or_path (str, optional): Model path or HF Hub ID
|
| 28 |
granularities (List[int], optional): Granularities for chunking
|
| 29 |
"""
|
| 30 |
self.model_name = model_name
|
|
|
|
| 35 |
self.aggregator = TextAggregator()
|
| 36 |
|
| 37 |
print(f"Initializing unified text encoder, model: {model_name}")
|
| 38 |
+
print(f"Using model path: {model_name_or_path}")
|
| 39 |
print(f"Using device: {self.device}")
|
| 40 |
|
| 41 |
+
# If model_name_or_path is not provided, try to use model_name if it looks like a path/ID
|
| 42 |
+
if model_name_or_path is None:
|
| 43 |
+
model_name_or_path = model_name
|
| 44 |
+
|
| 45 |
+
self.model = FreeChunkerModel.from_pretrained(model_name_or_path, **kwargs)
|
| 46 |
self.model.to(self.device)
|
| 47 |
self.model.eval()
|
| 48 |
|
|
|
|
| 64 |
|
| 65 |
print("Unified text encoder initialized!")
|
| 66 |
|
| 67 |
+
@classmethod
def from_pretrained(cls, model_name_or_path: str, model_name: str = None, **kwargs):
    """
    Load a UnifiedEncoder from a pretrained model on the HF Hub or a local path.

    Args:
        model_name_or_path (str): HF Hub ID or local path of the FreeChunker weights.
        model_name (str, optional): Backbone model name (e.g. 'nomic-embed-text-v1.5').
            If not provided, defaults to 'nomic-embed-text-v1.5', the backbone this
            repository was initialized from.
        **kwargs: Forwarded to the UnifiedEncoder constructor (e.g. granularities,
            trust_remote_code).

    Returns:
        UnifiedEncoder: A fully initialized encoder instance.
    """
    if model_name is None:
        # model_name identifies the backbone type, not the checkpoint location,
        # so we default to this repo's backbone rather than model_name_or_path.
        model_name = "nomic-embed-text-v1.5"

    return cls(model_name=model_name, model_name_or_path=model_name_or_path, **kwargs)
|
| 82 |
+
|
| 83 |
def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
|
| 84 |
"""
|
| 85 |
Split text and encode, return results grouped by shift_matrix
|