XiaSheng committed on
Commit
76cd6b1
·
verified ·
1 Parent(s): 82daf6c

Initial upload of FreeChunk model with custom code

Browse files
Files changed (3) hide show
  1. README.md +15 -2
  2. demo_long_text.py +2 -2
  3. encoder.py +24 -4
README.md CHANGED
@@ -1,3 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # FreeChunker-Nomic
2
 
3
  FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
@@ -32,8 +45,8 @@ text_file = "text.md"
32
  query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
33
 
34
  print(f"Loading model: {model_name}...")
35
- # Initialize encoder (auto-load from current directory)
36
- encoder = UnifiedEncoder(model_name=model_name, local_model_path=".", granularities=[2, 4])
37
 
38
  # Read text
39
  if not os.path.exists(text_file):
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: apache-2.0
5
+ library_name: transformers
6
+ tags:
7
+ - embedding
8
+ - rag
9
+ - chunking
10
+ - sentence-transformers
11
+ base_model: nomic-ai/nomic-embed-text-v1.5
12
+ ---
13
+
14
  # FreeChunker-Nomic
15
 
16
  FreeChunker is a training-free embedding optimization method that dynamically chunks text to improve retrieval performance. This repository contains the **FreeChunker** model initialized with **nomic-ai/nomic-embed-text-v1.5** embeddings.
 
45
  query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
46
 
47
  print(f"Loading model: {model_name}...")
48
+ # Initialize encoder (load from Hugging Face Hub)
49
+ encoder = UnifiedEncoder.from_pretrained("XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
50
 
51
  # Read text
52
  if not os.path.exists(text_file):
demo_long_text.py CHANGED
@@ -9,8 +9,8 @@ text_file = "text.md"
9
  query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
10
 
11
  print(f"Loading model: {model_name}...")
12
- # Initialize encoder (auto-load from current directory)
13
- encoder = UnifiedEncoder(model_name=model_name, local_model_path=".", granularities=[2, 4])
14
 
15
  # Read text
16
  if not os.path.exists(text_file):
 
9
  query = "How has the relationship between Machine Learning and Artificial Intelligence evolved from the 1950s to the 1990s, specifically regarding the shift from symbolic approaches to statistical methods, and what role did neural networks play in this transition?"
10
 
11
  print(f"Loading model: {model_name}...")
12
+ # Initialize encoder from HF Hub
13
+ encoder = UnifiedEncoder(model_name=model_name, model_name_or_path="XiaSheng/FreeChunk-nomic", granularities=[2, 4], trust_remote_code=True)
14
 
15
  # Read text
16
  if not os.path.exists(text_file):
encoder.py CHANGED
@@ -18,13 +18,13 @@ class UnifiedEncoder:
18
  Unified text encoder, supporting text sentence splitting and encoding for multiple models
19
  """
20
 
21
- def __init__(self, model_name: str, local_model_path: str = None, granularities: List[int] = None):
22
  """
23
  Initialize unified text encoder
24
 
25
  Args:
26
  model_name (str): Model name
27
- local_model_path (str, optional): Local model path for loading fine-tuned weights
28
  granularities (List[int], optional): Granularities for chunking
29
  """
30
  self.model_name = model_name
@@ -35,10 +35,14 @@ class UnifiedEncoder:
35
  self.aggregator = TextAggregator()
36
 
37
  print(f"Initializing unified text encoder, model: {model_name}")
38
- print(f"Using local model path: {local_model_path}")
39
  print(f"Using device: {self.device}")
40
 
41
- self.model = FreeChunkerModel.from_pretrained(local_model_path)
 
 
 
 
42
  self.model.to(self.device)
43
  self.model.eval()
44
 
@@ -60,6 +64,22 @@ class UnifiedEncoder:
60
 
61
  print("Unified text encoder initialized!")
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
64
  """
65
  Split text and encode, return results grouped by shift_matrix
 
18
  Unified text encoder, supporting text sentence splitting and encoding for multiple models
19
  """
20
 
21
+ def __init__(self, model_name: str, model_name_or_path: str = None, granularities: List[int] = None, **kwargs):
22
  """
23
  Initialize unified text encoder
24
 
25
  Args:
26
  model_name (str): Model name
27
+ model_name_or_path (str, optional): Model path or HF Hub ID
28
  granularities (List[int], optional): Granularities for chunking
29
  """
30
  self.model_name = model_name
 
35
  self.aggregator = TextAggregator()
36
 
37
  print(f"Initializing unified text encoder, model: {model_name}")
38
+ print(f"Using model path: {model_name_or_path}")
39
  print(f"Using device: {self.device}")
40
 
41
+ # If model_name_or_path is not provided, try to use model_name if it looks like a path/ID
42
+ if model_name_or_path is None:
43
+ model_name_or_path = model_name
44
+
45
+ self.model = FreeChunkerModel.from_pretrained(model_name_or_path, **kwargs)
46
  self.model.to(self.device)
47
  self.model.eval()
48
 
 
64
 
65
  print("Unified text encoder initialized!")
66
 
67
@classmethod
def from_pretrained(cls, model_name_or_path: str, model_name: str = None, **kwargs):
    """
    Load a UnifiedEncoder from a pretrained model.

    Args:
        model_name_or_path (str): HF Hub repo ID or local path to load weights from.
        model_name (str, optional): Backbone model name (e.g. 'nomic-embed-text-v1.5').
            If not provided, defaults to 'nomic-embed-text-v1.5', the backbone
            this repository was built on — pass it explicitly when loading a
            checkpoint based on a different backbone.
        **kwargs: Forwarded to __init__ (e.g. granularities, trust_remote_code).

    Returns:
        UnifiedEncoder: Encoder initialized with the loaded weights.
    """
    if model_name is None:
        # Default for this repo; the backbone type cannot be reliably inferred
        # from the path/repo ID alone.
        model_name = "nomic-embed-text-v1.5"
    return cls(model_name=model_name, model_name_or_path=model_name_or_path, **kwargs)
82
+
83
  def encode(self, text: str, show_progress: bool = True) -> Tuple[List[str], np.ndarray, List[List[str]]]:
84
  """
85
  Split text and encode, return results grouped by shift_matrix